Crawlee adalah framework scraping Node.js modern dari Apify. Berikut cara mengintegrasikan CaptchaAI untuk solve CAPTCHA otomatis di spider Crawlee Anda.
Mengapa Crawlee + CaptchaAI
| Fitur | Manfaat |
|---|---|
| Manajemen session bawaan | Sinyal TLS dan browser tetap konsisten dengan CAPTCHA yang sudah di-solve |
| Auto retry | Retry request yang gagal setelah solve CAPTCHA |
| Rotasi proxy | Pasangkan dengan dukungan proxy CaptchaAI |
| Request queue | Proses solve CAPTCHA masuk antrean bersama request scraping |
Integrasi Dasar
const { CheerioCrawler } = require('crawlee');
const https = require('https');
const API_KEY = process.env.CAPTCHAAI_API_KEY;
/**
 * Solve a reCAPTCHA challenge through the CaptchaAI HTTP API.
 *
 * Submits the task to `in.php`, waits, then polls `res.php` until the service
 * answers with a token, reports an error, or the polling budget is used up.
 *
 * @param {string} sitekey - Value of the page's `data-sitekey` attribute.
 * @param {string} pageurl - URL of the page that shows the CAPTCHA.
 * @param {object} [options] - Optional timing overrides.
 * @param {number} [options.initialDelayMs=15000] - Wait before the first poll.
 * @param {number} [options.pollIntervalMs=5000] - Wait after each "not ready" poll.
 * @param {number} [options.maxPolls=24] - Maximum number of polls before giving up.
 * @returns {Promise<string>} The solved token (`request` field of the final response).
 * @throws {Error} If the API key is missing, an HTTP request fails, the service
 *   reports an error, or no token arrives within `maxPolls` polls.
 */
async function solveCaptcha(sitekey, pageurl, options = {}) {
  const {
    initialDelayMs = 15000,
    pollIntervalMs = 5000,
    maxPolls = 24,
  } = options;

  // Fail early instead of sending the literal string "undefined" as the key.
  if (!API_KEY) {
    throw new Error('Submit error: CAPTCHAAI_API_KEY is not set');
  }

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Submit task
  const submitData = new URLSearchParams({
    key: API_KEY,
    method: 'userrecaptcha',
    googlekey: sitekey,
    pageurl: pageurl,
    json: '1',
  });
  const submitResp = await fetch('https://ocr.captchaai.com/in.php', {
    method: 'POST',
    body: submitData,
  });
  // A non-2xx body is usually not JSON; report the status rather than a parse error.
  if (!submitResp.ok) {
    throw new Error(`Submit error: HTTP ${submitResp.status}`);
  }
  const submitResult = await submitResp.json();
  if (submitResult.status !== 1) {
    throw new Error(`Submit error: ${submitResult.request}`);
  }
  const taskId = submitResult.request;

  // Build the poll URL once; URLSearchParams percent-encodes the key and task id.
  const pollUrl = new URL('https://ocr.captchaai.com/res.php');
  pollUrl.search = new URLSearchParams({
    key: API_KEY,
    action: 'get',
    id: String(taskId),
    json: '1',
  }).toString();

  // Poll for result
  await sleep(initialDelayMs);
  for (let attempt = 0; attempt < maxPolls; attempt++) {
    const pollResp = await fetch(pollUrl);
    if (!pollResp.ok) {
      throw new Error(`Solve error: HTTP ${pollResp.status}`);
    }
    const pollResult = await pollResp.json();
    if (pollResult.status === 1) return pollResult.request;
    // 'CAPCHA_NOT_READY' (sic) is the literal value compared against the API response.
    if (pollResult.request !== 'CAPCHA_NOT_READY') {
      throw new Error(`Solve error: ${pollResult.request}`);
    }
    await sleep(pollIntervalMs);
  }
  throw new Error('Solve timeout');
}
// Crawlee spider with CAPTCHA handling.
// For each page: if an element carrying `data-sitekey` is present, solve the
// CAPTCHA, POST the token back to the same URL, and scrape the page returned
// by that POST; otherwise scrape the page as fetched.
const crawler = new CheerioCrawler({
  maxConcurrency: 5,
  // Generous handler timeout because a CAPTCHA solve is awaited inside the handler.
  requestHandlerTimeoutSecs: 180,
  async requestHandler({ request, $, log }) {
    // Document that will actually be scraped; replaced below when a CAPTCHA
    // had to be solved, since the original `$` then holds the CAPTCHA page.
    let page$ = $;

    // Check if page has CAPTCHA
    const captchaDiv = $('[data-sitekey]');
    if (captchaDiv.length > 0) {
      const sitekey = captchaDiv.attr('data-sitekey');
      log.info(`CAPTCHA found on ${request.url}, solving...`);
      const token = await solveCaptcha(sitekey, request.url);
      log.info('CAPTCHA solved, submitting form');

      // Submit form with token
      const formData = new URLSearchParams({
        'g-recaptcha-response': token,
      });
      const resp = await fetch(request.url, {
        method: 'POST',
        body: formData,
      });
      // Throw so the request is handled as failed rather than scraping an error page.
      if (!resp.ok) {
        throw new Error(`CAPTCHA form submit failed: HTTP ${resp.status}`);
      }

      // Parse the result page and scrape it instead of the CAPTCHA page.
      page$ = $.load(await resp.text());
    }

    // Extract data: first two cells of every table row.
    const data = page$('table tr').map((i, row) => ({
      col1: page$(row).find('td:eq(0)').text().trim(),
      col2: page$(row).find('td:eq(1)').text().trim(),
    })).get();
    log.info(`Scraped ${data.length} rows from ${request.url}`);
  },
  // Logs the URL of a request passed to this handler.
  failedRequestHandler({ request, log }) {
    log.error(`Failed: ${request.url}`);
  },
});
// Start the crawl with the seed URLs.
(async function main() {
  const startUrls = [
    'https://example.com/page1',
    'https://example.com/page2',
  ];
  await crawler.run(startUrls);
})();
PlaywrightCrawler dengan CAPTCHA
const { PlaywrightCrawler } = require('crawlee');
// Browser-based crawler: detects a reCAPTCHA widget on the loaded page, solves
// it via solveCaptcha, injects the token, submits the form, then reads the page.
const crawler = new PlaywrightCrawler({
  maxConcurrency: 3,
  // Generous handler timeout because a CAPTCHA solve is awaited inside the handler.
  requestHandlerTimeoutSecs: 180,
  launchContext: {
    launchOptions: {
      headless: true,
      args: [],
    },
  },
  // The crawler navigates to request.url itself before requestHandler runs,
  // so the wait condition is configured here instead of calling page.goto
  // again in the handler (which would load every page twice).
  preNavigationHooks: [
    async (_crawlingContext, gotoOptions) => {
      gotoOptions.waitUntil = 'networkidle';
    },
  ],
  async requestHandler({ request, page, log }) {
    // Check for reCAPTCHA
    const sitekey = await page.evaluate(() => {
      const el = document.querySelector('[data-sitekey]');
      return el ? el.getAttribute('data-sitekey') : null;
    });
    if (sitekey) {
      log.info(`CAPTCHA detected, solving for ${request.url}`);
      const token = await solveCaptcha(sitekey, request.url);

      // Inject token (runs in the page context; `t` is the token)
      await page.evaluate((t) => {
        const ta = document.querySelector('[name="g-recaptcha-response"]');
        if (ta) {
          ta.style.display = 'block';
          ta.value = t;
        }
        // Trigger callback named by the widget's data-callback attribute, if any
        const widget = document.querySelector('.g-recaptcha');
        if (widget) {
          const cb = widget.getAttribute('data-callback');
          if (cb && typeof window[cb] === 'function') {
            window[cb](t);
          }
        }
      }, token);

      // Start waiting BEFORE clicking: if the click is awaited first, a fast
      // navigation can finish before the wait is registered and the wait
      // would then time out.
      await Promise.all([
        page.waitForNavigation({ waitUntil: 'networkidle' }),
        page.click('button[type="submit"]'),
      ]);
    }

    // Extract data
    const title = await page.title();
    const content = await page.textContent('body');
    log.info(`Page: ${title}, length: ${content.length}`);
  },
});
Solve CAPTCHA dengan Session Pool
const { CheerioCrawler, Session } = require('crawlee');
// Cheerio crawler with a session pool: when a page shows a CAPTCHA container,
// the CAPTCHA is solved and the token is recorded on the current session.
const crawler = new CheerioCrawler({
  useSessionPool: true,
  sessionPoolOptions: {
    maxPoolSize: 10,
    sessionOptions: {
      maxUsageCount: 50,
    },
  },
  async requestHandler({ request, $, session, log }) {
    // If blocked, solve CAPTCHA and mark session as usable
    if ($('.captcha-container').length > 0) {
      const sitekey = $('[data-sitekey]').attr('data-sitekey');
      // Without a sitekey the solver would be called with `undefined`;
      // fail fast instead of submitting a task that cannot succeed.
      if (!sitekey) {
        throw new Error(`CAPTCHA container without data-sitekey on ${request.url}`);
      }
      const token = await solveCaptcha(sitekey, request.url);

      // Store token in session for subsequent requests
      // NOTE(review): nothing in this snippet reads captchaToken/tokenTime back —
      // confirm where later requests are expected to consume them.
      session.userData = session.userData || {};
      session.userData.captchaToken = token;
      session.userData.tokenTime = Date.now();
      log.info('CAPTCHA solved, session updated');
    }

    // Normal scraping: name and price of every `div.item`
    const items = $('div.item').map((i, el) => ({
      name: $(el).find('.name').text().trim(),
      price: $(el).find('.price').text().trim(),
    })).get();
    log.info(`Found ${items.length} items`);
  },
});
Pertanyaan Umum
Apakah Crawlee memiliki dukungan CAPTCHA bawaan?
Tidak. Crawlee menangani session, proxy, dan retry, tapi Anda perlu menambahkan solve CAPTCHA via CaptchaAI atau layanan lain.
Crawler Crawlee mana yang harus saya gunakan?
Gunakan CheerioCrawler untuk halaman statis, PlaywrightCrawler untuk halaman yang di-render JavaScript dengan CAPTCHA, dan PuppeteerCrawler sebagai alternatif Playwright.
Bisakah saya menggunakan Crawlee dengan CaptchaAI di Apify?
Ya. Deploy actor Crawlee Anda di Apify dan gunakan CaptchaAI melalui HTTP API call. Set API key sebagai environment variable Apify.
Panduan Terkait
- Scrapy Spider Middleware untuk CaptchaAI
- Quickstart CaptchaAI
Tambahkan solve CAPTCHA ke Crawlee — dapatkan API key CaptchaAI Anda.