Node.js unggul dalam beban kerja scraping I/O-heavy. Saat situs target menyajikan CAPTCHA, API CaptchaAI menyelesaikannya sementara skrip Anda menangani permintaan HTTP. Tutorial ini mencakup alur kerja lengkap menggunakan axios dan cheerio.
Persyaratan
| Persyaratan | Detail |
|---|---|
| Node.js 16+ | Dengan npm |
| axios | npm install axios |
| cheerio | npm install cheerio |
| Kunci API CaptchaAI | Dari captchaai.com |
Modul Pemecah CaptchaAI
// captcha-solver.js
const axios = require("axios");
/**
 * Small client for the CaptchaAI HTTP API: a task is submitted through
 * `in.php` and its result is fetched by polling `res.php`.
 *
 * Responses are handled as plain text of the form `OK|<value>`; anything
 * else (other than the "not ready" marker while polling) is raised as an
 * Error carrying the response text.
 */
class CaptchaSolver {
  /**
   * @param {string} apiKey - API key sent as the `key` parameter on every request.
   * @param {object} [options]
   * @param {number} [options.pollInterval=5000] - Milliseconds to wait before each poll.
   * @param {number} [options.timeout=300000] - Default overall polling budget in milliseconds.
   */
  constructor(apiKey, { pollInterval = 5000, timeout = 300000 } = {}) {
    this.apiKey = apiKey;
    this.baseUrl = "https://ocr.captchaai.com";
    this.pollInterval = pollInterval;
    this.timeout = timeout;
  }

  /**
   * Normalizes a response body to text. axios hands back a parsed value when
   * the body happens to be valid JSON, and calling string methods on that
   * would throw a TypeError that hides the real API error.
   *
   * @param {*} data - `resp.data` as returned by axios.
   * @returns {string} The body as text.
   */
  static _asText(data) {
    if (typeof data === "string") return data;
    return JSON.stringify(data) ?? String(data);
  }

  /**
   * Submits a task to `in.php`.
   *
   * @param {object} params - Task parameters; not modified by this call.
   * @returns {Promise<string>} The task id (the part after `OK|`).
   * @throws {Error} `Submit error: <response>` when the response does not start with `OK|`.
   */
  async _submit(params) {
    // Copy instead of assigning onto `params` so the caller's object is left untouched.
    const resp = await axios.get(`${this.baseUrl}/in.php`, {
      params: { ...params, key: this.apiKey },
    });
    const body = CaptchaSolver._asText(resp.data);
    if (!body.startsWith("OK|")) {
      throw new Error(`Submit error: ${body}`);
    }
    return body.split("|")[1];
  }

  /**
   * Polls `res.php` until the task is solved, fails, or the time budget runs out.
   *
   * @param {string} taskId - Id returned by `_submit`.
   * @param {number} [timeout] - Budget in milliseconds; defaults to the constructor's `timeout`.
   * @returns {Promise<string>} The solution (the part after `OK|`).
   * @throws {Error} `Solve error: <response>` on any other response, or
   *   `Solve timed out` once the budget is exhausted.
   */
  async _poll(taskId, timeout = this.timeout) {
    const deadline = Date.now() + timeout;
    while (Date.now() < deadline) {
      // Wait first: the loop never queries immediately after submission.
      await new Promise((resolve) => setTimeout(resolve, this.pollInterval));
      const resp = await axios.get(`${this.baseUrl}/res.php`, {
        params: { key: this.apiKey, action: "get", id: taskId },
      });
      const body = CaptchaSolver._asText(resp.data);
      // "CAPCHA_NOT_READY" (sic) is the marker this code treats as "keep waiting".
      if (body === "CAPCHA_NOT_READY") continue;
      if (body.startsWith("OK|")) return body.split("|")[1];
      throw new Error(`Solve error: ${body}`);
    }
    throw new Error("Solve timed out");
  }

  /**
   * Solves a reCAPTCHA v2 challenge.
   *
   * @param {string} siteKey - Sent as `googlekey`.
   * @param {string} pageUrl - Sent as `pageurl`.
   * @returns {Promise<string>} The solution token.
   */
  async solveRecaptchaV2(siteKey, pageUrl) {
    const taskId = await this._submit({
      method: "userrecaptcha",
      googlekey: siteKey,
      pageurl: pageUrl,
    });
    return this._poll(taskId);
  }

  /**
   * Solves a reCAPTCHA v3 challenge.
   *
   * @param {string} siteKey - Sent as `googlekey`.
   * @param {string} pageUrl - Sent as `pageurl`.
   * @param {string} [action="verify"] - Sent as `action`.
   * @returns {Promise<string>} The solution token.
   */
  async solveRecaptchaV3(siteKey, pageUrl, action = "verify") {
    const taskId = await this._submit({
      method: "userrecaptcha",
      googlekey: siteKey,
      pageurl: pageUrl,
      version: "v3",
      action,
    });
    return this._poll(taskId);
  }

  /**
   * Solves a Cloudflare Turnstile challenge.
   *
   * @param {string} siteKey - Sent as `sitekey`.
   * @param {string} pageUrl - Sent as `pageurl`.
   * @returns {Promise<string>} The solution token.
   */
  async solveTurnstile(siteKey, pageUrl) {
    const taskId = await this._submit({
      method: "turnstile",
      sitekey: siteKey,
      pageurl: pageUrl,
    });
    return this._poll(taskId);
  }
}
module.exports = CaptchaSolver;
Mengikis Halaman yang Dilindungi reCAPTCHA
const axios = require("axios");
const cheerio = require("cheerio");
const CaptchaSolver = require("./captcha-solver");
// Solver instance used by scrapeProtectedPage below; "YOUR_API_KEY" is a placeholder for a real CaptchaAI key.
const solver = new CaptchaSolver("YOUR_API_KEY");
/**
 * Fetches `url` and, when the page embeds a reCAPTCHA widget, solves it and
 * re-submits the page as a form POST carrying the token.
 *
 * @param {string} url - Page to fetch and, if needed, to post the token back to.
 * @returns {Promise<*>} The original HTML when no `.g-recaptcha` site key is
 *   found, otherwise the response body of the POST.
 */
async function scrapeProtectedPage(url) {
  const userAgentHeader = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  };

  // Fetch the page with a browser-like User-Agent.
  const pageResponse = await axios.get(url, {
    headers: { ...userAgentHeader },
  });
  const html = pageResponse.data;

  // Look for the widget's site key in the markup.
  const $ = cheerio.load(html);
  const siteKey = $(".g-recaptcha").attr("data-sitekey");

  if (siteKey) {
    console.log("Site key found:", siteKey);

    // Obtain a token from the solver; only a prefix of it is logged.
    const token = await solver.solveRecaptchaV2(siteKey, url);
    console.log("Token received:", token.substring(0, 50));

    // Post the token back as URL-encoded form data.
    const form = new URLSearchParams();
    form.append("g-recaptcha-response", token);
    form.append("q", "search query");

    const postResponse = await axios.post(url, form, {
      headers: {
        "Content-Type": "application/x-www-form-urlencoded",
        ...userAgentHeader,
      },
    });
    return postResponse.data;
  }

  console.log("No CAPTCHA found, page loaded directly");
  return html;
}
Scraping Beberapa Halaman Secara Concurrent
/**
 * Solves a reCAPTCHA v2 for every URL and POSTs the token to that URL,
 * processing several URLs at a time through a shared work queue.
 *
 * A failure on one URL is recorded in the results and does not stop the
 * others. Results are appended as each URL finishes, so their order can
 * differ from the order of `urls`.
 *
 * @param {string[]} urls - Pages to process; the array itself is not modified.
 * @param {string} siteKey - reCAPTCHA site key used for every URL.
 * @param {number} [concurrency=3] - Desired number of parallel workers. Values
 *   that are zero, negative, fractional or not a number are clamped so that at
 *   least one worker runs and no more workers than URLs are started.
 * @returns {Promise<Array<{url: string, success: boolean, data?: *, error?: string}>>}
 */
async function scrapePages(urls, siteKey, concurrency = 3) {
  const results = [];
  const queue = [...urls];

  const worker = async () => {
    // Each worker pulls the next URL until the shared queue is drained.
    while (queue.length > 0) {
      const url = queue.shift();
      try {
        const token = await solver.solveRecaptchaV2(siteKey, url);
        const { data } = await axios.post(
          url,
          new URLSearchParams({ "g-recaptcha-response": token }),
          {
            headers: {
              "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            },
          }
        );
        results.push({ url, data, success: true });
        console.log(`Scraped: ${url}`);
      } catch (err) {
        results.push({ url, error: err.message, success: false });
        console.error(`Failed: ${url} - ${err.message}`);
      }
    }
  };

  // Without clamping, concurrency 0 would start no worker and silently return
  // an empty result, and a fractional or negative value would throw from the
  // array constructor.
  const requested = Number(concurrency) || 1;
  const workerCount = Math.max(1, Math.floor(Math.min(requested, queue.length)));

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
// Usage
const urls = [
  "https://example.com/page/1",
  "https://example.com/page/2",
  "https://example.com/page/3",
];

// The snippets in this file load modules with `require`, i.e. they are
// CommonJS, where `await` is only allowed inside an async function. The call
// is therefore wrapped in an async function instead of being awaited at the
// top level, and a rejection is reported rather than left unhandled.
(async () => {
  const results = await scrapePages(urls, "6Le-wvkS...", 3);
  const succeeded = results.filter((r) => r.success).length;
  console.log(`Finished: ${succeeded}/${results.length} pages scraped`);
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Menangani Cookie dan Sesi
Gunakan axios dengan persistensi cookie untuk situs yang memerlukan session cookie. Instal dahulu paket pendukungnya dengan `npm install axios-cookiejar-support tough-cookie`:
const { wrapper } = require("axios-cookiejar-support");
const { CookieJar } = require("tough-cookie");
// Cookie jar handed to the axios instance below so cookies are kept between requests.
const jar = new CookieJar();
// axios instance wrapped with cookie-jar support; every request made through
// `client` uses `jar` and sends the same browser-like User-Agent header.
const client = wrapper(
axios.create({
jar,
headers: {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
},
})
);
/**
 * Loads `url` through the cookie-aware client, solves a reCAPTCHA v2 for it,
 * and posts the token back through the same client.
 *
 * @param {string} url - Page to load and to post the token to.
 * @param {string} siteKey - reCAPTCHA site key for the page.
 * @returns {Promise<*>} Response body of the POST.
 */
async function scrapeWithSession(url, siteKey) {
  // The first request exists only so the client can pick up the site's cookies.
  await client.get(url);

  const captchaToken = await solver.solveRecaptchaV2(siteKey, url);

  // Same client again, so the POST goes out with the cookies collected above.
  const form = new URLSearchParams();
  form.append("g-recaptcha-response", captchaToken);
  const { data } = await client.post(url, form);
  return data;
}
Parsing Hasil dengan Cheerio
/**
 * Extracts the result entries from an HTML page.
 *
 * @param {string} html - Markup to parse.
 * @returns {Array<{title: string, url: (string|undefined), description: string}>}
 *   One object per `.result-item` element, in document order: trimmed text of
 *   `.title` and `.description`, and the `href` of the `a` element found
 *   inside the item.
 */
function parseResults(html) {
  const $ = cheerio.load(html);

  return $(".result-item")
    .toArray()
    .map((node) => {
      const item = $(node);
      const title = item.find(".title").text().trim();
      const url = item.find("a").attr("href");
      const description = item.find(".description").text().trim();
      return { title, url, description };
    });
}
Pemecahan Masalah
| Masalah | Penyebab | Solusi |
|---|---|---|
| `CAPCHA_NOT_READY` berputar tanpa batas | Sitekey salah atau solve lambat | Verifikasi sitekey; naikkan timeout |
| 403 Forbidden saat POST | Cookie atau header tidak ada | Gunakan session cookie; tambahkan header Referer |
| Cheerio tidak menemukan elemen | Konten dinamis | Gunakan Puppeteer untuk situs yang dirender JS |
| ECONNREFUSED | Rate limiting oleh situs target | Tambahkan delay; rotasi proxy |
Pertanyaan Umum
Kapan saya harus menggunakan Puppeteer daripada axios?
Gunakan axios + cheerio ketika situs target mengembalikan HTML dengan form submit standar. Gunakan Puppeteer ketika situs memerlukan eksekusi JavaScript, rendering dinamis, atau interaksi pengguna yang kompleks.
Bisakah saya menyelesaikan beberapa CAPTCHA sekaligus?
Ya. Submit beberapa tugas CAPTCHA ke CaptchaAI secara concurrent dan poll setiap hasilnya. Contoh concurrent scraping di atas menunjukkan pola ini.
Bagaimana cara menangani situs yang dilindungi Cloudflare?
Jika situs menggunakan Cloudflare Turnstile, gunakan `solver.solveTurnstile()`. Untuk halaman Cloudflare Challenge penuh, gunakan Pemecahan Cloudflare Challenge yang mengembalikan cookie `cf_clearance`.
Panduan Terkait
- Pemecahan CAPTCHA Puppeteer dengan Node.js
- Scraping CAPTCHA dengan Python
- Rotasi Proxy untuk Scraping CAPTCHA