Integrasi

Colly + CaptchaAI: Go Scraping dengan CAPTCHA Solving

Colly adalah framework web scraping Go yang populer. Berikut cara mengintegrasikan CaptchaAI untuk menangani CAPTCHA di scraper Go Anda.


Klien CaptchaAI di Go

package captchaai

import (
    "encoding/json"
    "errors"
    "fmt"
    "net/http"
    "net/url"
    "strings"
    "time"
)

// Client bundles the credentials and HTTP transport used to talk to the
// CaptchaAI HTTP API.
type Client struct {
    // APIKey is sent as the "key" parameter on every submit and poll request.
    APIKey     string
    // HTTPClient performs the submit (PostForm) and poll (Get) requests.
    HTTPClient *http.Client
}

// apiResponse is the JSON envelope decoded from both the submit and the poll
// endpoints.
type apiResponse struct {
    // Status is compared against 1 by callers; 1 is treated as success.
    Status  int    `json:"status"`
    // Request carries the payload: the task ID after a successful submit, the
    // solved token after a successful poll, and otherwise a status/error
    // string such as "CAPCHA_NOT_READY".
    Request string `json:"request"`
}

// NewClient builds a Client for the given API key. The returned client owns
// its own http.Client configured with a 30-second request timeout.
func NewClient(apiKey string) *Client {
    transport := new(http.Client)
    transport.Timeout = 30 * time.Second

    client := new(Client)
    client.APIKey = apiKey
    client.HTTPClient = transport
    return client
}

// SolveRecaptchaV2 submits a reCAPTCHA v2 task for the given sitekey and page
// URL and polls until the service returns a token.
//
// It returns the solved token on success. An error is returned when the
// submit request fails, the submit response cannot be decoded, the API
// reports a non-success status on submit, the API reports a terminal error
// while polling, or no token arrives within the polling window. The window is
// a 15s wait before the first poll followed by 5s waits before each of the
// remaining 23 polls.
func (c *Client) SolveRecaptchaV2(sitekey, pageurl string) (string, error) {
    // Submit task
    data := url.Values{
        "key":       {c.APIKey},
        "method":    {"userrecaptcha"},
        "googlekey": {sitekey},
        "pageurl":   {pageurl},
        "json":      {"1"},
    }

    resp, err := c.HTTPClient.PostForm("https://ocr.captchaai.com/in.php", data)
    if err != nil {
        return "", fmt.Errorf("submit error: %w", err)
    }
    defer resp.Body.Close()

    var result apiResponse
    if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
        return "", fmt.Errorf("decode error: %w", err)
    }

    if result.Status != 1 {
        return "", fmt.Errorf("submit failed: %s", result.Request)
    }

    // The poll URL never changes between attempts, so build it once.
    // url.Values.Encode escapes the key and task ID instead of splicing them
    // into the query string verbatim.
    query := url.Values{
        "key":    {c.APIKey},
        "action": {"get"},
        "id":     {result.Request},
        "json":   {"1"},
    }
    pollURL := "https://ocr.captchaai.com/res.php?" + query.Encode()

    // Poll for result. Sleeping at the top of the loop keeps the original
    // schedule (15s, then every 5s) without a useless sleep after the final
    // attempt.
    var lastErr error
    wait := 15 * time.Second
    for i := 0; i < 24; i++ {
        time.Sleep(wait)
        wait = 5 * time.Second

        resp, err := c.HTTPClient.Get(pollURL)
        if err != nil {
            // Transport failures are treated as transient: retry.
            lastErr = err
            continue
        }

        var pollResult apiResponse
        err = json.NewDecoder(resp.Body).Decode(&pollResult)
        resp.Body.Close()
        if err != nil {
            // An undecodable body (e.g. a gateway error page) must not be
            // mistaken for a terminal API error with an empty message.
            lastErr = fmt.Errorf("decode error: %w", err)
            continue
        }
        lastErr = nil

        if pollResult.Status == 1 {
            return pollResult.Request, nil
        }
        // "CAPCHA_NOT_READY" is the literal string this code expects from the
        // API while the task is pending; anything else is terminal.
        if pollResult.Request != "CAPCHA_NOT_READY" {
            return "", fmt.Errorf("solve error: %s", pollResult.Request)
        }
    }

    if lastErr != nil {
        return "", fmt.Errorf("solve timeout: %w", lastErr)
    }
    return "", errors.New("solve timeout")
}

Integrasi Colly

package main

import (
    "fmt"
    "log"
    "os"
    "strings"

    "github.com/gocolly/colly/v2"
)

// main crawls https://example.com/data with Colly. When a page contains an
// element carrying a data-sitekey attribute, it asks CaptchaAI for a
// reCAPTCHA v2 token and posts it back to the same URL; table rows found on
// any page are printed to stdout.
func main() {
    apiKey := os.Getenv("CAPTCHAAI_API_KEY")
    if apiKey == "" {
        // Fail fast instead of spending a crawl on requests the API will reject.
        log.Fatal("CAPTCHAAI_API_KEY is not set")
    }
    // NOTE(review): captchaai is not in this file's import list; import the
    // client package by its module path.
    solver := captchaai.NewClient(apiKey)

    c := colly.NewCollector(
        colly.AllowedDomains("example.com"),
        colly.MaxDepth(2),
    )

    // Detect CAPTCHA pages
    c.OnHTML("[data-sitekey]", func(e *colly.HTMLElement) {
        sitekey := e.Attr("data-sitekey")
        if sitekey == "" {
            // The attribute exists but is empty; there is nothing to solve.
            return
        }
        pageURL := e.Request.URL.String()

        log.Printf("CAPTCHA detected on %s, solving...", pageURL)

        token, err := solver.SolveRecaptchaV2(sitekey, pageURL)
        if err != nil {
            log.Printf("Solve failed: %v", err)
            return
        }

        log.Printf("CAPTCHA solved, token length: %d", len(token))

        // Post form with token.
        // NOTE(review): if the response still contains a data-sitekey element
        // this handler fires again; confirm the target accepts the token.
        err = c.Post(pageURL, map[string]string{
            "g-recaptcha-response": token,
        })
        if err != nil {
            log.Printf("Form submit failed: %v", err)
        }
    })

    // Extract data
    c.OnHTML("table tr", func(e *colly.HTMLElement) {
        var cols []string
        e.ForEach("td", func(_ int, td *colly.HTMLElement) {
            cols = append(cols, strings.TrimSpace(td.Text))
        })
        // Rows without <td> cells (e.g. header rows) are skipped.
        if len(cols) > 0 {
            fmt.Printf("Row: %s\n", strings.Join(cols, " | "))
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error %s: %v", r.Request.URL, err)
    })

    if err := c.Visit("https://example.com/data"); err != nil {
        log.Fatalf("Visit failed: %v", err)
    }
}

Colly dengan Rate Limiting

package main

import (
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

// main shows how to throttle a Colly collector before crawling
// https://example.com.
func main() {
    c := colly.NewCollector()

    // Rate limit for every domain: at most one request in flight, with a
    // fixed 3s delay plus up to 2s of extra random delay between requests.
    rule := &colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 1,
        Delay:       3 * time.Second,
        RandomDelay: 2 * time.Second,
    }
    if err := c.Limit(rule); err != nil {
        // Without the rule the crawl would run unthrottled, so stop here.
        log.Fatalf("Limit rule rejected: %v", err)
    }

    // ... CAPTCHA handling as above ...

    if err := c.Visit("https://example.com"); err != nil {
        log.Fatalf("Visit failed: %v", err)
    }
}

Turnstile Solving di Go

// SolveTurnstile submits a Turnstile task for the given sitekey and page URL
// and polls until the service returns a token.
//
// It returns the solved token on success. An error is returned when the
// submit request fails, the submit response cannot be decoded, the API
// reports a non-success status on submit, the API reports a terminal error
// while polling, or no token arrives within the polling window. The window is
// a 5s wait before the first poll followed by 3s waits before each of the
// remaining 19 polls.
func (c *Client) SolveTurnstile(sitekey, pageurl string) (string, error) {
    data := url.Values{
        "key":       {c.APIKey},
        "method":    {"turnstile"},
        "sitekey":   {sitekey},
        "pageurl":   {pageurl},
        "json":      {"1"},
    }

    resp, err := c.HTTPClient.PostForm("https://ocr.captchaai.com/in.php", data)
    if err != nil {
        return "", fmt.Errorf("submit error: %w", err)
    }
    defer resp.Body.Close()

    var result apiResponse
    // A body that is not valid JSON must surface as a decode failure rather
    // than as "submit failed" with an empty message.
    if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
        return "", fmt.Errorf("decode error: %w", err)
    }

    if result.Status != 1 {
        return "", fmt.Errorf("submit failed: %s", result.Request)
    }

    // The poll URL never changes between attempts, so build it once.
    // url.Values.Encode escapes the key and task ID instead of splicing them
    // into the query string verbatim.
    query := url.Values{
        "key":    {c.APIKey},
        "action": {"get"},
        "id":     {result.Request},
        "json":   {"1"},
    }
    pollURL := "https://ocr.captchaai.com/res.php?" + query.Encode()

    // Poll. Sleeping at the top of the loop keeps the original schedule
    // (5s, then every 3s) without a useless sleep after the final attempt.
    var lastErr error
    wait := 5 * time.Second
    for i := 0; i < 20; i++ {
        time.Sleep(wait)
        wait = 3 * time.Second

        resp, err := c.HTTPClient.Get(pollURL)
        if err != nil {
            // Transport failures are treated as transient: retry.
            lastErr = err
            continue
        }

        var pr apiResponse
        err = json.NewDecoder(resp.Body).Decode(&pr)
        resp.Body.Close()
        if err != nil {
            // An undecodable body must not be mistaken for a terminal API
            // error with an empty message.
            lastErr = fmt.Errorf("decode error: %w", err)
            continue
        }
        lastErr = nil

        if pr.Status == 1 {
            return pr.Request, nil
        }
        // "CAPCHA_NOT_READY" is the literal string this code expects from the
        // API while the task is pending; anything else is terminal.
        if pr.Request != "CAPCHA_NOT_READY" {
            return "", fmt.Errorf("error: %s", pr.Request)
        }
    }

    if lastErr != nil {
        return "", fmt.Errorf("timeout: %w", lastErr)
    }
    return "", errors.New("timeout")
}

Pertanyaan Umum

Mengapa menggunakan Colly dibandingkan scraper Go lainnya?

Colly adalah framework scraping Go paling populer, dengan fitur bawaan untuk caching, rate limiting, dan penanganan request konkuren. Colly cocok dipadukan dengan HTTP API CaptchaAI.

Bisakah saya menggunakan Colly dengan headless browser?

Untuk halaman yang memerlukan rendering JavaScript, gunakan chromedp atau rod bersama Colly. Gunakan Colly untuk halaman statis dan headless browser untuk halaman dinamis yang dilindungi CAPTCHA.

Apakah API CaptchaAI kompatibel dengan Go?

Ya. CaptchaAI menggunakan endpoint HTTP standar yang berfungsi dengan package net/http Go. Tidak diperlukan SDK.


Panduan Terkait

  • Integrasi Crawlee + CaptchaAI
  • Cara Solve reCAPTCHA v2 dengan API CaptchaAI

Tambahkan CAPTCHA solving ke scraper Go Anda — dapatkan CaptchaAI.

Komentar dinonaktifkan untuk artikel ini.