Platform keuangan seperti screener saham, SEC EDGAR, dan platform trading melindungi data dengan CAPTCHA untuk mencegah ekstraksi otomatis. CaptchaAI menangani challenge ini secara programatik sehingga Anda dapat mengumpulkan data pasar dalam skala besar.
Di Mana CAPTCHA Muncul di Sektor Keuangan
| Sumber | Tipe CAPTCHA | Pemicu | Nilai Data |
|---|---|---|---|
| SEC EDGAR | reCAPTCHA v2 | Request volume tinggi | Filing perusahaan |
| Yahoo Finance | reCAPTCHA v2 | Deteksi scraping | Harga saham, riwayat |
| Bloomberg | Cloudflare Turnstile | Semua akses otomatis | Data pasar |
| Finviz | reCAPTCHA v2 | Akses stock screener | Hasil screener |
| TradingView | Cloudflare Challenge | Rate limiting | Chart, indikator |
| Morningstar | reCAPTCHA v3 | Halaman ekspor data | Analisis reksa dana |
Scraping Stock Screener
import requests
import time
from bs4 import BeautifulSoup
import re
# API key sent as the "key" field on every CaptchaAI request; replace the
# placeholder with a real key before running.
CAPTCHAAI_KEY = "YOUR_API_KEY"
# Base URL to which the "/in.php" (submit) and "/res.php" (poll) paths are appended.
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_captcha(method, sitekey, pageurl, **kwargs):
    """Submit a CAPTCHA task to CaptchaAI and poll until a result is ready.

    Args:
        method: Value sent as the ``method`` form field (this file uses
            ``"userrecaptcha"`` and ``"turnstile"``).
        sitekey: Site key taken from the target page; sent as ``googlekey``.
        pageurl: URL of the page that presented the challenge.
        **kwargs: Extra form fields merged into the submit request. They
            override the default fields on key collision.

    Returns:
        The ``request`` field of the first successful poll response.

    Raises:
        RuntimeError: If the API reports a failure on submit or while polling.
        TimeoutError: If no result is available after 60 polls, 5 s apart.
        requests.RequestException: On transport errors or HTTP error status.
    """
    payload = {
        "key": CAPTCHAAI_KEY,
        "method": method,
        # NOTE(review): the site key is always sent as "googlekey", including
        # for method="turnstile" -- confirm against the CaptchaAI API docs
        # whether Turnstile expects a different field name.
        "googlekey": sitekey,
        "pageurl": pageurl,
        "json": 1,
    }
    payload.update(kwargs)

    # Explicit timeouts: requests waits forever by default.
    submit = requests.post(f"{CAPTCHAAI_URL}/in.php", data=payload, timeout=30)
    submit.raise_for_status()
    submitted = submit.json()
    # Assumes the JSON envelope is {"status": 1, "request": ...} on success;
    # str() tolerates the status arriving as either 1 or "1".
    if str(submitted.get("status")) != "1":
        raise RuntimeError(
            f"CaptchaAI submit failed: {submitted.get('request')}"
        )
    task_id = submitted["request"]

    for _ in range(60):
        time.sleep(5)
        poll = requests.get(
            f"{CAPTCHAAI_URL}/res.php",
            params={
                "key": CAPTCHAAI_KEY,
                "action": "get",
                "id": task_id,
                "json": 1,
            },
            timeout=30,
        )
        poll.raise_for_status()
        answer = poll.json()
        if str(answer.get("status")) == "1":
            return answer["request"]
        if answer.get("request") != "CAPCHA_NOT_READY":
            # Anything other than "not ready" is an error code; never hand
            # it back to the caller as if it were a token.
            raise RuntimeError(
                f"CaptchaAI solve failed: {answer.get('request')}"
            )
    raise TimeoutError("Solve timeout")
class FinancialScraper:
    """Fetch a stock-screener page and parse its result table.

    A single ``requests.Session`` is reused so headers, cookies and the
    optional proxy stay consistent across requests.
    """

    def __init__(self, proxy=None):
        """Create the session.

        Args:
            proxy: Optional proxy URL applied to both HTTP and HTTPS traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def scrape_screener(self, url):
        """Scrape stock screener, handling CAPTCHA if triggered.

        Args:
            url: Screener page URL.

        Returns:
            List of dicts with keys ``ticker``, ``company``, ``sector``,
            ``price`` and ``change`` (empty if no matching rows are found).
        """
        resp = self.session.get(url, timeout=30)

        # A data-sitekey attribute in the HTML is treated as a CAPTCHA page.
        sitekey = self._extract_sitekey(resp.text)
        if sitekey:
            token = solve_captcha("userrecaptcha", sitekey, url)
            # Resubmit with the token, using the same timeout as the GET so
            # a stalled server cannot hang the scraper.
            resp = self.session.post(
                url,
                data={"g-recaptcha-response": token},
                timeout=30,
            )

        return self._parse_stocks(resp.text)

    def _extract_sitekey(self, html):
        """Return the first ``data-sitekey`` attribute value in *html*, or None."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        return match.group(1) if match else None

    def _parse_stocks(self, html):
        """Extract stock rows from the ``table.screener-table`` element.

        The first ``tr`` is skipped (presumably the header row), and rows
        with fewer than 8 cells are ignored.
        """
        soup = BeautifulSoup(html, "html.parser")
        stocks = []
        for row in soup.select("table.screener-table tr")[1:]:
            cols = row.select("td")
            if len(cols) < 8:
                continue
            stocks.append({
                "ticker": cols[1].get_text(strip=True),
                "company": cols[2].get_text(strip=True),
                "sector": cols[3].get_text(strip=True),
                "price": cols[6].get_text(strip=True),
                "change": cols[7].get_text(strip=True),
            })
        return stocks
# Usage: scrape one screener page through a proxy and show the first five rows.
scraper = FinancialScraper(proxy="http://user:pass@residential.proxy.com:5000")
stocks = scraper.scrape_screener(
    "https://screener.example.com/screener.ashx?v=111"
)

for stock in stocks[:5]:
    print(f"{stock['ticker']}: {stock['price']} ({stock['change']})")
Ekstraksi Filing SEC EDGAR
SEC EDGAR menerapkan rate limiting dan CAPTCHA untuk akses volume tinggi:
import json
class SECFilingScraper:
    """Search and download SEC EDGAR filings through one shared session."""

    BASE_URL = "https://efts.sec.gov/LATEST"

    def __init__(self, user_agent_email, proxy=None):
        """Create the session with an identifying User-Agent.

        Args:
            user_agent_email: Either a full contact address
                (``"ops@example.com"``) or a bare domain (``"example.com"``),
                which is expanded to ``admin@<domain>``.
            proxy: Optional proxy URL applied to both HTTP and HTTPS traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        # SEC requires identifying User-Agent
        self.session.headers.update({
            "User-Agent": self._build_user_agent(user_agent_email),
            "Accept": "application/json",
        })

    @staticmethod
    def _build_user_agent(user_agent_email):
        """Return the ``CompanyName <contact>`` User-Agent string.

        A value that already contains ``@`` is used as-is; blindly
        prefixing ``admin@`` would yield a malformed ``admin@user@host``.
        """
        if "@" in user_agent_email:
            contact = user_agent_email
        else:
            contact = f"admin@{user_agent_email}"
        return f"CompanyName {contact}"

    def search_filings(self, company, filing_type="10-K"):
        """Search EDGAR for specific filing types.

        Args:
            company: Query string sent as ``q``.
            filing_type: Form type sent as ``forms``.

        Returns:
            The decoded JSON body, or ``{}`` when the final response is not
            HTTP 200 or its body is not valid JSON.
        """
        url = f"{self.BASE_URL}/search-index"
        params = {
            "q": company,
            "dateRange": "custom",
            "forms": filing_type,
        }
        resp = self.session.get(url, params=params, timeout=30)

        # Handle CAPTCHA if triggered
        if "captcha" in resp.text.lower() or resp.status_code == 403:
            sitekey = self._extract_sitekey(resp.text)
            if sitekey:
                token = solve_captcha("userrecaptcha", sitekey, url)
                resp = self.session.post(
                    url,
                    data={**params, "g-recaptcha-response": token},
                    timeout=30,
                )

        if resp.status_code != 200:
            return {}
        try:
            return resp.json()
        except ValueError:
            # A 200 response can still carry an HTML challenge page rather
            # than JSON; report "no results" instead of raising.
            return {}

    def download_filing(self, filing_url):
        """Download individual filing document.

        Returns:
            The response text on HTTP 200, otherwise ``None``.
        """
        resp = self.session.get(filing_url, timeout=60)
        if resp.status_code == 200:
            return resp.text
        return None

    def _extract_sitekey(self, html):
        """Return the first ``data-sitekey`` attribute value in *html*, or None."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        return match.group(1) if match else None
# Usage: search EDGAR for Apple's 10-K filings through a proxy.
sec = SECFilingScraper(
    user_agent_email="example.com",
    proxy="http://user:pass@proxy.example.com:5000",
)
filings = sec.search_filings("Apple Inc", "10-K")
Data Pasar yang Dilindungi Turnstile
def scrape_turnstile_market_data(url, sitekey):
    """Handle Cloudflare Turnstile on financial data sites.

    Obtains a token via ``solve_captcha`` and POSTs it to *url* as
    ``cf-turnstile-response``.

    Args:
        url: Endpoint that accepts the Turnstile token.
        sitekey: Turnstile site key for that page.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (including
        when a 200 response does not contain valid JSON).
    """
    token = solve_captcha("turnstile", sitekey, url)

    # Context manager closes the session's pooled connections on exit;
    # the response body is already fully read, so it stays usable.
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        })
        resp = session.post(
            url,
            data={"cf-turnstile-response": token},
            timeout=30,
        )

    if resp.status_code != 200:
        return None
    try:
        return resp.json()
    except ValueError:
        # Non-JSON body: treated like any other failed fetch.
        return None
Pengumpulan Data Terjadwal
import csv
from datetime import datetime
def daily_market_snapshot(tickers, output_dir="data"):
    """Collect daily stock data, handling CAPTCHAs automatically.

    Args:
        tickers: Iterable of ticker symbols; one quote page is fetched each.
        output_dir: Directory for the CSV file; created if it is missing.

    Returns:
        The list of row dicts that was written to
        ``<output_dir>/market_<YYYY-MM-DD>.csv``.
    """
    import os  # local import: keeps the snippet self-contained

    scraper = FinancialScraper(
        proxy="http://user:pass@residential.proxy.com:5000"
    )
    date_str = datetime.now().strftime("%Y-%m-%d")
    results = []

    for ticker in tickers:
        url = f"https://screener.example.com/quote.ashx?t={ticker}"
        try:
            data = scraper.scrape_screener(url)
            if data:
                results.extend(data)
        except Exception as e:
            # Best effort: one failing ticker must not abort the whole run.
            print(f"Error on {ticker}: {e}")
        # Rate limit after every attempt, including failed ones, so errors
        # do not turn into a burst of back-to-back requests.
        time.sleep(2)

    # Create the directory first; otherwise open() would fail only after
    # all the scraping work has already been done.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to CSV
    filepath = f"{output_dir}/market_{date_str}.csv"
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f, fieldnames=["ticker", "company", "sector", "price", "change"]
        )
        writer.writeheader()
        writer.writerows(results)

    print(f"Saved {len(results)} records to {filepath}")
    return results
# Run daily: snapshot a fixed watchlist into the default output directory.
tickers = [
    "AAPL",
    "GOOGL",
    "MSFT",
    "AMZN",
    "TSLA",
]
daily_market_snapshot(tickers)
Best Practice Rate Limiting
Situs keuangan lebih ketat terhadap akses otomatis:
| Praktik | Rekomendasi |
|---|---|
| Delay antar request | 2–5 detik per halaman |
| Concurrent connection | Maks 3–5 per domain |
| Tipe proxy | Residensial atau ISP |
| Durasi sesi | Sticky session 5–10 menit |
| User-Agent | Realistis, konsisten per sesi |
| SEC EDGAR | Sertakan email kontak di UA (wajib) |
| Jam pasar | Scrape di luar jam sibuk jika memungkinkan |
Pemecahan Masalah
| Masalah | Penyebab | Perbaikan |
|---|---|---|
| 403 di SEC EDGAR | Header User-Agent tidak menyertakan email kontak | Tambahkan header User-Agent berformat CompanyName email@domain |
| CAPTCHA di setiap request | Rate limit terlampaui | Tambahkan delay 3–5 detik antar request |
| Data harga basi | Respons ter-cache | Tambahkan query parameter cache-bust |
| JSON parse error | Server mengembalikan halaman CAPTCHA (HTML), bukan JSON | Cek CAPTCHA sebelum parsing |
| IP diblokir | Terlalu banyak request dari IP yang sama | Gunakan rotasi alamat IP keluar (egress) pada jaringan yang diotorisasi |
Pertanyaan Umum
Apakah scraping data keuangan legal?
Mengakses data keuangan publik (filing SEC, harga saham) secara umum diperbolehkan. Selalu hormati terms of service dan rate limit setiap situs. SEC secara eksplisit mengizinkan akses ke EDGAR untuk tujuan penelitian.
Mengapa situs keuangan menggunakan CAPTCHA?
Untuk mencegah ekstraksi otomatis volume tinggi yang dapat memungkinkan manipulasi pasar atau pengumpulan intelijen kompetitif, atau yang menyebabkan beban server berlebihan.
Seberapa sering saya harus mengumpulkan data pasar?
Untuk harga saham: maksimal sekali per menit saat jam pasar. Untuk filing: biasanya sekali sehari. Scraping berlebihan memicu CAPTCHA lebih cepat.
Panduan Terkait
- Rotating egress jaringan yang diotorisasi untuk Solve CAPTCHA
Kumpulkan data keuangan tanpa gangguan CAPTCHA — dapatkan API key CaptchaAI Anda dan otomatiskan riset pasar.