Portal data healthcare — direktori penyedia, database harga obat, dan registrasi uji klinis — menggunakan CAPTCHA untuk mencegah scraping otomatis. Peneliti dan platform health-tech memerlukan data ini untuk analisis, kepatuhan, dan pengambilan keputusan.
Di Mana CAPTCHA Muncul
| Sumber | Jenis CAPTCHA | Data | Kasus Penggunaan |
|---|---|---|---|
| Direktori penyedia (NPI) | Gambar CAPTCHA | Pencarian dokter/fasilitas | Kecukupan jaringan |
| Portal harga obat | reCAPTCHA v2 | Harga obat | Transparansi harga |
| Pendaftaran uji klinis | reCAPTCHA v2 | Data uji klinis, hasil | Analisis penelitian |
| Formularium asuransi | reCAPTCHA v2 | Daftar cakupan obat | Perbandingan formularium |
| Badan perizinan negara | Gambar CAPTCHA | Verifikasi lisensi | Pemeriksaan kredensial |
| Penilaian kualitas rumah sakit | Cloudflare Turnstile | Metrik kualitas | Analisis kinerja |
Scraper Direktori Penyedia
import requests
import time
import re
import base64
from bs4 import BeautifulSoup
import csv
# Placeholder API key sent as the "key" field of every CaptchaAI request below;
# replace it with a real key before running.
CAPTCHAAI_KEY = "YOUR_API_KEY"
# Base URL of the CaptchaAI HTTP API; "/in.php" (submit) and "/res.php" (poll)
# are appended to it by the solver functions.
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_recaptcha(sitekey, pageurl):
    """Solve a reCAPTCHA v2 challenge through the CaptchaAI HTTP API.

    Submits the site key and page URL to ``in.php`` and then polls
    ``res.php`` every 5 seconds, up to 60 times, until a result is reported.

    Args:
        sitekey: The reCAPTCHA site key of the target page.
        pageurl: URL of the page that hosts the CAPTCHA.

    Returns:
        The solved token to send as ``g-recaptcha-response``.

    Raises:
        RuntimeError: If the API rejects the task or reports a solve error.
        TimeoutError: If no result is ready after all polling attempts.
    """
    resp = requests.post(
        f"{CAPTCHAAI_URL}/in.php",
        data={
            "key": CAPTCHAAI_KEY, "method": "userrecaptcha",
            "googlekey": sitekey, "pageurl": pageurl, "json": 1,
        },
        timeout=30,
    )
    submitted = resp.json()
    # With json=1 a rejected task comes back as status 0 with an error code in
    # "request"; that code must not be mistaken for a task id.
    if submitted.get("status") != 1:
        raise RuntimeError(f"CaptchaAI submit failed: {submitted.get('request')}")
    task_id = submitted["request"]

    for _ in range(60):
        time.sleep(5)
        result = requests.get(
            f"{CAPTCHAAI_URL}/res.php",
            params={
                "key": CAPTCHAAI_KEY, "action": "get",
                "id": task_id, "json": 1,
            },
            timeout=30,
        )
        data = result.json()
        if data.get("status") == 1:
            return data["request"]
        # Anything other than "not ready yet" is a terminal error code, not a
        # token, so fail loudly instead of returning it to the caller.
        if data.get("request") != "CAPCHA_NOT_READY":
            raise RuntimeError(f"CaptchaAI solve failed: {data.get('request')}")
    raise TimeoutError("Timeout")
def solve_image_captcha(image_bytes):
    """Solve an image CAPTCHA through the CaptchaAI HTTP API.

    The image is base64-encoded and submitted to ``in.php``; ``res.php`` is
    then polled every 3 seconds, up to 20 times, until a result is reported.

    Args:
        image_bytes: Raw bytes of the CAPTCHA image.

    Returns:
        The text recognized in the image.

    Raises:
        RuntimeError: If the API rejects the task or reports a solve error.
        TimeoutError: If no result is ready after all polling attempts.
    """
    img_b64 = base64.b64encode(image_bytes).decode()
    resp = requests.post(
        f"{CAPTCHAAI_URL}/in.php",
        data={
            "key": CAPTCHAAI_KEY, "method": "base64",
            "body": img_b64, "json": 1,
        },
        timeout=30,
    )
    submitted = resp.json()
    # With json=1 a rejected task comes back as status 0 with an error code in
    # "request"; that code must not be mistaken for a task id.
    if submitted.get("status") != 1:
        raise RuntimeError(f"CaptchaAI submit failed: {submitted.get('request')}")
    task_id = submitted["request"]

    for _ in range(20):
        time.sleep(3)
        result = requests.get(
            f"{CAPTCHAAI_URL}/res.php",
            params={
                "key": CAPTCHAAI_KEY, "action": "get",
                "id": task_id, "json": 1,
            },
            timeout=30,
        )
        data = result.json()
        if data.get("status") == 1:
            return data["request"]
        # Anything other than "not ready yet" is a terminal error code, not
        # the CAPTCHA text, so it must not be submitted to the portal.
        if data.get("request") != "CAPCHA_NOT_READY":
            raise RuntimeError(f"CaptchaAI solve failed: {data.get('request')}")
    raise TimeoutError("Timeout")
class HealthcareDataCollector:
    """Collect public healthcare data from CAPTCHA-protected portals.

    Wraps a ``requests.Session`` (optionally routed through a proxy) and
    provides search helpers that solve the portal's CAPTCHA before submitting
    the search form, plus HTML parsers for the result pages.
    """

    def __init__(self, proxy=None):
        """Create the HTTP session.

        Args:
            proxy: Optional proxy URL applied to both HTTP and HTTPS traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        })

    @staticmethod
    def _captcha_image_url(portal_url, src):
        """Resolve a CAPTCHA image ``src`` attribute against the portal URL."""
        from urllib.parse import urljoin

        # A root-relative src ("/captcha/...") must replace the portal's path,
        # not be appended to it; urljoin applies the standard URL rules.
        return urljoin(portal_url, src)

    def search_providers(self, portal_url, specialty, location, sitekey=None):
        """Search provider directory with CAPTCHA handling.

        Args:
            portal_url: URL of the search page; the form is posted back to it.
            specialty: Value for the ``specialty`` form field.
            location: Value for the ``location`` form field.
            sitekey: reCAPTCHA site key. When omitted, the page is scanned for
                an image CAPTCHA instead.

        Returns:
            A list of provider dicts as produced by ``_parse_providers``.
        """
        resp = self.session.get(portal_url, timeout=30)
        data = {"specialty": specialty, "location": location}

        # Handle CAPTCHA
        if sitekey:
            data["g-recaptcha-response"] = solve_recaptcha(sitekey, portal_url)
        else:
            captcha_img = re.search(r'src="(/captcha[^"]+)"', resp.text)
            if captcha_img:
                img_url = self._captcha_image_url(portal_url, captcha_img.group(1))
                img = self.session.get(img_url, timeout=30)
                data["captcha"] = solve_image_captcha(img.content)

        resp = self.session.post(portal_url, data=data, timeout=30)
        return self._parse_providers(resp.text)

    def lookup_drug_prices(self, pricing_url, drug_name, zip_code, sitekey):
        """Look up drug prices with CAPTCHA solving.

        Args:
            pricing_url: URL of the price search page; the form is posted to it.
            drug_name: Value for the ``drug`` form field.
            zip_code: Value for the ``zip`` form field.
            sitekey: reCAPTCHA site key of the pricing page.

        Returns:
            A list of price dicts, or an empty list when the response status
            is not 200.
        """
        # Load search page
        self.session.get(pricing_url, timeout=30)

        # Solve CAPTCHA
        token = solve_recaptcha(sitekey, pricing_url)
        resp = self.session.post(
            pricing_url,
            data={
                "drug": drug_name,
                "zip": zip_code,
                "g-recaptcha-response": token,
            },
            timeout=30,
        )
        if resp.status_code == 200:
            return self._parse_prices(resp.text)
        return []

    def batch_provider_lookup(self, portal_url, specialties, locations,
                              output_file, sitekey=None):
        """Batch search across specialties and locations.

        Every specialty/location pair is searched; a failing pair is reported
        on stdout and skipped. When any providers were found they are written
        to ``output_file`` as CSV.

        Args:
            portal_url: URL of the provider search page.
            specialties: Iterable of specialties to search for.
            locations: Iterable of locations to search in.
            output_file: Path of the CSV file to write.
            sitekey: Optional reCAPTCHA site key forwarded to
                ``search_providers``.

        Returns:
            The list of all provider dicts collected.
        """
        all_providers = []
        for specialty in specialties:
            for location in locations:
                try:
                    providers = self.search_providers(
                        portal_url, specialty, location, sitekey=sitekey,
                    )
                except Exception as e:
                    # Best-effort batch: report the failure and keep going.
                    print(f"Error: {specialty} / {location}: {e}")
                else:
                    for p in providers:
                        p["specialty_search"] = specialty
                        p["location_search"] = location
                    all_providers.extend(providers)
                    print(f"{specialty} / {location}: {len(providers)} providers")
                # Pause after failures too, so an error does not lead to
                # back-to-back requests against the portal.
                time.sleep(5)

        # Export
        if all_providers:
            keys = all_providers[0].keys()
            with open(output_file, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=keys)
                writer.writeheader()
                writer.writerows(all_providers)
        return all_providers

    def _parse_providers(self, html):
        """Extract provider records from a search-result page."""
        soup = BeautifulSoup(html, "html.parser")
        return [
            {
                "name": self._text(card, ".name, .provider-name"),
                "specialty": self._text(card, ".specialty"),
                "address": self._text(card, ".address"),
                "phone": self._text(card, ".phone"),
                "accepting": self._text(card, ".accepting-patients"),
            }
            for card in soup.select(".provider-card, .doctor-result, tr.provider")
        ]

    def _parse_prices(self, html):
        """Extract pharmacy price records from a price-result page."""
        soup = BeautifulSoup(html, "html.parser")
        return [
            {
                "pharmacy": self._text(row, ".pharmacy-name"),
                "price": self._text(row, ".price, .drug-price"),
                "quantity": self._text(row, ".quantity"),
            }
            for row in soup.select(".pharmacy-row, .price-result")
        ]

    def _text(self, el, selector):
        """Return the stripped text of the first match, or "" if none."""
        found = el.select_one(selector)
        return found.get_text(strip=True) if found else ""
# Example usage: send all requests through a residential proxy.
collector = HealthcareDataCollector(proxy="http://user:pass@residential.proxy.com:5000")

# Provider directory search; no site key is passed for this portal.
providers = collector.search_providers(
    "https://provider-directory.example.com/search",
    "Cardiology",
    "New York, NY",
)

# Drug price comparison on a portal protected by reCAPTCHA.
prices = collector.lookup_drug_prices(
    "https://drug-prices.example.com/compare",
    "atorvastatin",
    "10001",
    "6Lc_xxxxxxx",
)
Pengumpulan Data Uji Klinis
def collect_clinical_trials(search_url, condition, sitekey,
                            proxy="http://user:pass@residential.proxy.com:5000"):
    """Collect clinical trial data for a medical condition.

    Solves the registry's reCAPTCHA, posts a search restricted to recruiting
    studies, and parses the result list.

    Args:
        search_url: URL the search form is posted to.
        condition: Value for the ``condition`` form field.
        sitekey: reCAPTCHA site key of the search page.
        proxy: Proxy URL used for the HTTP session. Defaults to the
            placeholder proxy from the example.

    Returns:
        A list of trial dicts (title, status, sponsor, phase, enrollment,
        location), or an empty list when the response status is not 200.
    """
    collector = HealthcareDataCollector(proxy=proxy)
    token = solve_recaptcha(sitekey, search_url)
    resp = collector.session.post(
        search_url,
        data={
            "condition": condition,
            "status": "recruiting",
            "g-recaptcha-response": token,
        },
        # Without a timeout a stalled portal would block the call forever.
        timeout=30,
    )
    if resp.status_code != 200:
        return []

    # Output field name -> CSS selector inside each result item.
    selectors = {
        "title": ".title, h3",
        "status": ".status",
        "sponsor": ".sponsor",
        "phase": ".phase",
        "enrollment": ".enrollment",
        "location": ".location",
    }
    soup = BeautifulSoup(resp.text, "html.parser")
    return [
        {field: collector._text(item, css) for field, css in selectors.items()}
        for item in soup.select(".trial-item, .study-result")
    ]
Pertimbangan Privasi Data
| Tipe Data | Sensitivitas | Rekomendasi |
|---|---|---|
| Direktori penyedia | Rendah (info publik) | Umumnya aman untuk dikumpulkan |
| Penetapan harga obat | Rendah (harga publik) | Diizinkan untuk transparansi |
| Metadata uji klinis | Rendah (registrasi publik) | Penggunaan penelitian sesuai |
| Ulasan pasien | Sedang | Anonimkan sebelum analisis |
| Detail paket asuransi | Rendah (tarif dipublikasikan) | Diizinkan untuk perbandingan |
Penting: Jangan pernah mencoba mengumpulkan informasi kesehatan yang dilindungi (PHI). Fokus hanya pada data yang tersedia untuk umum dan tidak spesifik untuk pasien.
Pemecahan Masalah
| Masalah | Penyebab | Solusi |
|---|---|---|
| Image CAPTCHA tidak bisa dibaca | Gambar kualitas rendah | Coba lagi — gambar baru akan di-generate |
| Pencarian penyedia mengembalikan hasil kosong | CAPTCHA memblokir pencarian | Selesaikan CAPTCHA sebelum mengirim formulir |
| Harga obat bervariasi per lokasi | Harga berbasis geografis | Cocokkan lokasi proxy dengan kode pos |
| Sesi berakhir pada multi-halaman | Timeout portal | Selesaikan pencarian lebih cepat |
| Rate limit pada pencarian batch | Terlalu banyak request | Tambahkan delay 5-10 detik |
Pertanyaan Umum
Apakah pengumpulan data harga layanan kesehatan diperbolehkan?
Transparansi harga obat didorong oleh peraturan (CMS Price Transparency Rule). Data direktori penyedia publik umumnya dapat diakses.
Bisakah saya membandingkan harga obat antar apotek?
Ya. Layanan seperti GoodRx melakukan hal ini dalam skala besar. CaptchaAI menangani CAPTCHA yang digunakan portal harga untuk membatasi akses otomatis.
Bagaimana cara menangani HIPAA saat melakukan scraping situs layanan kesehatan?
HIPAA berlaku untuk informasi kesehatan yang dilindungi (PHI). Data publik seperti direktori penyedia, harga obat, dan registrasi uji klinis bukanlah PHI. Jangan pernah melakukan scraping terhadap catatan pasien individu.
Panduan Terkait
- Otomasi Portal Pemerintah
- Scraping Riset Akademik
- Rotating egress jaringan yang diotorisasi
Kumpulkan data healthcare secara efisien — dapatkan API key CaptchaAI dan otomasi pencarian penyedia dan harga.