# flow/src/services/classification.py
# zbq111's picture
# Upload 75 files
# 504b397 verified
"""Image classification solvers for various captcha types.
Supports HCaptchaClassification, ReCaptchaV2Classification,
FunCaptchaClassification, and AwsClassification task types.
All classification tasks send images + question text to an OpenAI-compatible
vision model for analysis and return structured coordinate/index results.
"""
from __future__ import annotations
import base64
import io
import json
import logging
import re
from typing import Any
from openai import AsyncOpenAI
from PIL import Image
from ..core.config import Config
# Module-level logger; used below to report failed classification attempts.
log = logging.getLogger(__name__)
# System prompt used for the "HCaptchaClassification" task type.
# Asks the model for {"answer": bool} on single-image questions or
# {"answer": [indices]} (0-indexed) on multi-image grid questions.
HCAPTCHA_SYSTEM_PROMPT = """\
You are an image classification assistant for HCaptcha challenges.
Given a question and one or more base64-encoded images, determine which images match the question.
Return STRICT JSON only. No markdown, no extra text.
For single-image questions (is this image X?):
{"answer": true} or {"answer": false}
For multi-image grid questions (select all images containing X):
{"answer": [0, 2, 5]}
where numbers are 0-indexed positions of matching images.
Rules:
- Return ONLY the JSON object, nothing else.
- Be precise with your classification.
"""
# System prompt used for the "ReCaptchaV2Classification" task type; it is also
# the fallback prompt when the task type is not recognised.
# Asks the model for {"objects": [indices]} over a 3x3 or 4x4 grid.
RECAPTCHA_V2_SYSTEM_PROMPT = """\
You are an image classification assistant for reCAPTCHA v2 challenges.
Given a question and a grid image (3x3 or 4x4), identify which cells match the question.
The image cells are numbered 0-8 (3x3) or 0-15 (4x4), left-to-right, top-to-bottom.
Return STRICT JSON only:
{"objects": [0, 3, 6]}
where numbers are 0-indexed positions of matching cells.
Rules:
- Return ONLY the JSON object, nothing else.
- If no cells match, return {"objects": []}.
"""
# System prompt used for the "FunCaptchaClassification" task type.
# Asks the model for {"objects": [index]} naming the correct grid cell.
FUNCAPTCHA_SYSTEM_PROMPT = """\
You are an image classification assistant for FunCaptcha challenges.
Given a question and a grid image (typically 2x3 = 6 cells), identify which cell
is the correct answer.
Cells are numbered 0-5, left-to-right, top-to-bottom.
Return STRICT JSON only:
{"objects": [3]}
where the number is the 0-indexed position of the correct cell.
Rules:
- Return ONLY the JSON object, nothing else.
- Usually only one cell is correct.
"""
# System prompt used for the "AwsClassification" task type.
# Asks the model for {"objects": [index]} naming the matching image.
AWS_SYSTEM_PROMPT = """\
You are an image classification assistant for AWS CAPTCHA challenges.
Given a question and one or more images, identify the correct answer.
Return STRICT JSON only:
{"objects": [1]}
where the number is the 0-indexed position of the matching image.
Rules:
- Return ONLY the JSON object, nothing else.
"""
class ClassificationSolver:
    """Solve image-classification captchas with an OpenAI-compatible vision model.

    The solver picks a system prompt based on the task type, sends the
    question text together with every supplied image to the chat-completions
    endpoint, and returns the JSON object produced by the model.
    """

    def __init__(self, config: Config) -> None:
        """Store the configuration and create the API client.

        Args:
            config: Application configuration. This class reads
                ``local_base_url``, ``local_api_key``, ``captcha_retries``
                and ``captcha_multimodal_model`` from it.
        """
        self._config = config
        self._client = AsyncOpenAI(
            base_url=config.local_base_url,
            api_key=config.local_api_key,
        )

    async def solve(self, params: dict[str, Any]) -> dict[str, Any]:
        """Classify the captcha described by ``params``.

        Args:
            params: Task parameters. ``type`` selects the system prompt,
                ``question`` (or a string ``queries``) carries the challenge
                text, and ``image`` / ``images`` / ``body`` / list ``queries``
                carry the base64 image data.

        Returns:
            The JSON object returned by the vision model.

        Raises:
            ValueError: If no usable image data is present in ``params``.
            RuntimeError: If every classification attempt fails.
        """
        task_type = params.get("type", "")
        system_prompt = self._get_system_prompt(task_type)
        question = self._extract_question(params)

        # Image field names differ between task types.
        images = self._extract_images(params)
        if not images:
            raise ValueError("No image data provided")

        return await self._classify(system_prompt, question, images)

    @staticmethod
    def _get_system_prompt(task_type: str) -> str:
        """Return the system prompt for ``task_type``.

        Unknown task types fall back to the reCAPTCHA v2 prompt.
        """
        prompts = {
            "HCaptchaClassification": HCAPTCHA_SYSTEM_PROMPT,
            "ReCaptchaV2Classification": RECAPTCHA_V2_SYSTEM_PROMPT,
            "FunCaptchaClassification": FUNCAPTCHA_SYSTEM_PROMPT,
            "AwsClassification": AWS_SYSTEM_PROMPT,
        }
        return prompts.get(task_type, RECAPTCHA_V2_SYSTEM_PROMPT)

    @staticmethod
    def _extract_question(params: dict[str, Any]) -> str:
        """Return the challenge text from ``params``, or ``""`` if absent.

        ``question`` is preferred; ``queries`` is used only when it is a
        string. A list-valued ``queries`` holds images (see
        ``_extract_images``) and must never be sent as the text prompt.
        """
        for key in ("question", "queries"):
            value = params.get(key)
            if isinstance(value, str) and value.strip():
                return value
        return ""

    @staticmethod
    def _extract_images(params: dict[str, Any]) -> list[str]:
        """Collect base64 image strings from the supported param formats.

        Sources, in order: ``image`` (single string), ``images`` (string or
        list), ``body`` (only when nothing was found so far), and ``queries``
        when it is a list of base64 strings (HCaptcha format).

        Values that are not non-empty strings are skipped so that malformed
        input surfaces as "no image data" instead of an ``AttributeError``
        further down the pipeline.
        """

        def usable(value: Any) -> bool:
            return isinstance(value, str) and bool(value.strip())

        images: list[str] = []

        single = params.get("image")
        if usable(single):
            images.append(single)

        batch = params.get("images")
        if isinstance(batch, str):
            batch = [batch]
        if isinstance(batch, (list, tuple)):
            images.extend(item for item in batch if usable(item))

        # ``body`` is only a fallback for when no other image field was given.
        if not images and usable(params.get("body")):
            images.append(params["body"])

        # HCaptcha queries format: list of base64 strings.
        queries = params.get("queries")
        if isinstance(queries, list):
            images.extend(item for item in queries if usable(item))

        return images

    @staticmethod
    def _prepare_image(b64_data: str) -> str:
        """Return ``b64_data`` as a ``data:`` URL.

        Input that already starts with ``data:image`` is returned unchanged.
        Otherwise the payload is decoded and opened with PIL to detect its
        format for the MIME type. Any failure falls back to ``image/png``.
        """
        if b64_data.startswith("data:image"):
            return b64_data
        try:
            img_bytes = base64.b64decode(b64_data)
            # The context manager closes the image once the format is read.
            with Image.open(io.BytesIO(img_bytes)) as img:
                fmt = img.format or "PNG"
            mime = f"image/{fmt.lower()}"
            return f"data:{mime};base64,{b64_data}"
        except Exception:
            # Deliberate best-effort: format sniffing must never block the
            # request, so undecodable data is passed through labelled as PNG.
            return f"data:image/png;base64,{b64_data}"

    async def _classify(
        self, system_prompt: str, question: str, images: list[str]
    ) -> dict[str, Any]:
        """Send the images and question to the model and parse its reply.

        Args:
            system_prompt: Prompt describing the expected JSON answer format.
            question: Challenge text; a generic instruction is used if empty.
            images: Base64 strings or ``data:`` URLs, one per image.

        Returns:
            The JSON object parsed from the model's reply.

        Raises:
            RuntimeError: If all attempts fail; the last underlying error is
                attached as ``__cause__``.
        """
        content: list[dict[str, Any]] = [
            {
                "type": "image_url",
                "image_url": {"url": self._prepare_image(img_b64), "detail": "high"},
            }
            for img_b64 in images
        ]
        user_text = question if question else "Classify this captcha image."
        content.append({"type": "text", "text": user_text})

        # Always try at least once, even if the configured count is <= 0.
        attempts = max(1, self._config.captcha_retries)
        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                response = await self._client.chat.completions.create(
                    model=self._config.captcha_multimodal_model,
                    temperature=0.05,
                    max_tokens=512,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": content},
                    ],
                )
                raw = response.choices[0].message.content or ""
                return self._parse_json(raw)
            except Exception as exc:
                # API errors and unparsable replies are both retried.
                last_error = exc
                log.warning("Classification attempt %d failed: %s", attempt, exc)

        raise RuntimeError(
            f"Classification failed after {attempts} attempts: {last_error}"
        ) from last_error

    @staticmethod
    def _parse_json(text: str) -> dict[str, Any]:
        """Extract a JSON object from the model's reply.

        Accepts bare JSON, JSON inside a Markdown code fence, and JSON
        surrounded by extra prose (the first ``{...}`` object is used).

        Raises:
            ValueError: If no JSON can be decoded (``json.JSONDecodeError``)
                or the decoded value is not a JSON object.
        """
        fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
        cleaned = fenced.group(1) if fenced else text.strip()
        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError:
            # Models sometimes wrap the object in prose despite the prompt;
            # decode the first object and ignore whatever follows it.
            start = cleaned.find("{")
            if start == -1:
                raise
            data, _ = json.JSONDecoder().raw_decode(cleaned, start)
        if not isinstance(data, dict):
            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
        return data