"""Image classification solvers for various captcha types. Supports HCaptchaClassification, ReCaptchaV2Classification, FunCaptchaClassification, and AwsClassification task types. All classification tasks send images + question text to an OpenAI-compatible vision model for analysis and return structured coordinate/index results. """ from __future__ import annotations import base64 import io import json import logging import re from typing import Any from openai import AsyncOpenAI from PIL import Image from ..core.config import Config log = logging.getLogger(__name__) HCAPTCHA_SYSTEM_PROMPT = """\ You are an image classification assistant for HCaptcha challenges. Given a question and one or more base64-encoded images, determine which images match the question. Return STRICT JSON only. No markdown, no extra text. For single-image questions (is this image X?): {"answer": true} or {"answer": false} For multi-image grid questions (select all images containing X): {"answer": [0, 2, 5]} where numbers are 0-indexed positions of matching images. Rules: - Return ONLY the JSON object, nothing else. - Be precise with your classification. """ RECAPTCHA_V2_SYSTEM_PROMPT = """\ You are an image classification assistant for reCAPTCHA v2 challenges. Given a question and a grid image (3x3 or 4x4), identify which cells match the question. The image cells are numbered 0-8 (3x3) or 0-15 (4x4), left-to-right, top-to-bottom. Return STRICT JSON only: {"objects": [0, 3, 6]} where numbers are 0-indexed positions of matching cells. Rules: - Return ONLY the JSON object, nothing else. - If no cells match, return {"objects": []}. """ FUNCAPTCHA_SYSTEM_PROMPT = """\ You are an image classification assistant for FunCaptcha challenges. Given a question and a grid image (typically 2x3 = 6 cells), identify which cell is the correct answer. Cells are numbered 0-5, left-to-right, top-to-bottom. Return STRICT JSON only: {"objects": [3]} where the number is the 0-indexed position of the correct cell. Rules: - Return ONLY the JSON object, nothing else. - Usually only one cell is correct. """ AWS_SYSTEM_PROMPT = """\ You are an image classification assistant for AWS CAPTCHA challenges. Given a question and one or more images, identify the correct answer. Return STRICT JSON only: {"objects": [1]} where the number is the 0-indexed position of the matching image. Rules: - Return ONLY the JSON object, nothing else. """ class ClassificationSolver: """Solves image classification captchas using a vision model.""" def __init__(self, config: Config) -> None: self._config = config self._client = AsyncOpenAI( base_url=config.local_base_url, api_key=config.local_api_key, ) async def solve(self, params: dict[str, Any]) -> dict[str, Any]: task_type = params.get("type", "") system_prompt = self._get_system_prompt(task_type) question = params.get("question", "") or params.get("queries", "") # Handle different image field names across task types images = self._extract_images(params) if not images: raise ValueError("No image data provided") result = await self._classify(system_prompt, question, images) return result @staticmethod def _get_system_prompt(task_type: str) -> str: prompts = { "HCaptchaClassification": HCAPTCHA_SYSTEM_PROMPT, "ReCaptchaV2Classification": RECAPTCHA_V2_SYSTEM_PROMPT, "FunCaptchaClassification": FUNCAPTCHA_SYSTEM_PROMPT, "AwsClassification": AWS_SYSTEM_PROMPT, } return prompts.get(task_type, RECAPTCHA_V2_SYSTEM_PROMPT) @staticmethod def _extract_images(params: dict[str, Any]) -> list[str]: """Extract base64 image(s) from various param formats.""" images: list[str] = [] if "image" in params: images.append(params["image"]) if "images" in params: imgs = params["images"] if isinstance(imgs, list): images.extend(imgs) elif isinstance(imgs, str): images.append(imgs) if "body" in params and not images: images.append(params["body"]) # HCaptcha queries format: list of base64 strings if "queries" in params and isinstance(params["queries"], list): images.extend(params["queries"]) return images @staticmethod def _prepare_image(b64_data: str) -> str: """Ensure image is properly formatted as a data URL.""" if b64_data.startswith("data:image"): return b64_data try: img_bytes = base64.b64decode(b64_data) img = Image.open(io.BytesIO(img_bytes)) fmt = img.format or "PNG" mime = f"image/{fmt.lower()}" return f"data:{mime};base64,{b64_data}" except Exception: return f"data:image/png;base64,{b64_data}" async def _classify( self, system_prompt: str, question: str, images: list[str] ) -> dict[str, Any]: content: list[dict[str, Any]] = [] for img_b64 in images: data_url = self._prepare_image(img_b64) content.append({ "type": "image_url", "image_url": {"url": data_url, "detail": "high"}, }) user_text = question if question else "Classify this captcha image." content.append({"type": "text", "text": user_text}) last_error: Exception | None = None for attempt in range(self._config.captcha_retries): try: response = await self._client.chat.completions.create( model=self._config.captcha_multimodal_model, temperature=0.05, max_tokens=512, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": content}, ], ) raw = response.choices[0].message.content or "" return self._parse_json(raw) except Exception as exc: last_error = exc log.warning("Classification attempt %d failed: %s", attempt + 1, exc) raise RuntimeError( f"Classification failed after {self._config.captcha_retries} attempts: {last_error}" ) @staticmethod def _parse_json(text: str) -> dict[str, Any]: match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) cleaned = match.group(1) if match else text.strip() data = json.loads(cleaned) if not isinstance(data, dict): raise ValueError(f"Expected JSON object, got {type(data).__name__}") return data