Spaces:
Paused
Paused
| """Image classification solvers for various captcha types. | |
| Supports HCaptchaClassification, ReCaptchaV2Classification, | |
| FunCaptchaClassification, and AwsClassification task types. | |
| All classification tasks send images + question text to an OpenAI-compatible | |
| vision model for analysis and return structured coordinate/index results. | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import io | |
| import json | |
| import logging | |
| import re | |
| from typing import Any | |
| from openai import AsyncOpenAI | |
| from PIL import Image | |
| from ..core.config import Config | |
| log = logging.getLogger(__name__) | |
| HCAPTCHA_SYSTEM_PROMPT = """\ | |
| You are an image classification assistant for HCaptcha challenges. | |
| Given a question and one or more base64-encoded images, determine which images match the question. | |
| Return STRICT JSON only. No markdown, no extra text. | |
| For single-image questions (is this image X?): | |
| {"answer": true} or {"answer": false} | |
| For multi-image grid questions (select all images containing X): | |
| {"answer": [0, 2, 5]} | |
| where numbers are 0-indexed positions of matching images. | |
| Rules: | |
| - Return ONLY the JSON object, nothing else. | |
| - Be precise with your classification. | |
| """ | |
| RECAPTCHA_V2_SYSTEM_PROMPT = """\ | |
| You are an image classification assistant for reCAPTCHA v2 challenges. | |
| Given a question and a grid image (3x3 or 4x4), identify which cells match the question. | |
| The image cells are numbered 0-8 (3x3) or 0-15 (4x4), left-to-right, top-to-bottom. | |
| Return STRICT JSON only: | |
| {"objects": [0, 3, 6]} | |
| where numbers are 0-indexed positions of matching cells. | |
| Rules: | |
| - Return ONLY the JSON object, nothing else. | |
| - If no cells match, return {"objects": []}. | |
| """ | |
| FUNCAPTCHA_SYSTEM_PROMPT = """\ | |
| You are an image classification assistant for FunCaptcha challenges. | |
| Given a question and a grid image (typically 2x3 = 6 cells), identify which cell | |
| is the correct answer. | |
| Cells are numbered 0-5, left-to-right, top-to-bottom. | |
| Return STRICT JSON only: | |
| {"objects": [3]} | |
| where the number is the 0-indexed position of the correct cell. | |
| Rules: | |
| - Return ONLY the JSON object, nothing else. | |
| - Usually only one cell is correct. | |
| """ | |
| AWS_SYSTEM_PROMPT = """\ | |
| You are an image classification assistant for AWS CAPTCHA challenges. | |
| Given a question and one or more images, identify the correct answer. | |
| Return STRICT JSON only: | |
| {"objects": [1]} | |
| where the number is the 0-indexed position of the matching image. | |
| Rules: | |
| - Return ONLY the JSON object, nothing else. | |
| """ | |
| class ClassificationSolver: | |
| """Solves image classification captchas using a vision model.""" | |
| def __init__(self, config: Config) -> None: | |
| self._config = config | |
| self._client = AsyncOpenAI( | |
| base_url=config.local_base_url, | |
| api_key=config.local_api_key, | |
| ) | |
| async def solve(self, params: dict[str, Any]) -> dict[str, Any]: | |
| task_type = params.get("type", "") | |
| system_prompt = self._get_system_prompt(task_type) | |
| question = params.get("question", "") or params.get("queries", "") | |
| # Handle different image field names across task types | |
| images = self._extract_images(params) | |
| if not images: | |
| raise ValueError("No image data provided") | |
| result = await self._classify(system_prompt, question, images) | |
| return result | |
| def _get_system_prompt(task_type: str) -> str: | |
| prompts = { | |
| "HCaptchaClassification": HCAPTCHA_SYSTEM_PROMPT, | |
| "ReCaptchaV2Classification": RECAPTCHA_V2_SYSTEM_PROMPT, | |
| "FunCaptchaClassification": FUNCAPTCHA_SYSTEM_PROMPT, | |
| "AwsClassification": AWS_SYSTEM_PROMPT, | |
| } | |
| return prompts.get(task_type, RECAPTCHA_V2_SYSTEM_PROMPT) | |
| def _extract_images(params: dict[str, Any]) -> list[str]: | |
| """Extract base64 image(s) from various param formats.""" | |
| images: list[str] = [] | |
| if "image" in params: | |
| images.append(params["image"]) | |
| if "images" in params: | |
| imgs = params["images"] | |
| if isinstance(imgs, list): | |
| images.extend(imgs) | |
| elif isinstance(imgs, str): | |
| images.append(imgs) | |
| if "body" in params and not images: | |
| images.append(params["body"]) | |
| # HCaptcha queries format: list of base64 strings | |
| if "queries" in params and isinstance(params["queries"], list): | |
| images.extend(params["queries"]) | |
| return images | |
| def _prepare_image(b64_data: str) -> str: | |
| """Ensure image is properly formatted as a data URL.""" | |
| if b64_data.startswith("data:image"): | |
| return b64_data | |
| try: | |
| img_bytes = base64.b64decode(b64_data) | |
| img = Image.open(io.BytesIO(img_bytes)) | |
| fmt = img.format or "PNG" | |
| mime = f"image/{fmt.lower()}" | |
| return f"data:{mime};base64,{b64_data}" | |
| except Exception: | |
| return f"data:image/png;base64,{b64_data}" | |
| async def _classify( | |
| self, system_prompt: str, question: str, images: list[str] | |
| ) -> dict[str, Any]: | |
| content: list[dict[str, Any]] = [] | |
| for img_b64 in images: | |
| data_url = self._prepare_image(img_b64) | |
| content.append({ | |
| "type": "image_url", | |
| "image_url": {"url": data_url, "detail": "high"}, | |
| }) | |
| user_text = question if question else "Classify this captcha image." | |
| content.append({"type": "text", "text": user_text}) | |
| last_error: Exception | None = None | |
| for attempt in range(self._config.captcha_retries): | |
| try: | |
| response = await self._client.chat.completions.create( | |
| model=self._config.captcha_multimodal_model, | |
| temperature=0.05, | |
| max_tokens=512, | |
| messages=[ | |
| {"role": "system", "content": system_prompt}, | |
| {"role": "user", "content": content}, | |
| ], | |
| ) | |
| raw = response.choices[0].message.content or "" | |
| return self._parse_json(raw) | |
| except Exception as exc: | |
| last_error = exc | |
| log.warning("Classification attempt %d failed: %s", attempt + 1, exc) | |
| raise RuntimeError( | |
| f"Classification failed after {self._config.captcha_retries} attempts: {last_error}" | |
| ) | |
| def _parse_json(text: str) -> dict[str, Any]: | |
| match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) | |
| cleaned = match.group(1) if match else text.strip() | |
| data = json.loads(cleaned) | |
| if not isinstance(data, dict): | |
| raise ValueError(f"Expected JSON object, got {type(data).__name__}") | |
| return data | |