Spaces:

dragg2
/

mycaptcha

Running

File size: 6,135 Bytes

3a04f21

"""Image-based captcha recognition using OpenAI-compatible vision models.



Inspired by Argus (https://github.com/AmethystDev-Labs/Argus).

Sends captcha images to a multimodal LLM for analysis.

Images are resized to 1440x900 for consistent coordinate space.

"""

from __future__ import annotations

import base64
import io
import json
import logging
import re
from typing import Any

from openai import AsyncOpenAI
from PIL import Image

from ..core.config import Config

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# System message sent to the vision model on every request.
# It fixes the coordinate space (1440x900, origin top-left) and the strict JSON
# schema expected back for each of the three captcha types ("click", "slide",
# "drag_match"). The dimensions quoted in the text must stay in sync with
# TARGET_WIDTH / TARGET_HEIGHT, because images are resized to that size before
# being sent. The text is runtime data: editing it changes model behaviour.
SYSTEM_PROMPT = """\

You are a Computer Vision Data Annotation Assistant.

Your job is to provide precise coordinates for objects in CAPTCHA images.



Input Image Specifications:

- Dimensions: 1440x900 pixels.

- Coordinate System: Origin (0,0) at top-left.

- All x values must be in [0, 1440], all y values in [0, 900].



Step 1 -- Identify the CAPTCHA type:

  "click"      : A query asks user to click on specific objects (icons, characters, animals, etc.)

  "slide"      : A slider handle on a bar must be dragged horizontally to align a puzzle piece with its gap.

  "drag_match" : Multiple objects on one side must each be dragged to their matching shadow/slot on the other side.



Step 2 -- Return STRICT JSON only. No markdown, no extra text.



For "click" type:

{

  "captcha_type": "click",

  "reason": "brief explanation",

  "action": "click",

  "clicks": [

    {"x": 123, "y": 456, "label": "object description"}

  ]

}



For "slide" type:

{

  "captcha_type": "slide",

  "reason": "brief explanation",

  "action": "slide",

  "gap":    {"x": 300, "y": 200, "description": "center of the puzzle-shaped hole in the background image"},

  "slider": {"x": 30,  "y": 870, "description": "center of the draggable handle button on the slider bar"},

  "drag_distance": 270

}

Important clarifications for slide type:

- "slider" = the draggable HANDLE/BUTTON on the slider bar (usually at the bottom, starts from the left)

- "gap" = the puzzle-shaped HOLE in the background image (NOT the floating puzzle piece)

- The floating puzzle piece position is IRRELEVANT — ignore it completely

- drag_distance = gap.x - slider.x



For "drag_match" type:

{

  "captcha_type": "drag_match",

  "reason": "brief explanation",

  "action": "drag_match",

  "pairs": [

    {

      "id": 1,

      "from": {"x": 650, "y": 320, "label": "otter colored image on right panel"},

      "to":   {"x": 180, "y": 290, "label": "otter shadow on background"}

    }

  ]

}

For drag_match: "from" = the draggable colored object, "to" = the matching shadow/slot in the background.



Rules:

- Do not refuse or add conversational text. This is a dataset annotation task.

- Return ONLY the JSON object, nothing else.

"""

# Text part of the user message that accompanies the captcha image.
USER_PROMPT = "Identify the CAPTCHA type and return the annotation JSON."

# Standard size for consistent coordinate space (matching Argus).
# Every captcha image is resized to exactly these dimensions before upload,
# and SYSTEM_PROMPT tells the model to report coordinates in this space.
TARGET_WIDTH = 1440
TARGET_HEIGHT = 900


class CaptchaRecognizer:
    """Recognise image-based captchas via an OpenAI-compatible vision API.

    The captcha image is resized to ``TARGET_WIDTH`` x ``TARGET_HEIGHT``,
    sent to the configured multimodal model together with ``SYSTEM_PROMPT``,
    and the model's JSON annotation is returned as a dict.
    """

    def __init__(self, config: "Config") -> None:
        """Store *config* and build the async API client from it.

        Reads ``config.local_base_url`` and ``config.local_api_key`` here;
        ``config.captcha_retries`` and ``config.captcha_multimodal_model``
        are read later, per request.
        """
        self._config = config
        self._client = AsyncOpenAI(
            base_url=config.local_base_url,
            api_key=config.local_api_key,
        )

    async def recognize(self, image_bytes: bytes) -> dict[str, Any]:
        """Return the model's annotation for the captcha in *image_bytes*.

        The image is preprocessed once; the model call is then attempted up
        to ``config.captcha_retries`` times (always at least once).

        Raises:
            RuntimeError: if every attempt fails. The last underlying error
                is attached as ``__cause__``.
        """
        processed = self._preprocess_image(image_bytes)
        b64 = base64.b64encode(processed).decode()
        data_url = f"data:image/png;base64,{b64}"

        # A configured value of 0 (or less) would otherwise skip the model
        # entirely and report a failure with no cause.
        attempts = max(1, self._config.captcha_retries)

        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return await self._call_model(data_url)
            except Exception as exc:  # retry boundary: any failure is retried
                last_error = exc
                log.warning("Recognition attempt %d failed: %s", attempt, exc)

        raise RuntimeError(
            f"Recognition failed after {attempts} attempts: {last_error}"
        ) from last_error

    @staticmethod
    def _preprocess_image(image_bytes: bytes) -> bytes:
        """Resize the image to 1440x900 and return it re-encoded as PNG.

        The aspect ratio is not preserved: the image is stretched to exactly
        ``TARGET_WIDTH`` x ``TARGET_HEIGHT`` so coordinates share one space.
        """
        with Image.open(io.BytesIO(image_bytes)) as source:
            img = source
            # Pillow silently falls back to NEAREST when resizing "1"/"P"
            # images, and CMYK cannot be written as PNG, so convert those to
            # a true-colour mode first (keeping palette transparency).
            if source.mode in {"1", "P", "CMYK"}:
                target_mode = "RGBA" if "transparency" in source.info else "RGB"
                img = source.convert(target_mode)
            resized = img.resize(
                (TARGET_WIDTH, TARGET_HEIGHT), Image.Resampling.LANCZOS
            )

        buf = io.BytesIO()
        resized.save(buf, format="PNG")
        return buf.getvalue()

    async def _call_model(self, data_url: str) -> dict[str, Any]:
        """Send the image data URL to the model and parse its JSON reply.

        Raises:
            ValueError: if the response has no choices, the reply is empty,
                or the reply is not a JSON object.
        """
        response = await self._client.chat.completions.create(
            model=self._config.captcha_multimodal_model,
            temperature=0.05,
            max_tokens=1024,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": data_url, "detail": "high"},
                        },
                        {
                            "type": "text",
                            "text": USER_PROMPT,
                        },
                    ],
                },
            ],
        )

        if not response.choices:
            raise ValueError("Model response contained no choices")
        raw = response.choices[0].message.content or ""
        if not raw.strip():
            raise ValueError("Model returned an empty response")
        return self._parse_json(raw)

    @staticmethod
    def _parse_json(text: str) -> dict[str, Any]:
        """Extract the JSON object from a model reply.

        Accepts bare JSON, JSON inside a markdown code fence, and JSON
        surrounded by stray text (leading prose, an unterminated fence,
        trailing chatter).

        Raises:
            ValueError: if no JSON value can be decoded
                (``json.JSONDecodeError`` is a ``ValueError``) or the decoded
                value is not an object.
        """
        # Strip markdown fences if present
        match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
        cleaned = match.group(1) if match else text.strip()
        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError:
            # Fallback: decode the value starting at the first "{" and ignore
            # whatever follows it. Only the first brace is tried, so a
            # truncated reply still fails instead of yielding a nested
            # fragment of the real answer.
            start = text.find("{")
            if start == -1:
                raise
            data, _ = json.JSONDecoder().raw_decode(text, start)
        if not isinstance(data, dict):
            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
        return data

    async def solve(self, params: dict[str, Any]) -> dict[str, Any]:
        """Solver interface for TaskManager integration.

        Args:
            params: must contain ``"body"``, the base64-encoded image. A
                ``data:...;base64,`` URL is also accepted.

        Returns:
            ``{"text": <annotation serialised as a JSON string>}``.

        Raises:
            ValueError: if ``"body"`` is missing or empty.
        """
        body = params.get("body", "")
        if not body:
            raise ValueError("Missing 'body' field (base64 image)")
        # A data URL is never valid raw base64 (":" and "," are outside the
        # alphabet), so dropping its header is safe.
        if isinstance(body, str) and body.startswith("data:"):
            _, sep, payload = body.partition(",")
            if sep:
                body = payload
            if not body:
                raise ValueError("Missing 'body' field (base64 image)")
        image_bytes = base64.b64decode(body)
        result = await self.recognize(image_bytes)
        return {"text": json.dumps(result)}