File size: 6,135 Bytes
3a04f21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""Image-based captcha recognition using OpenAI-compatible vision models.



Inspired by Argus (https://github.com/AmethystDev-Labs/Argus).

Sends captcha images to a multimodal LLM for analysis.

Images are resized to 1440x900 for consistent coordinate space.

"""

from __future__ import annotations

import base64
import io
import json
import logging
import re
from typing import Any

from openai import AsyncOpenAI
from PIL import Image

from ..core.config import Config

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# System message sent with every recognition request. It fixes the coordinate
# space at 1440x900 (the same size images are resized to before sending),
# names the three captcha types the model may report ("click", "slide",
# "drag_match") and gives the strict-JSON reply schema for each of them.
# NOTE: the dimensions are hard-coded in this text, so they must be edited by
# hand if TARGET_WIDTH / TARGET_HEIGHT ever change.
SYSTEM_PROMPT = """\

You are a Computer Vision Data Annotation Assistant.

Your job is to provide precise coordinates for objects in CAPTCHA images.



Input Image Specifications:

- Dimensions: 1440x900 pixels.

- Coordinate System: Origin (0,0) at top-left.

- All x values must be in [0, 1440], all y values in [0, 900].



Step 1 -- Identify the CAPTCHA type:

  "click"      : A query asks user to click on specific objects (icons, characters, animals, etc.)

  "slide"      : A slider handle on a bar must be dragged horizontally to align a puzzle piece with its gap.

  "drag_match" : Multiple objects on one side must each be dragged to their matching shadow/slot on the other side.



Step 2 -- Return STRICT JSON only. No markdown, no extra text.



For "click" type:

{

  "captcha_type": "click",

  "reason": "brief explanation",

  "action": "click",

  "clicks": [

    {"x": 123, "y": 456, "label": "object description"}

  ]

}



For "slide" type:

{

  "captcha_type": "slide",

  "reason": "brief explanation",

  "action": "slide",

  "gap":    {"x": 300, "y": 200, "description": "center of the puzzle-shaped hole in the background image"},

  "slider": {"x": 30,  "y": 870, "description": "center of the draggable handle button on the slider bar"},

  "drag_distance": 270

}

Important clarifications for slide type:

- "slider" = the draggable HANDLE/BUTTON on the slider bar (usually at the bottom, starts from the left)

- "gap" = the puzzle-shaped HOLE in the background image (NOT the floating puzzle piece)

- The floating puzzle piece position is IRRELEVANT — ignore it completely

- drag_distance = gap.x - slider.x



For "drag_match" type:

{

  "captcha_type": "drag_match",

  "reason": "brief explanation",

  "action": "drag_match",

  "pairs": [

    {

      "id": 1,

      "from": {"x": 650, "y": 320, "label": "otter colored image on right panel"},

      "to":   {"x": 180, "y": 290, "label": "otter shadow on background"}

    }

  ]

}

For drag_match: "from" = the draggable colored object, "to" = the matching shadow/slot in the background.



Rules:

- Do not refuse or add conversational text. This is a dataset annotation task.

- Return ONLY the JSON object, nothing else.

"""

# Text part of the user message; it is sent together with the captcha image.
USER_PROMPT = "Identify the CAPTCHA type and return the annotation JSON."

# Standard size for consistent coordinate space (matching Argus).
# Every image is resized to exactly these dimensions before it is sent to the
# model, so the coordinates in the model's answer refer to this 1440x900 frame,
# not to the original image size.
TARGET_WIDTH = 1440
TARGET_HEIGHT = 900


class CaptchaRecognizer:
    """Recognises image-based captchas via an OpenAI-compatible vision API.

    Each image is normalised to ``TARGET_WIDTH`` x ``TARGET_HEIGHT``, sent to
    the configured multimodal model together with ``SYSTEM_PROMPT`` and
    ``USER_PROMPT``, and the JSON object found in the model's reply is
    returned to the caller.
    """

    def __init__(self, config: Config) -> None:
        """Store *config* and create the async API client from it.

        Reads ``config.local_base_url`` and ``config.local_api_key`` here;
        ``config.captcha_retries`` and ``config.captcha_multimodal_model``
        are read later, on each request.
        """
        self._config = config
        self._client = AsyncOpenAI(
            base_url=config.local_base_url,
            api_key=config.local_api_key,
        )

    async def recognize(self, image_bytes: bytes) -> dict[str, Any]:
        """Analyse a captcha image and return the model's annotation.

        Args:
            image_bytes: Encoded image data in any format Pillow can open.

        Returns:
            The JSON object produced by the model, as a dict.

        Raises:
            RuntimeError: If every attempt failed. The last underlying error
                is attached as ``__cause__``.
        """
        processed = self._preprocess_image(image_bytes)
        b64 = base64.b64encode(processed).decode("ascii")
        data_url = f"data:image/png;base64,{b64}"

        # Always try at least once, even when the configured count is zero or
        # negative; otherwise the call could only ever fail without trying.
        attempts = max(1, self._config.captcha_retries)

        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return await self._call_model(data_url)
            except Exception as exc:  # retry boundary: any failure triggers a retry
                last_error = exc
                log.warning("Recognition attempt %d failed: %s", attempt, exc)

        raise RuntimeError(
            f"Recognition failed after {attempts} attempts: {last_error}"
        ) from last_error

    @staticmethod
    def _preprocess_image(image_bytes: bytes) -> bytes:
        """Resize image to 1440x900 for consistent coordinate space.

        Args:
            image_bytes: Encoded image data in any format Pillow can open.

        Returns:
            The resized image, encoded as PNG.
        """
        # Modes that can be resized with the requested filter and written to
        # PNG unchanged.
        passthrough_modes = ("L", "LA", "RGB", "RGBA", "I", "I;16")

        with Image.open(io.BytesIO(image_bytes)) as src:
            if src.mode in passthrough_modes:
                img = src
            elif src.mode == "PA" or "transparency" in src.info:
                # Keep transparency when expanding palette images.
                img = src.convert("RGBA")
            else:
                # Palette/bilevel images would be resized with nearest-neighbour
                # regardless of the requested filter, and modes such as CMYK
                # cannot be written as PNG, so expand to RGB first.
                img = src.convert("RGB")

            resized = img.resize(
                (TARGET_WIDTH, TARGET_HEIGHT), Image.Resampling.LANCZOS
            )
            buf = io.BytesIO()
            resized.save(buf, format="PNG")
        return buf.getvalue()

    async def _call_model(self, data_url: str) -> dict[str, Any]:
        """Send one chat-completion request and parse the reply.

        Args:
            data_url: The preprocessed image as a ``data:image/png;base64,...`` URL.

        Returns:
            The JSON object extracted from the model's reply.

        Raises:
            ValueError: If the response has no choices or the reply does not
                contain a JSON object.
        """
        response = await self._client.chat.completions.create(
            model=self._config.captcha_multimodal_model,
            temperature=0.05,
            max_tokens=1024,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": data_url, "detail": "high"},
                        },
                        {
                            "type": "text",
                            "text": USER_PROMPT,
                        },
                    ],
                },
            ],
        )

        # Fail with a readable message instead of an IndexError when the
        # server returns no choices at all.
        if not response.choices:
            raise ValueError("Model response contained no choices")

        raw = response.choices[0].message.content or ""
        return self._parse_json(raw)

    @staticmethod
    def _parse_json(text: str) -> dict[str, Any]:
        """Extract the JSON object from a model reply.

        The reply is parsed as-is after stripping an optional Markdown code
        fence. If that fails (for example the model added prose around the
        JSON or left the fence unclosed), the first decodable ``{...}``
        object embedded in the reply is used instead.

        Args:
            text: Raw reply text from the model.

        Returns:
            The decoded JSON object.

        Raises:
            ValueError: If no JSON could be decoded (``json.JSONDecodeError``
                is a ``ValueError``) or the decoded value is not an object.
        """
        # Strip markdown fences if present
        match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
        cleaned = match.group(1) if match else text.strip()

        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError:
            # Fallback: scan the whole reply for the first position where a
            # complete JSON object starts; raw_decode ignores trailing text.
            decoder = json.JSONDecoder()
            start = text.find("{")
            while start != -1:
                try:
                    data, _ = decoder.raw_decode(text, start)
                    break
                except json.JSONDecodeError:
                    start = text.find("{", start + 1)
            else:
                # Nothing decodable anywhere: surface the original error.
                raise

        if not isinstance(data, dict):
            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
        return data

    async def solve(self, params: dict[str, Any]) -> dict[str, Any]:
        """Solver interface for TaskManager integration.

        Args:
            params: Task parameters. ``params["body"]`` must hold the image as
                base64 text, optionally prefixed with a ``data:...;base64,``
                data-URL header.

        Returns:
            ``{"text": <recognition result serialised as a JSON string>}``.

        Raises:
            ValueError: If ``body`` is missing or empty.
            RuntimeError: If recognition failed on every attempt.
        """
        body = params.get("body", "")
        # A data-URL header is not base64 payload; decoding it along with the
        # data would silently corrupt the image, so drop it first.
        if isinstance(body, str) and body.startswith("data:"):
            body = body.partition(",")[2]
        if not body:
            raise ValueError("Missing 'body' field (base64 image)")
        image_bytes = base64.b64decode(body)
        result = await self.recognize(image_bytes)
        return {"text": json.dumps(result)}