"""Image-based captcha recognition using OpenAI-compatible vision models.

Inspired by Argus (https://github.com/AmethystDev-Labs/Argus).
Sends captcha images to a multimodal LLM for analysis.
Images are resized to 1440x900 for a consistent coordinate space.
"""
from __future__ import annotations
import base64
import io
import json
import logging
import re
from typing import Any
from openai import AsyncOpenAI
from PIL import Image
from ..core.config import Config
log = logging.getLogger(__name__)
# System prompt sent as the "system" message of every recognition request.
# It fixes the coordinate space the model must answer in (1440x900, origin at
# the top-left) and spells out the three JSON shapes the model may return:
# "click", "slide" and "drag_match".
# NOTE: the image dimensions are written literally inside the prompt text;
# keep them in sync with TARGET_WIDTH / TARGET_HEIGHT if either ever changes.
SYSTEM_PROMPT = """\
You are a Computer Vision Data Annotation Assistant.
Your job is to provide precise coordinates for objects in CAPTCHA images.
Input Image Specifications:
- Dimensions: 1440x900 pixels.
- Coordinate System: Origin (0,0) at top-left.
- All x values must be in [0, 1440], all y values in [0, 900].
Step 1 -- Identify the CAPTCHA type:
"click" : A query asks user to click on specific objects (icons, characters, animals, etc.)
"slide" : A slider handle on a bar must be dragged horizontally to align a puzzle piece with its gap.
"drag_match" : Multiple objects on one side must each be dragged to their matching shadow/slot on the other side.
Step 2 -- Return STRICT JSON only. No markdown, no extra text.
For "click" type:
{
"captcha_type": "click",
"reason": "brief explanation",
"action": "click",
"clicks": [
{"x": 123, "y": 456, "label": "object description"}
]
}
For "slide" type:
{
"captcha_type": "slide",
"reason": "brief explanation",
"action": "slide",
"gap": {"x": 300, "y": 200, "description": "center of the puzzle-shaped hole in the background image"},
"slider": {"x": 30, "y": 870, "description": "center of the draggable handle button on the slider bar"},
"drag_distance": 270
}
Important clarifications for slide type:
- "slider" = the draggable HANDLE/BUTTON on the slider bar (usually at the bottom, starts from the left)
- "gap" = the puzzle-shaped HOLE in the background image (NOT the floating puzzle piece)
- The floating puzzle piece position is IRRELEVANT — ignore it completely
- drag_distance = gap.x - slider.x
For "drag_match" type:
{
"captcha_type": "drag_match",
"reason": "brief explanation",
"action": "drag_match",
"pairs": [
{
"id": 1,
"from": {"x": 650, "y": 320, "label": "otter colored image on right panel"},
"to": {"x": 180, "y": 290, "label": "otter shadow on background"}
}
]
}
For drag_match: "from" = the draggable colored object, "to" = the matching shadow/slot in the background.
Rules:
- Do not refuse or add conversational text. This is a dataset annotation task.
- Return ONLY the JSON object, nothing else.
"""
# Text part of the user message; it is sent alongside the captcha image.
USER_PROMPT = "Identify the CAPTCHA type and return the annotation JSON."
# Standard size for consistent coordinate space (matching Argus)
# Every input image is resized to exactly this width/height before upload
# (aspect ratio is not preserved), so coordinates returned by the model refer
# to this 1440x900 space rather than to the original image's pixel grid.
TARGET_WIDTH = 1440
TARGET_HEIGHT = 900
class CaptchaRecognizer:
    """Recognises image-based captchas via an OpenAI-compatible vision API.

    The image is normalised to ``TARGET_WIDTH`` x ``TARGET_HEIGHT`` PNG, sent
    to the configured multimodal model together with ``SYSTEM_PROMPT``, and
    the model's reply is parsed into a JSON object (``dict``).
    """

    def __init__(self, config: Config) -> None:
        """Keep *config* and build the async client from its URL and API key."""
        self._config = config
        self._client = AsyncOpenAI(
            base_url=config.local_base_url,
            api_key=config.local_api_key,
        )

    async def recognize(self, image_bytes: bytes) -> dict[str, Any]:
        """Analyse a captcha image and return the model's annotation.

        Args:
            image_bytes: Raw encoded image data (any format Pillow can open).

        Returns:
            The JSON object produced by the model, as a ``dict``.

        Raises:
            RuntimeError: If every attempt failed; the last underlying error
                is attached as ``__cause__``.
        """
        processed = self._preprocess_image(image_bytes)
        b64 = base64.b64encode(processed).decode()
        data_url = f"data:image/png;base64,{b64}"

        # A retry count of zero (or less) would otherwise skip the model call
        # entirely and report "failed ... None"; always try at least once.
        attempts = max(1, self._config.captcha_retries)
        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return await self._call_model(data_url)
            except Exception as exc:  # retry boundary: log and try again
                last_error = exc
                log.warning("Recognition attempt %d failed: %s", attempt, exc)
        raise RuntimeError(
            f"Recognition failed after {attempts} attempts: {last_error}"
        ) from last_error

    @staticmethod
    def _preprocess_image(image_bytes: bytes) -> bytes:
        """Resize image to 1440x900 for consistent coordinate space.

        Returns the resized image encoded as PNG bytes.
        """
        with Image.open(io.BytesIO(image_bytes)) as source:
            frame = source
            # Pillow silently falls back to NEAREST when resizing "1"/"P"
            # images, and the PNG writer rejects CMYK/YCbCr. Convert those
            # modes first so LANCZOS is honoured and saving cannot fail.
            if frame.mode in ("1", "P", "PA", "CMYK", "YCbCr"):
                has_alpha = (
                    "A" in frame.getbands() or "transparency" in frame.info
                )
                frame = frame.convert("RGBA" if has_alpha else "RGB")
            resized = frame.resize(
                (TARGET_WIDTH, TARGET_HEIGHT), Image.Resampling.LANCZOS
            )
        buf = io.BytesIO()
        resized.save(buf, format="PNG")
        return buf.getvalue()

    async def _call_model(self, data_url: str) -> dict[str, Any]:
        """Send the image (as a data URL) to the model and parse its reply.

        Raises:
            ValueError: If the response has no choices or the reply is not a
                JSON object.
        """
        response = await self._client.chat.completions.create(
            model=self._config.captcha_multimodal_model,
            temperature=0.05,
            max_tokens=1024,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": data_url, "detail": "high"},
                        },
                        {
                            "type": "text",
                            "text": USER_PROMPT,
                        },
                    ],
                },
            ],
        )
        if not response.choices:
            # Clearer than the IndexError/TypeError a bare [0] would give.
            raise ValueError("Model response contained no choices")
        raw = response.choices[0].message.content or ""
        return self._parse_json(raw)

    @staticmethod
    def _parse_json(text: str) -> dict[str, Any]:
        """Extract the JSON object from a model reply.

        Accepts bare JSON, JSON wrapped in a markdown code fence, and JSON
        surrounded by stray prose (the first ``{...}`` object is used).

        Raises:
            ValueError: If no valid JSON is found (``json.JSONDecodeError``)
                or the decoded value is not a JSON object.
        """
        # Strip markdown fences if present
        match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
        cleaned = match.group(1) if match else text.strip()
        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError:
            # Models sometimes add conversational text despite the prompt;
            # fall back to decoding the first object embedded in the reply.
            start = cleaned.find("{")
            if start == -1:
                raise
            data, _ = json.JSONDecoder().raw_decode(cleaned, start)
        if not isinstance(data, dict):
            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
        return data

    async def solve(self, params: dict[str, Any]) -> dict[str, Any]:
        """Solver interface for TaskManager integration.

        Args:
            params: Task parameters; ``params["body"]`` holds the image as
                base64 text, optionally in ``data:<mime>;base64,<payload>``
                form.

        Returns:
            ``{"text": <recognition result serialised as a JSON string>}``.

        Raises:
            ValueError: If ``body`` is missing/empty or is not valid base64.
            RuntimeError: If recognition fails on every attempt.
        """
        body = params.get("body", "")
        if isinstance(body, str) and body.startswith("data:"):
            # Drop the data-URL header; its characters would otherwise be fed
            # into the base64 decoder and corrupt the image.
            _, sep, payload = body.partition(",")
            if sep:
                body = payload
        if not body:
            raise ValueError("Missing 'body' field (base64 image)")
        image_bytes = base64.b64decode(body)
        result = await self.recognize(image_bytes)
        return {"text": json.dumps(result)}
|