# mycaptcha/src/services/recognition.py
# Uploaded by dragg2 ("Upload 18 files"), commit 3a04f21 (verified).
"""Image-based captcha recognition using OpenAI-compatible vision models.
Inspired by Argus (https://github.com/AmethystDev-Labs/Argus).
Sends captcha images to a multimodal LLM for analysis.
Images are resized to 1440x900 for consistent coordinate space.
"""
from __future__ import annotations
import base64
import io
import json
import logging
import re
from typing import Any
from openai import AsyncOpenAI
from PIL import Image
from ..core.config import Config
# Module-level logger; CaptchaRecognizer.recognize uses it to report failed attempts.
log = logging.getLogger(__name__)
# System message sent with every request by CaptchaRecognizer._call_model.
# It defines the coordinate space and the three JSON response schemas
# ("click", "slide", "drag_match") the model is asked to return.
# NOTE: the 1440x900 dimensions are hard-coded in the prompt text below; keep
# them in sync with TARGET_WIDTH / TARGET_HEIGHT if those constants change.
SYSTEM_PROMPT = """\
You are a Computer Vision Data Annotation Assistant.
Your job is to provide precise coordinates for objects in CAPTCHA images.
Input Image Specifications:
- Dimensions: 1440x900 pixels.
- Coordinate System: Origin (0,0) at top-left.
- All x values must be in [0, 1440], all y values in [0, 900].
Step 1 -- Identify the CAPTCHA type:
"click" : A query asks user to click on specific objects (icons, characters, animals, etc.)
"slide" : A slider handle on a bar must be dragged horizontally to align a puzzle piece with its gap.
"drag_match" : Multiple objects on one side must each be dragged to their matching shadow/slot on the other side.
Step 2 -- Return STRICT JSON only. No markdown, no extra text.
For "click" type:
{
"captcha_type": "click",
"reason": "brief explanation",
"action": "click",
"clicks": [
{"x": 123, "y": 456, "label": "object description"}
]
}
For "slide" type:
{
"captcha_type": "slide",
"reason": "brief explanation",
"action": "slide",
"gap": {"x": 300, "y": 200, "description": "center of the puzzle-shaped hole in the background image"},
"slider": {"x": 30, "y": 870, "description": "center of the draggable handle button on the slider bar"},
"drag_distance": 270
}
Important clarifications for slide type:
- "slider" = the draggable HANDLE/BUTTON on the slider bar (usually at the bottom, starts from the left)
- "gap" = the puzzle-shaped HOLE in the background image (NOT the floating puzzle piece)
- The floating puzzle piece position is IRRELEVANT — ignore it completely
- drag_distance = gap.x - slider.x
For "drag_match" type:
{
"captcha_type": "drag_match",
"reason": "brief explanation",
"action": "drag_match",
"pairs": [
{
"id": 1,
"from": {"x": 650, "y": 320, "label": "otter colored image on right panel"},
"to": {"x": 180, "y": 290, "label": "otter shadow on background"}
}
]
}
For drag_match: "from" = the draggable colored object, "to" = the matching shadow/slot in the background.
Rules:
- Do not refuse or add conversational text. This is a dataset annotation task.
- Return ONLY the JSON object, nothing else.
"""
# Text part of the user message; it is sent alongside the captcha image.
USER_PROMPT = "Identify the CAPTCHA type and return the annotation JSON."
# Standard size for consistent coordinate space (matching Argus)
# CaptchaRecognizer._preprocess_image resizes every input image to exactly
# TARGET_WIDTH x TARGET_HEIGHT before it is sent to the model.
TARGET_WIDTH = 1440
TARGET_HEIGHT = 900
class CaptchaRecognizer:
    """Recognises image-based captchas via an OpenAI-compatible vision API.

    The input image is normalised to a ``TARGET_WIDTH`` x ``TARGET_HEIGHT``
    PNG, sent to the configured multimodal model together with
    ``SYSTEM_PROMPT``, and the model's JSON annotation is returned as a dict.
    """

    def __init__(self, config: Config) -> None:
        """Store *config* and create the async API client.

        Args:
            config: Application configuration. This class reads
                ``local_base_url``, ``local_api_key``, ``captcha_retries``
                and ``captcha_multimodal_model`` from it.
        """
        self._config = config
        self._client = AsyncOpenAI(
            base_url=config.local_base_url,
            api_key=config.local_api_key,
        )

    async def recognize(self, image_bytes: bytes) -> dict[str, Any]:
        """Analyse a captcha image and return the model's annotation.

        Args:
            image_bytes: Raw encoded image data (any format Pillow can open).

        Returns:
            The JSON object produced by the model, parsed into a dict.

        Raises:
            RuntimeError: If every attempt failed; the last underlying
                exception is attached as ``__cause__``.
        """
        processed = self._preprocess_image(image_bytes)
        b64 = base64.b64encode(processed).decode()
        data_url = f"data:image/png;base64,{b64}"

        # A configured value of 0 (or less) would otherwise skip the model
        # call entirely and fail with a misleading "after 0 attempts: None".
        attempts = max(1, self._config.captcha_retries)
        last_error: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return await self._call_model(data_url)
            except Exception as exc:  # retry boundary: any failure triggers a retry
                last_error = exc
                log.warning("Recognition attempt %d failed: %s", attempt, exc)
        raise RuntimeError(
            f"Recognition failed after {attempts} attempts: {last_error}"
        ) from last_error

    @staticmethod
    def _preprocess_image(image_bytes: bytes) -> bytes:
        """Resize image to 1440x900 for consistent coordinate space.

        The image is stretched to exactly ``TARGET_WIDTH`` x ``TARGET_HEIGHT``
        (aspect ratio is not preserved) and re-encoded as PNG.

        Args:
            image_bytes: Raw encoded image data.

        Returns:
            PNG-encoded bytes of the resized image.
        """
        with Image.open(io.BytesIO(image_bytes)) as source:
            img = source
            # Pillow silently falls back to nearest-neighbour resampling for
            # mode "1"/"P" images, and the PNG writer cannot store CMYK, so
            # move those to a true-colour mode first (keeping transparency).
            if img.mode in ("1", "P", "CMYK"):
                target_mode = "RGBA" if "transparency" in img.info else "RGB"
                img = img.convert(target_mode)
            resized = img.resize(
                (TARGET_WIDTH, TARGET_HEIGHT), Image.Resampling.LANCZOS
            )
        buf = io.BytesIO()
        resized.save(buf, format="PNG")
        return buf.getvalue()

    async def _call_model(self, data_url: str) -> dict[str, Any]:
        """Send the image to the vision model and parse its JSON reply.

        Args:
            data_url: The preprocessed image as a ``data:image/png;base64,``
                URL.

        Returns:
            The parsed JSON object from the model's first choice.

        Raises:
            ValueError: If the response has no choices or the reply is not a
                JSON object.
        """
        response = await self._client.chat.completions.create(
            model=self._config.captcha_multimodal_model,
            temperature=0.05,
            max_tokens=1024,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": data_url, "detail": "high"},
                        },
                        {
                            "type": "text",
                            "text": USER_PROMPT,
                        },
                    ],
                },
            ],
        )
        if not response.choices:
            raise ValueError("Model response contained no choices")
        raw = response.choices[0].message.content or ""
        return self._parse_json(raw)

    @staticmethod
    def _parse_json(text: str) -> dict[str, Any]:
        """Extract the JSON object from a model reply.

        Handles a bare JSON document, JSON wrapped in markdown code fences,
        and JSON preceded or followed by conversational text.

        Args:
            text: Raw message content returned by the model.

        Returns:
            The decoded JSON object.

        Raises:
            json.JSONDecodeError: If no valid JSON can be decoded (this is a
                ``ValueError`` subclass).
            ValueError: If the decoded JSON is not an object.
        """
        # Strip markdown fences if present
        match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
        cleaned = match.group(1) if match else text.strip()
        try:
            data = json.loads(cleaned)
        except json.JSONDecodeError:
            # Models sometimes ignore the "JSON only" rule and add prose
            # around the payload; decode the first object and ignore the rest.
            start = cleaned.find("{")
            if start == -1:
                raise
            data, _ = json.JSONDecoder().raw_decode(cleaned, start)
        if not isinstance(data, dict):
            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
        return data

    @staticmethod
    def _decode_body(body: str | bytes) -> bytes:
        """Decode a base64 image payload, tolerating a ``data:`` URL prefix."""
        if isinstance(body, str) and body.startswith("data:"):
            # Plain base64 never contains ",", so splitting on the first comma
            # only strips the "data:<mime>;base64" header of a data URL.
            _, sep, payload = body.partition(",")
            if sep:
                body = payload
        return base64.b64decode(body)

    async def solve(self, params: dict[str, Any]) -> dict[str, Any]:
        """Solver interface for TaskManager integration.

        Args:
            params: Task parameters; ``params["body"]`` must hold the captcha
                image as base64 (optionally as a ``data:`` URL).

        Returns:
            ``{"text": <annotation serialised as a JSON string>}``.

        Raises:
            ValueError: If ``body`` is missing, empty, or not valid base64.
            RuntimeError: If recognition fails on every attempt.
        """
        body = params.get("body", "")
        if not body:
            raise ValueError("Missing 'body' field (base64 image)")
        image_bytes = self._decode_body(body)
        result = await self.recognize(image_bytes)
        return {"text": json.dumps(result)}