ohmycaptcha

Paused

App Files Files Community

ohmycaptcha / src /services /classification.py

zzdccww

🚀 Deploy ohmycaptcha via Automation Tool

faf0e40 verified about 2 months ago

raw

history blame contribute delete

6.83 kB

	"""Image classification solvers for various captcha types.

	Supports HCaptchaClassification, ReCaptchaV2Classification,
	FunCaptchaClassification, and AwsClassification task types.

	All classification tasks send images + question text to an OpenAI-compatible
	vision model for analysis and return structured coordinate/index results.
	"""

	from __future__ import annotations

	import base64
	import io
	import json
	import logging
	import re
	from typing import Any

	from openai import AsyncOpenAI
	from PIL import Image

	from ..core.config import Config

	# Module-level logger; ClassificationSolver._classify uses it to report
	# failed classification attempts.
	log = logging.getLogger(__name__)

	# System prompt for "HCaptchaClassification" tasks. Asks the model for
	# {"answer": true|false} on single-image questions, or {"answer": [indices]}
	# (0-indexed) on multi-image grid questions.
	# The backslash right after the opening quotes suppresses the leading newline.
	HCAPTCHA_SYSTEM_PROMPT = """\
	You are an image classification assistant for HCaptcha challenges.
	Given a question and one or more base64-encoded images, determine which images match the question.

	Return STRICT JSON only. No markdown, no extra text.

	For single-image questions (is this image X?):
	{"answer": true} or {"answer": false}

	For multi-image grid questions (select all images containing X):
	{"answer": [0, 2, 5]}
	where numbers are 0-indexed positions of matching images.

	Rules:
	- Return ONLY the JSON object, nothing else.
	- Be precise with your classification.
	"""

	# System prompt for "ReCaptchaV2Classification" tasks; it is also the fallback
	# that ClassificationSolver._get_system_prompt returns for unrecognized task
	# types. Asks for {"objects": [indices]} over a 3x3 or 4x4 grid, 0-indexed.
	RECAPTCHA_V2_SYSTEM_PROMPT = """\
	You are an image classification assistant for reCAPTCHA v2 challenges.
	Given a question and a grid image (3x3 or 4x4), identify which cells match the question.

	The image cells are numbered 0-8 (3x3) or 0-15 (4x4), left-to-right, top-to-bottom.

	Return STRICT JSON only:
	{"objects": [0, 3, 6]}
	where numbers are 0-indexed positions of matching cells.

	Rules:
	- Return ONLY the JSON object, nothing else.
	- If no cells match, return {"objects": []}.
	"""

	# System prompt for "FunCaptchaClassification" tasks. Asks for
	# {"objects": [index]} naming the correct cell of a (typically 2x3) grid.
	FUNCAPTCHA_SYSTEM_PROMPT = """\
	You are an image classification assistant for FunCaptcha challenges.
	Given a question and a grid image (typically 2x3 = 6 cells), identify which cell
	is the correct answer.

	Cells are numbered 0-5, left-to-right, top-to-bottom.

	Return STRICT JSON only:
	{"objects": [3]}
	where the number is the 0-indexed position of the correct cell.

	Rules:
	- Return ONLY the JSON object, nothing else.
	- Usually only one cell is correct.
	"""

	# System prompt for "AwsClassification" tasks. Asks for {"objects": [index]}
	# naming the 0-indexed position of the matching image.
	AWS_SYSTEM_PROMPT = """\
	You are an image classification assistant for AWS CAPTCHA challenges.
	Given a question and one or more images, identify the correct answer.

	Return STRICT JSON only:
	{"objects": [1]}
	where the number is the 0-indexed position of the matching image.

	Rules:
	- Return ONLY the JSON object, nothing else.
	"""


	class ClassificationSolver:
	    """Solve image-classification captchas with an OpenAI-compatible vision model.

	    The solver picks a system prompt from the task ``type``, gathers the
	    base64 images from the task parameters, sends them together with the
	    question text to the configured model, and returns the JSON object the
	    model replies with.
	    """

	    def __init__(self, config: Config) -> None:
	        """Store *config* and build the async client from its endpoint settings.

	        Args:
	            config: Supplies ``local_base_url`` and ``local_api_key`` for the
	                client, plus ``captcha_retries`` and
	                ``captcha_multimodal_model`` used when classifying.
	        """
	        self._config = config
	        self._client = AsyncOpenAI(
	            base_url=config.local_base_url,
	            api_key=config.local_api_key,
	        )

	    async def solve(self, params: dict[str, Any]) -> dict[str, Any]:
	        """Classify the captcha described by *params*.

	        Args:
	            params: Task parameters. ``type`` selects the system prompt;
	                ``question`` (or a string-valued ``queries``) is the prompt
	                text; images are read from ``image``, ``images``, ``body``
	                or a list-valued ``queries``.

	        Returns:
	            The JSON object parsed from the model's reply.

	        Raises:
	            ValueError: If *params* carries no image data.
	            RuntimeError: If every classification attempt fails.
	        """
	        task_type = params.get("type", "")
	        system_prompt = self._get_system_prompt(task_type)
	        question = self._extract_question(params)

	        # Image field names differ across task types.
	        images = self._extract_images(params)
	        if not images:
	            raise ValueError("No image data provided")

	        return await self._classify(system_prompt, question, images)

	    @staticmethod
	    def _extract_question(params: dict[str, Any]) -> str:
	        """Return the question text from *params*, or ``""`` when there is none.

	        ``queries`` doubles as the image list in the HCaptcha format, so it is
	        used as the question only when it is a string; otherwise the image
	        payload would end up being sent to the model as the text prompt.
	        """
	        for key in ("question", "queries"):
	            value = params.get(key)
	            if isinstance(value, str) and value:
	                return value
	        return ""

	    @staticmethod
	    def _get_system_prompt(task_type: str) -> str:
	        """Return the system prompt for *task_type*.

	        Unrecognized task types fall back to the reCAPTCHA v2 prompt.
	        """
	        prompts = {
	            "HCaptchaClassification": HCAPTCHA_SYSTEM_PROMPT,
	            "ReCaptchaV2Classification": RECAPTCHA_V2_SYSTEM_PROMPT,
	            "FunCaptchaClassification": FUNCAPTCHA_SYSTEM_PROMPT,
	            "AwsClassification": AWS_SYSTEM_PROMPT,
	        }
	        return prompts.get(task_type, RECAPTCHA_V2_SYSTEM_PROMPT)

	    @staticmethod
	    def _extract_images(params: dict[str, Any]) -> list[str]:
	        """Collect the base64 image(s) from the supported parameter fields.

	        Order of the result: ``image``, then ``images`` (list or single
	        string), then ``body`` (only when nothing was found so far), then a
	        list-valued ``queries``.
	        """
	        images: list[str] = []

	        if "image" in params:
	            images.append(params["image"])

	        if "images" in params:
	            imgs = params["images"]
	            if isinstance(imgs, list):
	                images.extend(imgs)
	            elif isinstance(imgs, str):
	                images.append(imgs)

	        # ``body`` is only a fallback for when no other image field is set.
	        if "body" in params and not images:
	            images.append(params["body"])

	        # HCaptcha queries format: list of base64 strings.
	        if "queries" in params and isinstance(params["queries"], list):
	            images.extend(params["queries"])

	        return images

	    @staticmethod
	    def _prepare_image(b64_data: str) -> str:
	        """Return *b64_data* as a ``data:`` URL.

	        Input that already starts with ``data:image`` is returned unchanged.
	        Otherwise the payload is decoded and opened with PIL to read the
	        image format for the MIME type.
	        """
	        if b64_data.startswith("data:image"):
	            return b64_data
	        try:
	            img_bytes = base64.b64decode(b64_data)
	            img = Image.open(io.BytesIO(img_bytes))
	            fmt = img.format or "PNG"
	            mime = f"image/{fmt.lower()}"
	            return f"data:{mime};base64,{b64_data}"
	        except Exception:
	            # Deliberate best effort: if the format cannot be sniffed, label
	            # the payload as PNG and pass it on unchanged.
	            return f"data:image/png;base64,{b64_data}"

	    async def _classify(
	        self, system_prompt: str, question: str, images: list[str]
	    ) -> dict[str, Any]:
	        """Send the images and question to the model and return its JSON reply.

	        The request is attempted ``config.captcha_retries`` times. A reply
	        that cannot be parsed as a JSON object counts as a failed attempt,
	        the same as a transport error.

	        Raises:
	            RuntimeError: If no attempt succeeds; chained to the last error.
	        """
	        content: list[dict[str, Any]] = [
	            {
	                "type": "image_url",
	                "image_url": {"url": self._prepare_image(img_b64), "detail": "high"},
	            }
	            for img_b64 in images
	        ]
	        content.append(
	            {"type": "text", "text": question or "Classify this captcha image."}
	        )

	        last_error: Exception | None = None
	        for attempt in range(self._config.captcha_retries):
	            try:
	                response = await self._client.chat.completions.create(
	                    model=self._config.captcha_multimodal_model,
	                    temperature=0.05,
	                    max_tokens=512,
	                    messages=[
	                        {"role": "system", "content": system_prompt},
	                        {"role": "user", "content": content},
	                    ],
	                )
	                raw = response.choices[0].message.content or ""
	                # Parsing stays inside the try so a malformed reply is retried.
	                return self._parse_json(raw)
	            except Exception as exc:
	                last_error = exc
	                log.warning("Classification attempt %d failed: %s", attempt + 1, exc)

	        raise RuntimeError(
	            f"Classification failed after {self._config.captcha_retries} attempts: {last_error}"
	        ) from last_error

	    @staticmethod
	    def _parse_json(text: str) -> dict[str, Any]:
	        """Parse the model reply *text* into a JSON object.

	        A reply wrapped in a Markdown code fence (with or without a ``json``
	        tag) is unwrapped first; otherwise the stripped text is parsed as is.

	        Raises:
	            ValueError: If the text is not valid JSON (``json.JSONDecodeError``
	                is a ``ValueError``) or the JSON value is not an object.
	        """
	        # Non-greedy capture of everything between the fences; DOTALL lets
	        # the payload span several lines.
	        match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
	        cleaned = match.group(1) if match else text.strip()
	        data = json.loads(cleaned)
	        if not isinstance(data, dict):
	            raise ValueError(f"Expected JSON object, got {type(data).__name__}")
	        return data