""" image2text: PIL Image を OCR してテキストを返す。 HF版: VLM (Qwen2.5-VL) を使用。Google Vision API は使用しない。 """ import base64 import json from io import BytesIO from src.utils.tracer import customtracer def _vlm_ocr_from_pil(image, model: str = "Qwen/Qwen2.5-VL-7B-Instruct") -> str: """PIL Image → base64 → VLM OCR。""" from src.clients.llm_client import LLMClient from pydantic import BaseModel from typing import List buf = BytesIO() image.save(buf, format="JPEG") b64 = base64.b64encode(buf.getvalue()).decode("utf-8") class OcrEntry(BaseModel): text: str y: int size: int class OcrResult(BaseModel): items: List[OcrEntry] client = LLMClient() result = client.call( prompt=( "Extract all visible text from this image. " "For each text block, estimate its vertical position (y, 0=top) " "and approximate font size in pixels. Sort by y." ), schema=OcrResult, model=model, images=[b64], temperature=0, ) return json.dumps( [{"text": e.text, "y": e.y, "size": e.size} for e in result.items], ensure_ascii=False, ) @customtracer def image2text(image) -> str: """ input1 (image): PIL Image output1 (json): OCR結果 NOTE: HF版は VLM ベースOCR。Google Vision API は使用しない。 """ return _vlm_ocr_from_pil(image)