api_light_hf / apis /image2text.py
Renecto's picture
deploy api_light_hf (2026-03-12 12:47:03)
cf7f643
"""
image2text: PIL Image を OCR してテキストを返す。
HF版: VLM (Qwen2.5-VL) を使用。Google Vision API は使用しない。
"""
import base64
import json
from io import BytesIO
from src.utils.tracer import customtracer
def _vlm_ocr_from_pil(image, model: str = "Qwen/Qwen2.5-VL-7B-Instruct") -> str:
    """Run VLM-based OCR on a PIL image and return the result as a JSON string.

    The image is JPEG-encoded in memory, base64-encoded, and passed to the
    project's ``LLMClient.call`` together with a structured-output schema.

    Args:
        image: PIL Image to read text from. Images in a mode other than
            ``RGB`` or ``L`` (e.g. ``RGBA``, ``P``, ``LA``) are converted to
            ``RGB`` first; the caller's image object is not modified.
        model: Model identifier forwarded to ``LLMClient.call``.

    Returns:
        A JSON array (as ``str``) of ``{"text", "y", "size"}`` objects, one per
        entry returned by the model and in the order the model returned them.
        Non-ASCII characters are emitted unescaped.
    """
    # Kept function-local as in the original: these are resolved when the
    # function is called rather than when the module is imported.
    from src.clients.llm_client import LLMClient
    from pydantic import BaseModel
    from typing import List

    # JPEG cannot store alpha/palette/integer/float modes; Pillow raises
    # OSError ("cannot write mode RGBA as JPEG") for them. Normalise to RGB so
    # PNG screenshots with transparency or palettes do not crash the encoder.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buf = BytesIO()
    image.save(buf, format="JPEG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    class OcrEntry(BaseModel):
        # One text block as reported by the model.
        text: str
        # Estimated vertical position requested in the prompt (0 = top).
        y: int
        # Approximate font size in pixels, as requested in the prompt.
        size: int

    class OcrResult(BaseModel):
        # All text blocks the model reported for the image.
        items: List[OcrEntry]

    client = LLMClient()
    result = client.call(
        prompt=(
            "Extract all visible text from this image. "
            "For each text block, estimate its vertical position (y, 0=top) "
            "and approximate font size in pixels. Sort by y."
        ),
        schema=OcrResult,
        model=model,
        images=[b64],
        temperature=0,
    )

    # ensure_ascii=False keeps Japanese (and other non-ASCII) text readable
    # instead of emitting \uXXXX escapes.
    return json.dumps(
        [{"text": e.text, "y": e.y, "size": e.size} for e in result.items],
        ensure_ascii=False,
    )
@customtracer
def image2text(image) -> str:
    """
    input1 (image): PIL Image
    output1 (json): OCR result
    NOTE: The HF version uses VLM-based OCR; the Google Vision API is not used.
    """
    # Delegate to the VLM helper with its default model.
    ocr_json = _vlm_ocr_from_pil(image)
    return ocr_json