File size: 1,485 Bytes
cf7f643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
image2text: OCR a PIL Image and return the extracted text.

HF version: uses a VLM (Qwen2.5-VL). The Google Vision API is not used.
"""

import base64
import json
from io import BytesIO

from src.utils.tracer import customtracer


def _vlm_ocr_from_pil(image, model: str = "Qwen/Qwen2.5-VL-7B-Instruct") -> str:
    """OCR a PIL Image via a vision-language model.

    The image is JPEG-encoded, base64-encoded, and sent to the VLM with a
    structured-output schema. The model's text blocks are serialized and
    returned as a JSON string of ``[{"text", "y", "size"}, ...]``, where the
    prompt asks the model to sort blocks top-to-bottom by ``y``.

    Args:
        image: PIL Image to OCR.
        model: VLM model identifier passed through to the LLM client.

    Returns:
        JSON string (non-ASCII characters preserved) of detected text blocks.
    """
    # Imported lazily so importing this module stays cheap when OCR is unused.
    from src.clients.llm_client import LLMClient
    from pydantic import BaseModel
    from typing import List

    # BUG FIX: JPEG cannot encode an alpha channel or palette modes
    # (RGBA / LA / P); PIL raises OSError on save for those. Normalize to
    # RGB first (grayscale "L" is JPEG-safe and kept as-is).
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buf = BytesIO()
    image.save(buf, format="JPEG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    class OcrEntry(BaseModel):
        # text: extracted text block
        # y: estimated vertical position, 0 = top of image
        # size: approximate font size in pixels
        text: str
        y: int
        size: int

    class OcrResult(BaseModel):
        items: List[OcrEntry]

    client = LLMClient()
    result = client.call(
        prompt=(
            "Extract all visible text from this image. "
            "For each text block, estimate its vertical position (y, 0=top) "
            "and approximate font size in pixels. Sort by y."
        ),
        schema=OcrResult,
        model=model,
        images=[b64],
        temperature=0,  # deterministic decoding for OCR
    )
    return json.dumps(
        [{"text": e.text, "y": e.y, "size": e.size} for e in result.items],
        ensure_ascii=False,
    )


@customtracer
def image2text(image) -> str:
    """OCR a PIL Image and return the result as a JSON string.

    input1 (image): PIL Image
    output1 (json): OCR result

    NOTE: the HF version performs VLM-based OCR; the Google Vision API
    is not used.
    """
    ocr_json = _vlm_ocr_from_pil(image)
    return ocr_json