Spaces:

DLPO
/

api_light_hf

Running

api_light_hf / apis /image2text.py

deploy api_light_hf (2026-03-12 12:47:03)

cf7f643 5 days ago

1.49 kB

	"""
	image2text: PIL Image を OCR してテキストを返す。
	HF版: VLM (Qwen2.5-VL) を使用。Google Vision API は使用しない。
	"""

	import base64
	import json
	from io import BytesIO

	from src.utils.tracer import customtracer


	def _vlm_ocr_from_pil(image, model: str = "Qwen/Qwen2.5-VL-7B-Instruct") -> str:
	    """Run VLM-based OCR on a PIL image and return the result as a JSON string.

	    The image is JPEG-encoded in memory, base64-encoded, and passed to the
	    project's ``LLMClient`` together with a prompt asking for every visible
	    text block, its vertical position and its approximate font size.

	    Args:
	        image: PIL Image to read text from. Images whose mode JPEG cannot
	            store (e.g. ``RGBA``, ``LA``, ``P``) are converted to ``RGB``
	            before encoding; the caller's image object is not modified.
	        model: Model identifier forwarded to ``LLMClient.call``.

	    Returns:
	        A JSON array string of ``{"text": ..., "y": ..., "size": ...}``
	        objects, in the order the model returned them (the prompt asks for
	        ascending ``y``; no re-sorting is done here). Non-ASCII text is
	        emitted unescaped.
	    """
	    from src.clients.llm_client import LLMClient
	    from pydantic import BaseModel
	    from typing import List

	    # JPEG has no alpha channel or palette support: saving an RGBA/LA/P image
	    # raises OSError. Normalise such modes to RGB first (alpha is dropped).
	    if image.mode not in ("L", "RGB", "CMYK"):
	        image = image.convert("RGB")

	    buf = BytesIO()
	    image.save(buf, format="JPEG")
	    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

	    class OcrEntry(BaseModel):
	        """One recognised text block as requested from the model."""

	        text: str
	        y: int  # vertical position, 0 = top (per the prompt)
	        size: int  # approximate font size in pixels (per the prompt)

	    class OcrResult(BaseModel):
	        """Structured response schema handed to ``LLMClient.call``."""

	        items: List[OcrEntry]

	    client = LLMClient()
	    result = client.call(
	        prompt=(
	            "Extract all visible text from this image. "
	            "For each text block, estimate its vertical position (y, 0=top) "
	            "and approximate font size in pixels. Sort by y."
	        ),
	        schema=OcrResult,
	        model=model,
	        images=[b64],
	        temperature=0,
	    )
	    # ensure_ascii=False keeps non-ASCII OCR text readable in the JSON output.
	    return json.dumps(
	        [{"text": e.text, "y": e.y, "size": e.size} for e in result.items],
	        ensure_ascii=False,
	    )


	@customtracer
	def image2text(image) -> str:
	    """
	    input1 (image): PIL Image
	    output1 (json): OCR result

	    NOTE: The HF build performs OCR with a VLM; the Google Vision API is not used.
	    """
	    # NOTE(review): the input1/output1 docstring layout looks like a
	    # convention read by project tooling — confirm before reformatting it.
	    ocr_json = _vlm_ocr_from_pil(image)
	    return ocr_json