Spaces:

RyouTakahashi
/

ndlocr-lite-api

Sleeping

髙橋凌

Initial: ndlocr-lite OCR API on HuggingFace Spaces

1a33821 27 days ago

2.94 kB

	"""
	NDLOCR-Lite OCR API on HuggingFace Spaces.

	Extracts Japanese text from an uploaded PDF (first page only) or image file using ndlocr-lite.
	"""

	import os
	import uuid
	import shutil
	import subprocess
	import tempfile

	import gradio as gr

	# Absolute path of the bundled ndlocr-lite sources. It must be absolute because
	# run_ocr uses it both as the subprocess working directory and as the prefix of
	# the script path; a relative value would be resolved twice and not be found.
	NDLOCR_SRC = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ndlocr-lite", "src")


	def pdf_to_image(pdf_path: str, output_path: str):
	    """Render the first page of a PDF and save it as a JPEG image.

	    Args:
	        pdf_path: Path of the PDF file to read.
	        output_path: Path the JPEG image is written to.

	    Raises:
	        Exception: Whatever pypdfium2 / Pillow raise when the PDF cannot be
	            opened, has no first page, or the image cannot be written.
	    """
	    from contextlib import closing

	    import pypdfium2 as pdfium

	    # Close the page before the document, and both even when rendering fails,
	    # so the native PDFium handles are not leaked across requests.
	    with closing(pdfium.PdfDocument(pdf_path)) as pdf:
	        with closing(pdf[0]) as page:
	            # scale=2 renders at twice the default resolution.
	            bitmap = page.render(scale=2)
	            img = bitmap.to_pil()
	            # JPEG cannot store alpha/palette data; normalize defensively.
	            if img.mode not in ("RGB", "L"):
	                img = img.convert("RGB")
	            # Save while the document is still open: the PIL image may share
	            # memory with the rendered bitmap.
	            img.save(output_path, "JPEG", quality=95)


	def run_ocr(file_path: str) -> str:
	    """Run ndlocr-lite OCR on a PDF or image file and return the recognized text.

	    A PDF is first rasterized with ``pdf_to_image`` (first page only); any other
	    file is copied unchanged. The image is then handed to ``ocr.py`` under
	    ``NDLOCR_SRC`` in a subprocess, and the ``input.txt`` it writes into the
	    result directory is returned.

	    Args:
	        file_path: Path of the uploaded file. An upload object exposing its
	            path as a ``name`` attribute (as some Gradio versions deliver for
	            ``gr.File``) is accepted as well.

	    Returns:
	        The OCR text on success; otherwise a message (in Japanese) describing
	        the failure. Failures are reported through the return value instead of
	        being raised, so a UI can display them directly.
	    """
	    import sys

	    if not file_path:
	        return "ファイルが選択されていません"

	    # Normalize the input to a plain path string. Path-like objects are checked
	    # first because ``pathlib.Path.name`` is only the final path component.
	    if isinstance(file_path, (str, os.PathLike)):
	        src_path = os.fspath(file_path)
	    else:
	        src_path = getattr(file_path, "name", file_path)

	    # mkdtemp creates a fresh, private directory atomically.
	    tmp_dir = tempfile.mkdtemp(prefix="ocr_")
	    try:
	        result_dir = os.path.join(tmp_dir, "result")
	        os.makedirs(result_dir, exist_ok=True)

	        # splitext only looks at the final path component, so a dot in a
	        # directory name is not mistaken for an extension.
	        ext = os.path.splitext(src_path)[1].lstrip(".").lower()

	        if ext == "pdf":
	            img_path = os.path.join(tmp_dir, "input.jpg")
	            pdf_to_image(src_path, img_path)
	        else:
	            # Images are copied as-is; unknown extensions are stored as .jpg.
	            img_ext = ext if ext in ("jpg", "jpeg", "png", "bmp", "tiff") else "jpg"
	            img_path = os.path.join(tmp_dir, f"input.{img_ext}")
	            shutil.copy2(src_path, img_path)

	        # Run ndlocr-lite with the interpreter running this app, so the OCR
	        # script sees the same installed packages. Output is decoded leniently
	        # so undecodable bytes in the child's output cannot abort the request.
	        result = subprocess.run(
	            [
	                sys.executable or "python3",
	                os.path.join(NDLOCR_SRC, "ocr.py"),
	                "--sourceimg", img_path,
	                "--output", result_dir,
	            ],
	            timeout=120,
	            cwd=NDLOCR_SRC,
	            capture_output=True,
	            text=True,
	            encoding="utf-8",
	            errors="replace",
	        )

	        if result.returncode != 0:
	            # Keep the tail of stderr: a Python traceback ends with the error.
	            return f"OCRエラー: {result.stderr[-500:]}"

	        # The result file is named after the input image's stem ("input").
	        txt_path = os.path.join(result_dir, "input.txt")
	        if not os.path.exists(txt_path):
	            return "OCR結果が生成されませんでした"

	        with open(txt_path, "r", encoding="utf-8") as f:
	            return f.read()

	    except subprocess.TimeoutExpired:
	        return "OCR処理がタイムアウトしました（120秒）"
	    except Exception as e:
	        # UI boundary: turn any failure into a message for the user.
	        return f"エラー: {e}"
	    finally:
	        # Always remove the per-request scratch directory.
	        shutil.rmtree(tmp_dir, ignore_errors=True)


	# Gradio UI: a single file upload is passed to run_ocr and the returned text
	# is shown in a multi-line textbox.
	_upload_input = gr.File(
	    label="PDF / 画像をアップロード",
	    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".tiff"],
	)
	_text_output = gr.Textbox(label="OCR結果", lines=20)

	demo = gr.Interface(
	    fn=run_ocr,
	    inputs=_upload_input,
	    outputs=_text_output,
	    title="NDLOCR-Lite OCR",
	    description="日本語OCR（国立国会図書館 ndlocr-lite）。請求書・見積書などのPDF/画像からテキストを抽出します。",
	    allow_flagging="never",
	)

	if __name__ == "__main__":
	    # Start the web server only when this file is executed as a script.
	    demo.launch()