Spaces:

fumiyaaa
/

dokoCame

Sleeping

dokoCame / core /ocr_engine.py

Fumiya Imazato

Fix PaddleOCR 3.x API and improve overlay display

319f4cb about 1 month ago

5.78 kB

	"""
	PaddleOCR ラッパー

	【重要】PaddleOCR 3.x API変更：
	- ocr() → predict() に変更
	- use_angle_cls は廃止
	- use_doc_orientation_classify, use_doc_unwarping 等を使用
	"""

	from typing import List, Tuple, Optional
	from dataclasses import dataclass
	import numpy as np


	@dataclass
	class OCRResult:
	    """
	    One piece of text found by OCR.

	    Attributes:
	        text: Recognized string.
	        confidence: Recognition score reported by the engine.
	        bbox: Corner points of the region the text was found in.
	    """

	    text: str
	    confidence: float
	    # Point layout: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
	    bbox: List[List[int]]


	class OCREngine:
	    """
	    Thin wrapper around PaddleOCR (3.x API) for pulling text out of frames.

	    The underlying ``PaddleOCR`` object is created lazily on first use with
	    document-orientation classification, document unwarping and text-line
	    orientation switched off.  If PaddleOCR cannot be imported or
	    constructed, the detection methods return empty lists and
	    ``is_available`` is False instead of raising.
	    """

	    # Default score cut-off shared by the filtering helpers below.
	    DEFAULT_MIN_CONFIDENCE = 0.5

	    def __init__(self, lang: str = "japan"):
	        """
	        Args:
	            lang: Language setting.  It is only stored on the instance;
	                this class does not pass it on to PaddleOCR.
	        """
	        self.lang = lang
	        self._ocr = None
	        self._initialized = False
	        # Set once the paddleocr import has failed, so the import (and its
	        # warning) is not repeated on every frame.
	        self._import_failed = False

	    def _init_ocr(self) -> None:
	        """
	        Create the PaddleOCR object on first use.

	        A missing ``paddleocr`` package is remembered and not retried.  Any
	        other construction error is reported and retried on the next call.
	        """
	        if self._initialized or self._import_failed:
	            return

	        try:
	            from paddleocr import PaddleOCR

	            # PaddleOCR 3.x constructor; the document-correction stages are
	            # disabled for speed.
	            self._ocr = PaddleOCR(
	                use_doc_orientation_classify=False,
	                use_doc_unwarping=False,
	                use_textline_orientation=False,
	            )
	            self._initialized = True
	            print("[OCR] PaddleOCR 3.x initialized successfully")
	        except ImportError:
	            print("Warning: PaddleOCR not installed. OCR will not work.")
	            self._initialized = False
	            self._import_failed = True
	        except Exception as e:
	            print(f"Warning: PaddleOCR init error: {e}")
	            self._initialized = False

	    def detect(self, frame: np.ndarray) -> "List[OCRResult]":
	        """
	        Detect text in a frame.

	        Args:
	            frame: Input image (BGR channel order).

	        Returns:
	            A list of OCRResult.  Empty when the frame is None or an empty
	            array, when the engine is unavailable, or when prediction or
	            result parsing raises (the error is printed with a traceback).
	        """
	        # Nothing to recognise: skip the engine rather than letting
	        # predict() fail and print a traceback for every such frame.
	        if frame is None:
	            return []
	        if isinstance(frame, np.ndarray) and frame.size == 0:
	            return []

	        self._init_ocr()
	        if self._ocr is None:
	            return []

	        try:
	            # PaddleOCR 3.x: predict() replaces ocr().
	            raw = self._ocr.predict(frame)
	            if raw is None:
	                return []
	            # Parsing stays inside the try block because the result may be
	            # a generator that only raises while it is being consumed.
	            return self._parse_results(raw)
	        except Exception as e:
	            print(f"OCR error: {e}")
	            import traceback
	            traceback.print_exc()
	            return []

	    def _parse_results(self, raw) -> "List[OCRResult]":
	        """Flatten every per-image entry of a prediction into OCRResults."""
	        parsed = []
	        for item in raw:
	            if item is None:
	                continue
	            if isinstance(item, dict):
	                parsed.extend(self._from_dict_item(item))
	            elif isinstance(item, (list, tuple)):
	                parsed.extend(self._from_legacy_item(item))
	        return parsed

	    @staticmethod
	    def _from_dict_item(item: dict) -> "List[OCRResult]":
	        """Convert one dict-style (PaddleOCR 3.x) result into OCRResults."""

	        def field(name):
	            # Treat a key holding None the same as a missing key.
	            value = item.get(name)
	            return [] if value is None else value

	        texts = field("rec_texts")
	        scores = field("rec_scores")
	        # PaddleOCR 3.x documents "rec_polys" as the detection polygons that
	        # survived the recognition-score filter, so it lines up index by
	        # index with "rec_texts".  "dt_polys" lists every detected polygon
	        # and is only used when "rec_polys" is absent.
	        polys = item.get("rec_polys")
	        if polys is None:
	            polys = field("dt_polys")

	        found = []
	        for i, text in enumerate(texts):
	            # Skip empty and whitespace-only strings.
	            if not (text and text.strip()):
	                continue
	            score = scores[i] if i < len(scores) else 0.0
	            if i < len(polys):
	                bbox = polys[i]
	            else:
	                # No polygon for this text: use an all-zero quadrilateral.
	                bbox = [[0, 0], [0, 0], [0, 0], [0, 0]]
	            found.append(
	                OCRResult(text=text, confidence=float(score), bbox=bbox)
	            )
	        return found

	    @staticmethod
	    def _from_legacy_item(item) -> "List[OCRResult]":
	        """Convert one legacy list/tuple result into OCRResults.

	        Each usable line is expected to look like ``[bbox, (text, score)]``.
	        """
	        found = []
	        for line in item:
	            if not isinstance(line, (list, tuple)) or len(line) < 2:
	                continue
	            bbox, text_info = line[0], line[1]
	            if text_info and len(text_info) >= 2:
	                found.append(
	                    OCRResult(
	                        text=text_info[0],
	                        confidence=float(text_info[1]),
	                        bbox=bbox,
	                    )
	                )
	        return found

	    def detect_text_only(
	        self, frame: np.ndarray, min_confidence: float = DEFAULT_MIN_CONFIDENCE
	    ) -> List[str]:
	        """
	        Extract only the text strings, filtered by confidence.

	        Args:
	            frame: Input image.
	            min_confidence: Results scoring below this value are dropped.

	        Returns:
	            The detected text strings, in detection order.
	        """
	        return [
	            r.text for r in self.detect(frame) if r.confidence >= min_confidence
	        ]

	    def detect_with_positions(
	        self, frame: np.ndarray, min_confidence: float = DEFAULT_MIN_CONFIDENCE
	    ) -> List[Tuple[str, float, Tuple[int, int]]]:
	        """
	        Extract text together with the centre of each bounding polygon.

	        Args:
	            frame: Input image.
	            min_confidence: Results scoring below this value are dropped.

	        Returns:
	            A list of (text, confidence, (center_x, center_y)) tuples.
	        """
	        output = []

	        for r in self.detect(frame):
	            if r.confidence < min_confidence:
	                continue
	            # Convert to float before summing so that narrow NumPy integer
	            # scalars (if the polygon is a NumPy array) cannot overflow.
	            xs = [float(p[0]) for p in r.bbox]
	            ys = [float(p[1]) for p in r.bbox]
	            # Average over the actual number of points so polygons that are
	            # not quadrilaterals get a correct centre; an empty polygon
	            # maps to (0, 0).
	            count = max(len(xs), 1)
	            center = (int(sum(xs) / count), int(sum(ys) / count))
	            output.append((r.text, r.confidence, center))

	        return output

	    @property
	    def is_available(self) -> bool:
	        """Whether the OCR engine can be used (triggers lazy initialization)."""
	        self._init_ocr()
	        return self._initialized and self._ocr is not None