wuwa-dev-ocr / ocr_engine.py
system's picture
system HF Staff
Deploy OCR from GitHub 934a28b
be754f2 verified
Raw
History Blame Contribute Delete
1.76 kB
from collections import OrderedDict
import gc
import logging
import threading
from paddleocr import PaddleOCR
from ocr_config import (
LANG_MAP,
MAX_LOADED_LANGS,
configure_model_environment,
normalize_lang,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Cache of initialized PaddleOCR engines keyed by normalized language code.
# Insertion order doubles as recency order: get_ocr() moves a hit to the end
# and _evict_old_engines() pops from the front.
_ocr_engines = OrderedDict()
# Held by get_ocr() while it looks up, creates, and evicts engines.
# NOTE(review): loaded_langs() and is_lang_loaded() read the cache without it.
_ocr_lock = threading.Lock()
def get_ocr(raw_lang: str | None = "kr"):
    """Return the PaddleOCR engine for ``raw_lang``, building it on first use.

    The requested language is passed through ``normalize_lang`` and the result
    is used as the cache key. A cache hit is marked most-recently-used and
    returned. A miss configures the model environment, constructs a new
    engine, stores it, and lets ``_evict_old_engines`` trim the cache.
    The lookup and the construction both run while holding ``_ocr_lock``.

    Args:
        raw_lang: Requested language identifier; may be ``None``.

    Returns:
        The cached or newly created ``PaddleOCR`` instance.

    Raises:
        Exception: Anything raised during initialization is logged with its
            traceback and re-raised unchanged.
    """
    lang_key = normalize_lang(raw_lang)

    # Constructor flags that are the same for every language.
    fixed_options = {
        "use_angle_cls": True,
        "use_gpu": False,
        "show_log": False,
        "use_mp": False,
        "enable_mkldnn": False,
    }

    with _ocr_lock:
        if lang_key not in _ocr_engines:
            # Cache miss: build the engine while still holding the lock.
            try:
                logger.info("Initializing PaddleOCR for lang=%s...", lang_key)
                configure_model_environment()
                _ocr_engines[lang_key] = PaddleOCR(
                    lang=LANG_MAP[lang_key],
                    **fixed_options,
                )
                _ocr_engines.move_to_end(lang_key)
                _evict_old_engines()
                logger.info(
                    "PaddleOCR initialized successfully for lang=%s.", lang_key
                )
                return _ocr_engines[lang_key]
            except Exception:
                logger.exception(
                    "PaddleOCR initialization failed for lang=%s.", lang_key
                )
                raise

        # Cache hit: refresh recency so this language is evicted last.
        _ocr_engines.move_to_end(lang_key)
        return _ocr_engines[lang_key]
def _evict_old_engines():
    """Trim the engine cache to at most ``MAX_LOADED_LANGS`` entries.

    Entries are removed from the front of ``_ocr_engines`` (the least recently
    used end) until the cache fits. A ``MAX_LOADED_LANGS`` of zero or less
    disables eviction entirely. Each removal is logged and followed by a
    garbage-collection pass.

    This function does not take ``_ocr_lock`` itself; ``get_ocr`` calls it
    while already holding the lock.
    """
    while MAX_LOADED_LANGS > 0 and len(_ocr_engines) > MAX_LOADED_LANGS:
        # Keep only the key. Binding the evicted engine to a local would keep
        # it reachable from this frame, so the gc.collect() below could not
        # reclaim it.
        removed_lang = _ocr_engines.popitem(last=False)[0]
        logger.info("Unloaded PaddleOCR lang=%s by LRU policy.", removed_lang)
        gc.collect()
def loaded_langs():
    """Return the language keys currently in the engine cache, oldest first."""
    # Iterating the OrderedDict yields its keys in cache order.
    return list(_ocr_engines)
def supported_langs():
    """Return every language key defined in ``LANG_MAP`` as a new list."""
    return list(LANG_MAP)
def is_lang_loaded(raw_lang: str | None):
    """Report whether an engine for ``raw_lang`` is currently cached.

    The language is normalized with ``normalize_lang`` before the lookup, and
    the check does not change the cache's recency order.
    """
    lang_key = normalize_lang(raw_lang)
    return lang_key in _ocr_engines