llm-ready-data / app /services /ocr_service.py
Soumik Bose
ok
6c24b50
Raw
History Blame Contribute Delete
3.58 kB
from __future__ import annotations
import io
import threading
from typing import Union
from urllib.parse import urlparse
import numpy as np
from PIL import Image
from app.config import get_settings
from app.core.logger import get_logger
# Module-level logger obtained from the project's logging helper.
_logger = get_logger(__name__)
# Settings are fetched once, at import time; _get_engine() reads the
# ocr_*_cuda / ocr_*_dml flags from this object.
_settings = get_settings()
# Held by _get_engine() while the shared OCR engine is being constructed.
_lock = threading.Lock()
# Shared OCR engine instance; stays None until the first _get_engine() call.
_engine = None
def _get_engine():
    """Return the module-wide RapidOCR engine, creating it on first use.

    The engine is built at most once: the ``_engine`` global is checked,
    and if it is still unset it is re-checked while holding ``_lock``
    before the engine is constructed. ``rapidocr_onnxruntime`` is imported
    only at that point, so importing this module does not import it.

    Returns:
        The shared ``RapidOCR`` instance.
    """
    global _engine
    if _engine is not None:
        return _engine

    with _lock:
        # Another thread may have finished construction while we waited.
        if _engine is None:
            from rapidocr_onnxruntime import RapidOCR

            # One options dict per pipeline stage, driven by app settings.
            stage_options = {
                "Det": {
                    "use_cuda": _settings.ocr_det_cuda,
                    "use_dml": _settings.ocr_det_dml,
                },
                "Cls": {
                    "use_cuda": _settings.ocr_cls_cuda,
                    "use_dml": _settings.ocr_cls_dml,
                },
                "Rec": {
                    "use_cuda": _settings.ocr_rec_cuda,
                    "use_dml": _settings.ocr_rec_dml,
                },
            }
            _engine = RapidOCR(**stage_options, print_verbose=False)
    return _engine
def _to_numpy(source) -> Union[np.ndarray, str]:
    """Normalise an OCR input into something the engine call accepts.

    Args:
        source: One of
            * a ``PIL.Image.Image``,
            * ``bytes`` / ``bytearray`` holding an encoded image,
            * a ``str`` -- an ``http``/``https`` URL is downloaded and
              decoded; any other string is handed back unchanged,
            * a ``numpy.ndarray``, which is handed back unchanged.

    Returns:
        A numpy array of the image pixels, or the original string when it
        is not an http(s) URL.

    Raises:
        TypeError: If ``source`` is none of the supported types.
        httpx.HTTPStatusError: If the URL download returns an error status.
    """

    def _as_array(picture):
        # RGB, L and RGBA are kept as they are; every other mode is
        # converted to RGB before building the array.
        if picture.mode in ("RGB", "L", "RGBA"):
            return np.array(picture)
        return np.array(picture.convert("RGB"))

    if isinstance(source, Image.Image):
        return _as_array(source)

    if isinstance(source, (bytes, bytearray)):
        return _as_array(Image.open(io.BytesIO(source)))

    if isinstance(source, np.ndarray):
        return source

    if not isinstance(source, str):
        raise TypeError(
            f"ocr_image expects bytes, str, numpy.ndarray or PIL.Image; got {type(source).__name__}"
        )

    if urlparse(source).scheme not in {"http", "https"}:
        # Not a web URL: returned untouched for the caller to interpret.
        return source

    # Imported lazily so httpx is only needed when a URL is actually fetched.
    import httpx

    response = httpx.get(source, follow_redirects=True, timeout=30)
    response.raise_for_status()
    return _as_array(Image.open(io.BytesIO(response.content)))
def ocr_image(
    source,
    *,
    use_det: bool = True,
    use_cls: bool = True,
    use_rec: bool = True,
    text_score: float = 0.5,
) -> str:
    """Run OCR on a single image and return the recognised text.

    Args:
        source: Image input accepted by ``_to_numpy`` (PIL image, encoded
            bytes, path/URL string or numpy array).
        use_det: Forwarded to the engine call as ``use_det``.
        use_cls: Forwarded to the engine call as ``use_cls``.
        use_rec: Forwarded to the engine call as ``use_rec``.
        text_score: Forwarded to the engine call as ``text_score``.

    Returns:
        The recognised lines joined with ``"\\n"``, or ``""`` when the
        engine returns nothing or recognition is disabled.
    """
    engine = _get_engine()
    img = _to_numpy(source)
    result, _ = engine(
        img,
        use_det=use_det,
        use_cls=use_cls,
        use_rec=use_rec,
        text_score=text_score,
    )
    # Without the recognition stage there is no text in the result, so
    # there is nothing to join.
    if not result or not use_rec:
        return ""

    lines = []
    for item in result:
        if not isinstance(item, (list, tuple)):
            continue
        # The position of the text depends on which stages ran (with
        # detection a row starts with the box, without it the text comes
        # first), so take the first string field instead of assuming
        # index 1.
        text = next((field for field in item if isinstance(field, str)), "")
        if text:
            lines.append(text)
    return "\n".join(lines)
def ocr_pdf(source: Union[str, bytes], *, dpi: int = 150) -> str:
    """Render every page of a PDF and OCR it, returning the combined text.

    This is best-effort: any failure is logged and ``""`` is returned
    instead of raising.

    Args:
        source: Value passed straight to ``pypdfium2.PdfDocument`` (the
            signature allows a ``str`` or ``bytes``).
        dpi: Render resolution; converted to a scale factor relative to
            72 DPI.

    Returns:
        The text of all pages that produced text, separated by blank
        lines, or ``""`` if ``pypdfium2`` is missing or processing fails.
    """
    try:
        import pypdfium2 as pdfium
    except ImportError:
        _logger.error("pypdfium2 not installed")
        return ""

    try:
        pdf = pdfium.PdfDocument(source)
        try:
            scale = dpi / 72.0
            page_texts: list[str] = []
            for page_index in range(len(pdf)):
                page = pdf[page_index]
                bitmap = page.render(scale=scale, rotation=0)
                page_text = ocr_image(bitmap.to_pil())
                if page_text:
                    page_texts.append(page_text)
        finally:
            # Release the document even when rendering or OCR of a page
            # fails; previously close() was only reached on success.
            pdf.close()
        return "\n\n".join(page_texts)
    except Exception as exc:
        _logger.error("Failed to OCR PDF: %s", exc)
        return ""
class OCRService:
    """Object-style facade over the module-level OCR helpers."""

    def __init__(self) -> None:
        # Fetching the engine here means it is built (if it was not
        # already) when the service is constructed.
        self._engine = _get_engine()

    def image_to_text(self, source, text_score: float = 0.5) -> str:
        """Return the text found in one image via ``ocr_image``.

        Args:
            source: Image input forwarded unchanged to ``ocr_image``.
            text_score: Forwarded to ``ocr_image`` as ``text_score``.
        """
        extracted = ocr_image(source, text_score=text_score)
        return extracted

    def pdf_to_text(self, source: Union[str, bytes], dpi: int = 150) -> str:
        """Return the text found in a PDF via ``ocr_pdf``.

        Args:
            source: PDF input forwarded unchanged to ``ocr_pdf``.
            dpi: Forwarded to ``ocr_pdf`` as ``dpi``.
        """
        extracted = ocr_pdf(source, dpi=dpi)
        return extracted