import io
from typing import Tuple

import docx2txt
import easyocr
import fitz  # pymupdf
import numpy as np
from PIL import Image

# A PDF page is OCR'd unless its embedded text layer is longer than this.
_MIN_NATIVE_TEXT_CHARS = 50
# Resolution used when rasterising a PDF page for OCR.
_PDF_OCR_DPI = 200
# Separator inserted between the text of consecutive PDF pages.
_PAGE_SEPARATOR = "\n\n--- PAGE BREAK ---\n\n"
# Image extensions (lower-case, no leading dot) routed to the OCR path.
_IMAGE_EXTENSIONS = frozenset({"png", "jpg", "jpeg", "webp"})

# Lazily created EasyOCR reader shared by every call in this module.
_reader = None


def get_reader() -> easyocr.Reader:
    """Return the module-wide EasyOCR reader, creating it on first use.

    The reader is configured for English on CPU (``gpu=False``) and is
    cached in the module-level ``_reader`` so it is built at most once.
    """
    global _reader
    if _reader is None:
        _reader = easyocr.Reader(["en"], gpu=False)
    return _reader


def image_bytes_to_array(image_bytes: bytes) -> np.ndarray:
    """Decode encoded image bytes into an RGB NumPy array.

    Args:
        image_bytes: Raw contents of an image file in any format PIL can open.

    Returns:
        The image converted to RGB mode, as a NumPy array.
    """
    # The context manager releases the source image once the RGB copy exists.
    with Image.open(io.BytesIO(image_bytes)) as img:
        return np.array(img.convert("RGB"))


def ocr_image_array(img_array: np.ndarray) -> str:
    """Run EasyOCR on an image array and return the recognised text.

    Args:
        img_array: Image data as accepted by ``easyocr.Reader.readtext``.

    Returns:
        The recognised text chunks joined with newlines.
    """
    reader = get_reader()
    # detail=0 asks for plain strings; paragraph=True merges nearby lines.
    results = reader.readtext(img_array, detail=0, paragraph=True)
    return "\n".join(results)


def extract_text_from_image(file_bytes: bytes) -> Tuple[str, int]:
    """OCR a single image file.

    Args:
        file_bytes: Raw contents of the image file.

    Returns:
        ``(text, 1)`` -- an image always counts as one page.
    """
    img_array = image_bytes_to_array(file_bytes)
    return ocr_image_array(img_array), 1


def extract_text_from_pdf(file_bytes: bytes) -> Tuple[str, int]:
    """Extract text from a PDF, falling back to OCR for text-poor pages.

    For each page the embedded text layer is tried first. If it holds
    ``_MIN_NATIVE_TEXT_CHARS`` characters or fewer after stripping, the page
    is rasterised at ``_PDF_OCR_DPI`` and passed through EasyOCR instead.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        ``(text, page_count)`` where the page texts are joined with
        ``_PAGE_SEPARATOR`` and ``page_count`` is the number of pages.
    """
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    # try/finally so the document is closed even when a page fails to
    # render or OCR raises part-way through.
    try:
        pages_processed = len(doc)
        all_text_parts = []
        for page in doc:
            # Try native text first.
            native_text = page.get_text("text").strip()
            if len(native_text) > _MIN_NATIVE_TEXT_CHARS:
                all_text_parts.append(native_text)
                continue

            # Fallback to EasyOCR on the rasterised page.
            # NOTE(review): this relies on get_pixmap() producing 3-channel
            # RGB samples without alpha (the PyMuPDF default) -- confirm if
            # the call is ever given a different colorspace or alpha=True.
            pix = page.get_pixmap(dpi=_PDF_OCR_DPI)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            all_text_parts.append(ocr_image_array(np.array(img)))
    finally:
        doc.close()

    return _PAGE_SEPARATOR.join(all_text_parts), pages_processed


def extract_text_from_docx(file_bytes: bytes) -> Tuple[str, int]:
    """Extract text from a DOCX file.

    Args:
        file_bytes: Raw contents of the DOCX file.

    Returns:
        ``(text, 1)``; ``text`` is ``""`` when docx2txt yields nothing. The
        page count is fixed at 1 because no pagination is computed here.
    """
    text = docx2txt.process(io.BytesIO(file_bytes))
    return text or "", 1


def extract_text_from_file(file_bytes: bytes, ext: str) -> Tuple[str, int]:
    """Dispatch to the extractor matching a file extension.

    The extension is matched case-insensitively and with or without a
    leading dot, so ``"pdf"``, ``"PDF"`` and ``".pdf"`` are equivalent.

    Args:
        file_bytes: Raw contents of the file.
        ext: File extension: ``pdf``, ``docx``, ``png``, ``jpg``, ``jpeg``
            or ``webp``.

    Returns:
        ``(text, page_count)`` as produced by the selected extractor.

    Raises:
        ValueError: If the extension is not supported.
    """
    normalized = ext.strip().lower().lstrip(".")
    if normalized == "pdf":
        return extract_text_from_pdf(file_bytes)
    if normalized == "docx":
        return extract_text_from_docx(file_bytes)
    if normalized in _IMAGE_EXTENSIONS:
        return extract_text_from_image(file_bytes)
    # Report the caller's original value so the message matches their input.
    raise ValueError(f"Unsupported extension: {ext}")