| import io |
| import easyocr |
| import fitz |
| import docx2txt |
| import numpy as np |
| from PIL import Image |
| from typing import Tuple |
|
|
# Process-wide EasyOCR reader; stays None until get_reader() is first called.
_reader = None


def get_reader() -> easyocr.Reader:
    """Return the shared EasyOCR reader, constructing it on first use.

    The reader is created for English only with the GPU disabled, then
    cached in the module-level ``_reader`` so later calls reuse the same
    instance instead of building a new one.
    """
    global _reader
    if _reader is not None:
        return _reader
    _reader = easyocr.Reader(["en"], gpu=False)
    return _reader
|
|
|
|
def image_bytes_to_array(image_bytes: bytes) -> np.ndarray:
    """Decode encoded image bytes into a NumPy array.

    The bytes are opened with Pillow from an in-memory buffer and converted
    to "RGB" mode before being turned into an array.
    """
    buffer = io.BytesIO(image_bytes)
    rgb_image = Image.open(buffer).convert("RGB")
    return np.array(rgb_image)
|
|
|
|
def ocr_image_array(img_array: np.ndarray) -> str:
    """Run OCR on an image array and return the text joined by newlines.

    Uses the shared reader from ``get_reader()``; each entry returned by
    ``readtext`` becomes one line of the result.
    """
    # detail=0 / paragraph=True are passed through to EasyOCR unchanged; the
    # result is joined as a sequence of strings.
    paragraphs = get_reader().readtext(img_array, detail=0, paragraph=True)
    return "\n".join(paragraphs)
|
|
|
|
def extract_text_from_image(file_bytes: bytes) -> Tuple[str, int]:
    """OCR a single image supplied as raw encoded bytes.

    Returns:
        A ``(text, page_count)`` tuple; the page count is always 1.
    """
    recognized = ocr_image_array(image_bytes_to_array(file_bytes))
    return recognized, 1
|
|
|
|
def extract_text_from_pdf(file_bytes: bytes) -> Tuple[str, int]:
    """Extract text from a PDF, using OCR for pages with little native text.

    Each page's embedded text is read first. If it is longer than 50
    characters after stripping, it is used as-is; otherwise the page is
    rendered at 200 DPI and passed through OCR. Should OCR produce nothing,
    whatever short native text the page had is kept rather than dropped.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        A ``(text, page_count)`` tuple, where the per-page texts are joined
        with a ``--- PAGE BREAK ---`` separator and ``page_count`` is the
        number of pages in the document.
    """
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    # try/finally guarantees the document is released even if rendering or
    # OCR fails part-way through the page loop.
    try:
        pages_processed = len(doc)
        all_text_parts = []

        for page in doc:
            native_text = page.get_text("text").strip()

            if len(native_text) > 50:
                all_text_parts.append(native_text)
                continue

            # Sparse or missing text layer: treat the page as scanned.
            # NOTE(review): "RGB" relies on get_pixmap's default output
            # (RGB colorspace, no alpha channel) - confirm against PyMuPDF.
            pix = page.get_pixmap(dpi=200)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            ocr_text = ocr_image_array(np.array(img))

            # Keep the short native text when OCR finds nothing at all.
            all_text_parts.append(ocr_text or native_text)
    finally:
        doc.close()

    return "\n\n--- PAGE BREAK ---\n\n".join(all_text_parts), pages_processed
|
|
|
|
def extract_text_from_docx(file_bytes: bytes) -> Tuple[str, int]:
    """Extract the text of a DOCX document supplied as raw bytes.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is an empty string when
        docx2txt yields a falsy result; the page count is always 1.
    """
    extracted = docx2txt.process(io.BytesIO(file_bytes))
    if not extracted:
        return "", 1
    return extracted, 1
|
|
|
|
def extract_text_from_file(file_bytes: bytes, ext: str) -> Tuple[str, int]:
    """Dispatch to the right text extractor based on a file extension.

    The extension is matched case-insensitively and may carry surrounding
    whitespace or a leading dot, so ``"pdf"``, ``"PDF"`` and ``".pdf"`` are
    all treated the same.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        ext: File extension identifying the format (``pdf``, ``docx``,
            ``png``, ``jpg``, ``jpeg`` or ``webp``).

    Returns:
        The ``(text, page_count)`` tuple produced by the selected extractor.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Callers commonly pass values such as ".PDF" straight from a filename.
    normalized = ext.strip().lower().lstrip(".")

    if normalized == "pdf":
        return extract_text_from_pdf(file_bytes)
    if normalized == "docx":
        return extract_text_from_docx(file_bytes)
    if normalized in {"png", "jpg", "jpeg", "webp"}:
        return extract_text_from_image(file_bytes)

    # Report the value exactly as the caller supplied it.
    raise ValueError(f"Unsupported extension: {ext}")