qp-parser / app /extractor.py
MakPr016
QP Parser
d81169f
import io
import easyocr
import fitz # pymupdf
import docx2txt
import numpy as np
from PIL import Image
from typing import Tuple
# Shared EasyOCR reader; stays None until get_reader() is first called.
_reader = None


def get_reader() -> easyocr.Reader:
    """Return the shared EasyOCR reader, creating it on the first call.

    The reader is constructed for English (``["en"]``) with ``gpu=False`` and
    stored in the module-level ``_reader`` so later calls reuse the same
    instance instead of building a new one.
    """
    global _reader
    if _reader is not None:
        return _reader
    _reader = easyocr.Reader(["en"], gpu=False)
    return _reader
def image_bytes_to_array(image_bytes: bytes) -> np.ndarray:
    """Decode encoded image bytes into a NumPy array of RGB pixels.

    The bytes are wrapped in an in-memory buffer, opened with Pillow,
    converted to ``"RGB"`` mode, and passed to ``np.array``.
    """
    buffer = io.BytesIO(image_bytes)
    rgb_image = Image.open(buffer).convert("RGB")
    return np.array(rgb_image)
def ocr_image_array(img_array: np.ndarray) -> str:
    """Run the shared EasyOCR reader on an image array and return its text.

    ``readtext`` is called with ``detail=0`` and ``paragraph=True``; the
    entries it returns are joined with newline characters.
    """
    text_blocks = get_reader().readtext(img_array, detail=0, paragraph=True)
    return "\n".join(text_blocks)
def extract_text_from_image(file_bytes: bytes) -> Tuple[str, int]:
    """OCR a single encoded image.

    Returns:
        A ``(text, page_count)`` tuple. The page count is always 1.
    """
    recognised = ocr_image_array(image_bytes_to_array(file_bytes))
    return recognised, 1
def extract_text_from_pdf(
    file_bytes: bytes,
    *,
    min_native_chars: int = 50,
    ocr_dpi: int = 200,
) -> Tuple[str, int]:
    """Extract text from a PDF, using OCR for pages with little native text.

    For every page the embedded text is read first. If the stripped text is
    longer than ``min_native_chars`` characters it is used as-is; otherwise
    the page is rasterised at ``ocr_dpi`` and passed through EasyOCR.

    Args:
        file_bytes: Raw bytes of the PDF document.
        min_native_chars: A page's native text must be strictly longer than
            this many characters to be used without OCR.
        ocr_dpi: Resolution used when rendering a page for the OCR fallback.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is the per-page text joined
        with a ``--- PAGE BREAK ---`` separator line; ``page_count`` is the
        number of pages in the document.
    """
    page_texts = []
    # The context manager closes the document even when text extraction,
    # rendering, or OCR raises part-way through the page loop.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        pages_processed = len(doc)
        for page in doc:
            # Try native text first
            native_text = page.get_text("text").strip()
            if len(native_text) > min_native_chars:
                page_texts.append(native_text)
                continue
            # Fallback to EasyOCR on rasterised page
            pix = page.get_pixmap(dpi=ocr_dpi)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            page_texts.append(ocr_image_array(np.array(img)))
    return "\n\n--- PAGE BREAK ---\n\n".join(page_texts), pages_processed
def extract_text_from_docx(file_bytes: bytes) -> Tuple[str, int]:
    """Extract text from a DOCX document held in memory.

    The bytes are wrapped in an in-memory buffer and handed to
    ``docx2txt.process``.

    Returns:
        A ``(text, page_count)`` tuple. A falsy result from docx2txt is
        replaced by an empty string; the page count is always 1.
    """
    extracted = docx2txt.process(io.BytesIO(file_bytes))
    if not extracted:
        extracted = ""
    return extracted, 1
def extract_text_from_file(file_bytes: bytes, ext: str) -> Tuple[str, int]:
    """Dispatch text extraction to the handler for the given file extension.

    The extension is matched case-insensitively, and surrounding whitespace
    and leading dots are ignored, so ``"pdf"``, ``"PDF"`` and ``".pdf"`` all
    select the PDF handler.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        ext: File extension; one of pdf, docx, png, jpg, jpeg or webp.

    Returns:
        The ``(text, page_count)`` tuple produced by the selected extractor.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Normalise so ".PDF" or "JPG" are not rejected. A non-string value
    # falls through to the ValueError below.
    normalized = ext.strip().lower().lstrip(".") if isinstance(ext, str) else None

    if normalized == "pdf":
        return extract_text_from_pdf(file_bytes)
    if normalized == "docx":
        return extract_text_from_docx(file_bytes)
    if normalized in {"png", "jpg", "jpeg", "webp"}:
        return extract_text_from_image(file_bytes)
    # Report the caller's original value, not the normalised one.
    raise ValueError(f"Unsupported extension: {ext}")