Spaces:

MakPr016
/

qp-parser

Sleeping

qp-parser / app /extractor.py

MakPr016

QP Parser

d81169f about 1 month ago

2.11 kB

	import io
	import easyocr
	import fitz # pymupdf
	import docx2txt
	import numpy as np
	from PIL import Image
	from typing import Tuple

	# Process-wide EasyOCR reader; stays None until get_reader() is first called.
	_reader = None

	def get_reader() -> easyocr.Reader:
	    """Return the shared EasyOCR reader, constructing it on first use.

	    The reader is built once with the English language list and
	    ``gpu=False``, then cached in the module-level ``_reader`` so later
	    calls reuse the same instance.
	    """
	    global _reader
	    if _reader is not None:
	        return _reader
	    _reader = easyocr.Reader(["en"], gpu=False)
	    return _reader


	def image_bytes_to_array(image_bytes: bytes) -> np.ndarray:
	    """Decode encoded image bytes into a NumPy array of RGB pixels.

	    Args:
	        image_bytes: Contents of an image file in a format Pillow can open.

	    Returns:
	        The image converted to RGB mode and wrapped in ``np.array``.
	    """
	    buffer = io.BytesIO(image_bytes)
	    rgb_image = Image.open(buffer).convert("RGB")
	    pixels = np.array(rgb_image)
	    return pixels


	def ocr_image_array(img_array: np.ndarray) -> str:
	    """Run EasyOCR over an image array and return the recognised text.

	    Args:
	        img_array: Image pixels as a NumPy array.

	    Returns:
	        The entries returned by ``readtext`` joined with newlines.
	    """
	    # detail=0 / paragraph=True are passed straight to EasyOCR; per its docs
	    # they request text-only output grouped into paragraphs.
	    paragraphs = get_reader().readtext(img_array, detail=0, paragraph=True)
	    text = "\n".join(paragraphs)
	    return text


	def extract_text_from_image(file_bytes: bytes) -> Tuple[str, int]:
	    """OCR a single encoded image.

	    Args:
	        file_bytes: Contents of the image file.

	    Returns:
	        A ``(text, page_count)`` tuple; the page count is always 1.
	    """
	    pixels = image_bytes_to_array(file_bytes)
	    return ocr_image_array(pixels), 1


	def extract_text_from_pdf(file_bytes: bytes) -> Tuple[str, int]:
	    """Extract text from every page of a PDF, with OCR as a per-page fallback.

	    Each page's embedded text is used when it is longer than 50 characters
	    after stripping; otherwise the page is rendered at 200 DPI and passed
	    through ``ocr_image_array``.

	    Args:
	        file_bytes: Raw bytes of the PDF document.

	    Returns:
	        A ``(text, page_count)`` tuple. ``text`` is the per-page texts joined
	        by a ``--- PAGE BREAK ---`` separator, and ``page_count`` is the
	        number of pages in the document.
	    """
	    page_texts = []

	    # Open the document as a context manager so it is closed even when
	    # text extraction, rendering or OCR raises part-way through.
	    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
	        pages_processed = len(doc)

	        for page in doc:
	            # Try native text first
	            native_text = page.get_text("text").strip()

	            if len(native_text) > 50:
	                page_texts.append(native_text)
	                continue

	            # Too little embedded text: fall back to EasyOCR on the
	            # rasterised page.
	            pix = page.get_pixmap(dpi=200)
	            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	            page_texts.append(ocr_image_array(np.array(img)))

	    return "\n\n--- PAGE BREAK ---\n\n".join(page_texts), pages_processed


	def extract_text_from_docx(file_bytes: bytes) -> Tuple[str, int]:
	    """Extract the text of a DOCX document held in memory.

	    Args:
	        file_bytes: Contents of the ``.docx`` file.

	    Returns:
	        A ``(text, page_count)`` tuple. ``text`` is the result of
	        ``docx2txt.process``, or an empty string when that result is falsy;
	        the page count is always 1.
	    """
	    stream = io.BytesIO(file_bytes)
	    extracted = docx2txt.process(stream)
	    if not extracted:
	        extracted = ""
	    return extracted, 1


	def extract_text_from_file(file_bytes: bytes, ext: str) -> Tuple[str, int]:
	    """Dispatch to the extractor that matches a file extension.

	    The extension is matched case-insensitively, and surrounding whitespace
	    and leading dots are ignored, so ``"pdf"``, ``"PDF"`` and ``".pdf"`` all
	    select the PDF extractor.

	    Args:
	        file_bytes: Raw contents of the uploaded file.
	        ext: File extension: ``pdf``, ``docx``, ``png``, ``jpg``, ``jpeg``
	            or ``webp``.

	    Returns:
	        The ``(text, page_count)`` tuple produced by the selected extractor.

	    Raises:
	        ValueError: If ``ext`` is not one of the supported extensions.
	    """
	    # Non-string values can never match; keep raising ValueError for them
	    # rather than failing on the string methods below.
	    if not isinstance(ext, str):
	        raise ValueError(f"Unsupported extension: {ext}")

	    normalized = ext.strip().lower().lstrip(".")

	    if normalized == "pdf":
	        return extract_text_from_pdf(file_bytes)
	    if normalized == "docx":
	        return extract_text_from_docx(file_bytes)
	    if normalized in {"png", "jpg", "jpeg", "webp"}:
	        return extract_text_from_image(file_bytes)

	    # Report the extension exactly as the caller supplied it.
	    raise ValueError(f"Unsupported extension: {ext}")