Spaces:

MakPr016
/

clinical-analysis-api

Sleeping

clinical-analysis-api / app /text_extractor.py

MakPr016

Initial phase

e158d2f 3 months ago

3.87 kB

	"""
	Text extraction from PDFs and images using EasyOCR
	Smart extraction: tries text layer first, falls back to OCR
	"""

	import fitz # PyMuPDF
	import easyocr
	from PIL import Image
	from pdf2image import convert_from_bytes
	import io
	import numpy as np
	from typing import Tuple, Optional

	# Build one shared EasyOCR reader at import time (English only, CPU only).
	# If construction fails for any reason the module still imports, with
	# `reader` left as None so the functions below can detect that OCR is
	# unavailable.
	print("Initializing EasyOCR Reader...")
	try:
	    reader = easyocr.Reader(
	        ["en"],
	        gpu=False,
	        verbose=False,
	    )
	    print("✓ EasyOCR Reader initialized successfully")
	except Exception as exc:
	    print(f"✗ EasyOCR initialization failed: {exc}")
	    reader = None

	def extract_text_from_pdf(pdf_bytes: bytes) -> Tuple[Optional[str], bool]:
	    """
	    Extract text from a PDF, preferring the embedded text layer over OCR.

	    The text layer is read first because it is fast. OCR is attempted only
	    when the stripped text layer is 50 characters or shorter.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.

	    Returns:
	        (extracted_text, ocr_used)
	        - (text, False) when the text layer held enough text.
	        - (text_or_None, True) when the OCR fallback ran; the text is None
	          if the OCR helper reported a failure.
	        - (None, False) for empty input or when any exception is raised
	          while processing, including the OCR helper raising because the
	          reader is not initialized.
	    """
	    if not pdf_bytes:
	        return None, False

	    try:
	        # Try extracting text layer first (fast). The context manager closes
	        # the document even if reading a page raises.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            full_text = "".join(page.get_text() for page in doc)

	        # Check if meaningful text was extracted
	        stripped_text = full_text.strip()
	        if len(stripped_text) > 50:
	            print(f"✓ Extracted {len(full_text)} chars from text layer")
	            return stripped_text, False

	        # No (usable) text layer - use OCR
	        print("⚠ No text layer detected, using EasyOCR...")
	        text = extract_text_from_pdf_via_ocr(pdf_bytes)
	        return text, True

	    except Exception as e:
	        # Best-effort API: report the problem and signal failure with None
	        print(f"✗ Error in PDF text extraction: {e}")
	        return None, False

	def extract_text_from_pdf_via_ocr(pdf_bytes: bytes, dpi: int = 300) -> Optional[str]:
	    """
	    Extract text using EasyOCR on PDF pages converted to images

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.
	        dpi: Resolution passed to the PDF-to-image conversion. Defaults to
	            300, the value this function previously hard-coded.

	    Returns:
	        The text of all pages, separated by blank lines and stripped of
	        surrounding whitespace, or None if conversion or OCR raised.

	    Raises:
	        RuntimeError: If the module-level EasyOCR reader is not available.
	    """
	    if not reader:
	        raise RuntimeError("EasyOCR not initialized")

	    try:
	        # Convert PDF to images (one image per page)
	        images = convert_from_bytes(pdf_bytes, dpi=dpi)
	        page_count = len(images)
	        page_texts = []

	        for page_number, image in enumerate(images, start=1):
	            print(f" OCR processing page {page_number}/{page_count}...")

	            # EasyOCR is given a numpy array rather than the PIL image
	            img_array = np.array(image)

	            # detail=0 makes readtext return plain strings, joined per page
	            results = reader.readtext(img_array, detail=0, paragraph=True)
	            page_texts.append(' '.join(results))

	        # Every page is followed by a blank line; join once instead of
	        # growing a string inside the loop.
	        full_text = "".join(f"{page_text}\n\n" for page_text in page_texts)

	        print(f"✓ EasyOCR extracted {len(full_text)} chars from {page_count} pages")
	        return full_text.strip()

	    except Exception as e:
	        # Best-effort API: report the problem and signal failure with None
	        print(f"✗ OCR failed: {e}")
	        return None

	def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
	    """
	    Run EasyOCR over an encoded image and return the recognised text.

	    Args:
	        image_bytes: Raw bytes of an image file.

	    Returns:
	        The recognised text fragments joined by single spaces and stripped,
	        or None if decoding the image or running OCR raised.

	    Raises:
	        RuntimeError: If the module-level EasyOCR reader is not available.
	    """
	    if not reader:
	        raise RuntimeError("EasyOCR not initialized")

	    try:
	        print("Processing image with EasyOCR...")

	        # Decode the bytes and normalise to RGB before handing to OCR
	        picture = Image.open(io.BytesIO(image_bytes))
	        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')

	        # detail=0 makes readtext return plain strings
	        fragments = reader.readtext(np.array(rgb_picture), detail=0, paragraph=True)
	        text = ' '.join(fragments)

	        print(f"✓ EasyOCR extracted {len(text)} chars from image")
	        return text.strip()

	    except Exception as e:
	        print(f"✗ Image OCR failed: {e}")
	        return None

	def get_ocr_confidence(image_array: np.ndarray) -> list:
	    """
	    Get detailed OCR results with confidence scores

	    Args:
	        image_array: Image data passed straight to the EasyOCR reader.

	    Returns:
	        A list of dicts, one per detection, with the keys "text",
	        "confidence" (rounded to 3 decimal places) and "bbox". Returns an
	        empty list when the reader is not initialized or when OCR raises.
	    """
	    if not reader:
	        return []

	    try:
	        # detail=1 yields (bbox, text, confidence) for each detection
	        results = reader.readtext(image_array, detail=1)
	        return [
	            {
	                "text": text,
	                "confidence": round(conf, 3),
	                "bbox": bbox,
	            }
	            for bbox, text, conf in results
	        ]
	    except Exception as e:
	        # Catch Exception rather than using a bare except so that
	        # KeyboardInterrupt/SystemExit still propagate, and report the
	        # failure instead of hiding it.
	        print(f"✗ OCR confidence extraction failed: {e}")
	        return []