clinical-analysis-api / app /text_extractor.py
MakPr016
Initial phase
e158d2f
"""
Text extraction from PDFs and images using EasyOCR
Smart extraction: tries text layer first, falls back to OCR
"""
import fitz # PyMuPDF
import easyocr
from PIL import Image
from pdf2image import convert_from_bytes
import io
import numpy as np
from typing import Tuple, Optional
# Build one shared EasyOCR reader at import time so every extraction call
# reuses the same instance. `reader` is left as None when construction fails;
# the OCR helpers in this module check for that before using it.
print("Initializing EasyOCR Reader...")
try:
    # English only, CPU inference, library's own progress output silenced.
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("✓ EasyOCR Reader initialized successfully")
except Exception as e:
    # Keep the module importable without OCR; callers see reader is None.
    print(f"✗ EasyOCR initialization failed: {e}")
    reader = None
def extract_text_from_pdf(
    pdf_bytes: bytes, min_text_chars: int = 50
) -> Tuple[Optional[str], bool]:
    """
    Extract text from PDF with smart OCR fallback

    The embedded text layer is read first because it is fast. OCR is only
    attempted when the stripped text layer is `min_text_chars` characters
    long or shorter.

    Args:
        pdf_bytes: Raw PDF file content.
        min_text_chars: The stripped text layer must be longer than this to
            be accepted without OCR.

    Returns:
        (extracted_text, ocr_used)
        extracted_text is None when the input is empty or extraction fails.
        ocr_used is True only when the OCR helper returned; its text may
        still be None if OCR failed internally.
    """
    if not pdf_bytes:
        return None, False
    try:
        # Try extracting text layer first (fast). The context manager closes
        # the document even if reading a page raises.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "".join(page.get_text() for page in doc)
        # Check if meaningful text was extracted
        text_layer = full_text.strip()
        if len(text_layer) > min_text_chars:
            print(f"✓ Extracted {len(full_text)} chars from text layer")
            return text_layer, False
        # No text layer - use OCR
        print("⚠ No text layer detected, using EasyOCR...")
        text = extract_text_from_pdf_via_ocr(pdf_bytes)
        return text, True
    except Exception as e:
        # Also reached when the OCR helper raises because EasyOCR is not
        # initialized; the caller then gets (None, False).
        print(f"✗ Error in PDF text extraction: {e}")
        return None, False
def extract_text_from_pdf_via_ocr(pdf_bytes: bytes, dpi: int = 300) -> Optional[str]:
    """
    Extract text using EasyOCR on PDF pages converted to images

    Args:
        pdf_bytes: Raw PDF file content.
        dpi: Resolution passed to pdf2image when rasterizing the pages.

    Returns:
        The recognized text of all pages, pages separated by a blank line and
        the whole result stripped, or None when conversion or OCR fails.

    Raises:
        RuntimeError: If the module-level EasyOCR reader is not initialized.
    """
    if reader is None:
        raise RuntimeError("EasyOCR not initialized")
    try:
        # Convert PDF to images
        images = convert_from_bytes(pdf_bytes, dpi=dpi)
        page_chunks = []
        for i, image in enumerate(images):
            print(f" OCR processing page {i+1}/{len(images)}...")
            # Convert PIL to numpy array
            img_array = np.array(image)
            # Run EasyOCR; detail=0 yields plain strings, one per paragraph
            results = reader.readtext(img_array, detail=0, paragraph=True)
            page_chunks.append(' '.join(results) + "\n\n")
        # Join once instead of growing a string inside the loop
        full_text = "".join(page_chunks)
        print(f"✓ EasyOCR extracted {len(full_text)} chars from {len(images)} pages")
        return full_text.strip()
    except Exception as e:
        print(f"✗ OCR failed: {e}")
        return None
def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
    """
    Extract text from image file using EasyOCR

    Args:
        image_bytes: Raw content of an image file that Pillow can open.

    Returns:
        The recognized text joined with single spaces and stripped, or None
        when the image cannot be decoded or OCR fails.

    Raises:
        RuntimeError: If the module-level EasyOCR reader is not initialized.
    """
    if reader is None:
        raise RuntimeError("EasyOCR not initialized")
    try:
        print("Processing image with EasyOCR...")
        # Open and prepare image; the context manager releases the decoder
        # once the pixel data has been copied into the array.
        with Image.open(io.BytesIO(image_bytes)) as image:
            rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
            # Convert to numpy
            img_array = np.array(rgb_image)
        # Run EasyOCR; detail=0 yields plain strings, one per paragraph
        results = reader.readtext(img_array, detail=0, paragraph=True)
        text = ' '.join(results)
        print(f"✓ EasyOCR extracted {len(text)} chars from image")
        return text.strip()
    except Exception as e:
        print(f"✗ Image OCR failed: {e}")
        return None
def get_ocr_confidence(image_array: np.ndarray) -> list:
    """
    Get detailed OCR results with confidence scores

    Args:
        image_array: Image to run EasyOCR on, as a NumPy array.

    Returns:
        One dict per detected text region with keys "text", "confidence"
        (rounded to 3 decimals) and "bbox". An empty list is returned when
        the EasyOCR reader is unavailable or OCR fails.
    """
    if reader is None:
        return []
    try:
        results = reader.readtext(image_array, detail=1)
        return [
            {
                "text": text,
                # EasyOCR may hand back NumPy scalars; convert to builtin
                # numbers so the result can be JSON-serialized.
                "confidence": round(float(conf), 3),
                "bbox": np.asarray(bbox).tolist(),
            }
            for bbox, text, conf in results
        ]
    except Exception as e:
        # Best-effort helper: report the failure instead of hiding it, and
        # let KeyboardInterrupt/SystemExit propagate.
        print(f"✗ OCR confidence extraction failed: {e}")
        return []