Spaces:
Sleeping
Sleeping
| """ | |
| Text extraction from PDFs and images using EasyOCR | |
| Smart extraction: tries text layer first, falls back to OCR | |
| """ | |
| import fitz # PyMuPDF | |
| import easyocr | |
| from PIL import Image | |
| from pdf2image import convert_from_bytes | |
| import io | |
| import numpy as np | |
| from typing import Tuple, Optional | |
# Build the single, module-wide EasyOCR reader at import time so every
# extraction helper below shares it. English only, CPU only, quiet mode.
print("Initializing EasyOCR Reader...")
try:
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    print("β EasyOCR Reader initialized successfully")
except Exception as init_error:
    # Leave `reader` as None so the helpers can detect that OCR is unavailable
    # instead of failing at import time.
    print(f"β EasyOCR initialization failed: {init_error}")
    reader = None
def extract_text_from_pdf(
    pdf_bytes: bytes, *, min_text_chars: int = 50
) -> Tuple[Optional[str], bool]:
    """
    Extract text from a PDF, falling back to OCR when no usable text layer exists.

    The embedded text layer is read first via PyMuPDF. If the stripped text is
    not longer than ``min_text_chars`` characters, the pages are processed with
    ``extract_text_from_pdf_via_ocr`` instead.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        min_text_chars: The text layer is accepted only if its stripped length
            is strictly greater than this value (default 50).

    Returns:
        (extracted_text, ocr_used)
        - ``(None, False)`` when ``pdf_bytes`` is empty/None or any exception
          is raised during extraction (including a RuntimeError raised by the
          OCR helper).
        - ``(text, False)`` when the text layer was used.
        - ``(text_or_None, True)`` when the OCR path was taken; the text is
          whatever the OCR helper returned, which may be None.
    """
    if not pdf_bytes:
        return None, False
    try:
        # Try extracting text layer first (fast)
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "".join(page.get_text() for page in doc)
        finally:
            # Always release the document, even if a page fails to parse.
            doc.close()
        # Check if meaningful text was extracted
        stripped_text = full_text.strip()
        if len(stripped_text) > min_text_chars:
            print(f"β Extracted {len(full_text)} chars from text layer")
            return stripped_text, False
        # No text layer - use OCR
        print("β No text layer detected, using EasyOCR...")
        text = extract_text_from_pdf_via_ocr(pdf_bytes)
        return text, True
    except Exception as e:
        print(f"β Error in PDF text extraction: {e}")
        return None, False
def extract_text_from_pdf_via_ocr(pdf_bytes: bytes) -> Optional[str]:
    """
    Rasterize every PDF page and read its text with the shared EasyOCR reader.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all pages, each page followed by a blank line, with
        leading/trailing whitespace stripped; None if conversion or OCR
        raises an exception.

    Raises:
        RuntimeError: If the module-level ``reader`` is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")
    try:
        # Render the PDF pages as PIL images at 300 dpi
        pages = convert_from_bytes(pdf_bytes, dpi=300)
        page_count = len(pages)
        chunks = []
        for page_number, page_image in enumerate(pages, start=1):
            print(f" OCR processing page {page_number}/{page_count}...")
            # EasyOCR takes a numpy array; detail=0 yields plain strings
            fragments = reader.readtext(
                np.array(page_image), detail=0, paragraph=True
            )
            chunks.append(' '.join(fragments) + "\n\n")
        full_text = "".join(chunks)
        print(f"β EasyOCR extracted {len(full_text)} chars from {page_count} pages")
        return full_text.strip()
    except Exception as ocr_error:
        print(f"β OCR failed: {ocr_error}")
        return None
def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
    """
    Read the text in an encoded image using the shared EasyOCR reader.

    Args:
        image_bytes: Raw bytes of an image file that PIL can open.

    Returns:
        The recognized text fragments joined by single spaces and stripped,
        or None if decoding or OCR raises an exception.

    Raises:
        RuntimeError: If the module-level ``reader`` is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")
    try:
        print("Processing image with EasyOCR...")
        # Decode the bytes and make sure the image is in RGB mode
        picture = Image.open(io.BytesIO(image_bytes))
        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
        # Hand EasyOCR a numpy array; detail=0 yields plain strings
        fragments = reader.readtext(
            np.array(rgb_picture), detail=0, paragraph=True
        )
        text = ' '.join(fragments)
        print(f"β EasyOCR extracted {len(text)} chars from image")
        return text.strip()
    except Exception as ocr_error:
        print(f"β Image OCR failed: {ocr_error}")
        return None
def get_ocr_confidence(image_array: np.ndarray) -> list:
    """
    Get detailed OCR results with confidence scores.

    Args:
        image_array: Image as a numpy array, passed straight to
            ``reader.readtext`` with ``detail=1``.

    Returns:
        A list of dicts with keys ``"text"``, ``"confidence"`` (rounded to
        three decimals) and ``"bbox"``, one per detected region. An empty list
        is returned when the module-level ``reader`` is unavailable or when
        OCR raises an exception.
    """
    if not reader:
        return []
    try:
        results = reader.readtext(image_array, detail=1)
        return [
            {
                "text": text,
                "confidence": round(conf, 3),
                "bbox": bbox,
            }
            for bbox, text, conf in results
        ]
    except Exception as e:
        # Best-effort helper: report the failure instead of hiding it, and do
        # not intercept KeyboardInterrupt/SystemExit as a bare `except:` would.
        print(f"OCR confidence extraction failed: {e}")
        return []