import os
import tempfile
from typing import Optional

import easyocr
import fitz  # PyMuPDF
import numpy as np
from pdf2image import convert_from_path

# Native (embedded) text must be longer than this, after stripping, to be
# trusted without running OCR.
_MIN_NATIVE_TEXT_CHARS = 100

# Languages the EasyOCR reader is initialised with (English and Hindi).
_OCR_LANGUAGES = ("en", "hi")

# OCR fragments are kept only when their confidence is strictly above this.
_OCR_MIN_CONFIDENCE = 0.5


def _extract_native_text(file_path: str) -> str:
    """Return the embedded text of every page, concatenated and stripped."""
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc).strip()


def _extract_ocr_text(file_path: str) -> str:
    """Render each page to an image and return the space-joined OCR text."""
    page_images = convert_from_path(file_path)
    reader = easyocr.Reader(list(_OCR_LANGUAGES))

    fragments = []
    for page_image in page_images:
        # pdf2image yields PIL images; EasyOCR's documented inputs are a
        # path/URL, raw bytes, or a numpy array, so convert explicitly.
        results = reader.readtext(np.array(page_image))
        fragments.extend(
            fragment
            for _bbox, fragment, confidence in results
            if confidence > _OCR_MIN_CONFIDENCE
        )
    return " ".join(fragments).strip()


def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF using a hybrid approach.

    1. Try PyMuPDF first, which works for searchable PDFs.
    2. If that yields 100 characters or fewer (or fails), fall back to OCR,
       which handles scanned PDFs.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The stripped extracted text. When OCR is attempted but produces no
        text, whatever short native text was found is returned instead
        (possibly the empty string).

    Raises:
        Exception: If the OCR fallback fails; the underlying error is
            attached as ``__cause__``.
    """
    # Step 1: native text layer. A failure here is reported but not fatal,
    # because OCR may still succeed.
    native_text = ""
    try:
        native_text = _extract_native_text(file_path)
    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")

    if len(native_text) > _MIN_NATIVE_TEXT_CHARS:
        return native_text

    # Step 2: OCR fallback for scanned PDFs.
    try:
        ocr_text = _extract_ocr_text(file_path)
    except Exception as e:
        print(f"OCR extraction failed: {e}")
        raise Exception(f"Failed to extract text from PDF: {e}") from e

    # Prefer the OCR result, but do not discard a short native text layer
    # when OCR found nothing.
    return ocr_text or native_text