# models/ocr.py
import fitz  # PyMuPDF
import easyocr

# Load the OCR model once at import time (very important): every call to
# extract_text() reuses this single Reader instead of building a new one.
reader = easyocr.Reader(['en'], gpu=False)


def extract_text(file_path):
    """Extract text from a PDF, TXT, or image file.

    The file type is chosen from the (case-insensitive) file extension:

    * ``.pdf`` -- the text layer of every page is concatenated and returned
      if it contains any non-whitespace character. Otherwise each page is
      rendered to a PNG and passed through OCR, and the recognised
      fragments are joined with single spaces.
    * ``.txt`` -- the file is read and returned as UTF-8 text.
    * anything else -- the path is handed to the OCR reader as an image and
      the recognised fragments are joined with single spaces.

    Args:
        file_path: Path to the input file. Converted with ``str()``, so
            ``pathlib.Path`` objects are accepted as well as strings.

    Returns:
        The extracted text as a single string (possibly empty).

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    file_path = str(file_path)
    lowered = file_path.lower()

    # ------------------------------
    # PDF Handling
    # ------------------------------
    if lowered.endswith(".pdf"):
        # Open the document once; the context manager closes it even when
        # text extraction or OCR raises part-way through.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)

            # If the PDF already has selectable text, return immediately.
            if text.strip():
                return text

            # Scanned PDF: fall back to OCR on a rendered image of each page.
            ocr_fragments = []
            for page in doc:
                png_bytes = page.get_pixmap().tobytes("png")
                ocr_fragments.extend(reader.readtext(png_bytes, detail=0))

        return " ".join(ocr_fragments)

    # ------------------------------
    # TXT Handling
    # ------------------------------
    if lowered.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    # ------------------------------
    # Image Handling
    # ------------------------------
    # Any other extension is treated as an image and sent straight to OCR.
    return " ".join(reader.readtext(file_path, detail=0))