Spaces:
Sleeping
Sleeping
| # models/ocr.py | |
| import fitz # PyMuPDF | |
| import easyocr | |
| # Load the OCR model once, at import time (very important): extract_text() | |
| # reuses this single module-level reader for every OCR call instead of | |
| # constructing a new one. Configured for English ('en') with gpu=False. | |
| reader = easyocr.Reader(['en'], gpu=False) | |
def extract_text(file_path):
    """
    Extract text from a PDF, TXT, or image file.

    The handler is chosen from the case-insensitive file extension:

    * ``.pdf`` - the embedded text layer is returned as-is when it holds
      any non-whitespace text. Otherwise the PDF is treated as scanned:
      every page is rendered to PNG and passed through the OCR reader.
    * ``.txt`` - the file is decoded as UTF-8 and its content returned.
    * anything else - the path is handed to the OCR reader as an image.

    Args:
        file_path: Path of the file to read. It is converted with
            ``str()``, so path-like objects are accepted.

    Returns:
        The extracted text. Text-layer PDF pages are concatenated
        directly; OCR fragments are joined with single spaces.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    file_path = str(file_path)
    lowered = file_path.lower()

    # ------------------------------
    # PDF Handling
    # ------------------------------
    if lowered.endswith(".pdf"):
        # Open the document once; the context manager closes it even when
        # text extraction or OCR raises part-way through.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)

            # If the PDF already has selectable text, skip the slow OCR path.
            if text.strip():
                return text

            # Scanned PDF: render each page and OCR the PNG bytes.
            fragments = []
            for page in doc:
                png_bytes = page.get_pixmap().tobytes("png")
                fragments.extend(reader.readtext(png_bytes, detail=0))
        return " ".join(fragments)

    # ------------------------------
    # TXT Handling
    # ------------------------------
    if lowered.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    # ------------------------------
    # Image Handling
    # ------------------------------
    # Any other extension is assumed to be an image the OCR reader can load.
    return " ".join(reader.readtext(file_path, detail=0))