|
|
import fitz |
|
|
import easyocr |
|
|
from pdf2image import convert_from_path |
|
|
from typing import Optional |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF, falling back to OCR for scanned documents.

    The embedded text layer is read first with PyMuPDF. If that yields more
    than 100 characters (after stripping), it is returned as-is. Otherwise
    the pages are rendered to images and recognised with EasyOCR
    (English + Hindi), keeping only fragments with confidence above 0.5.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The extracted text with leading/trailing whitespace removed. May be
        an empty string if OCR runs but recognises nothing confidently.

    Raises:
        Exception: If the OCR fallback fails; the original error is chained
            as ``__cause__``.
    """
    try:
        native_text = _read_pdf_text_layer(file_path)
    except Exception as e:
        # Deliberate best-effort: any failure reading the text layer just
        # routes the file to the OCR fallback below.
        print(f"PyMuPDF extraction failed: {e}")
    else:
        if len(native_text) > 100:
            return native_text

    try:
        return _ocr_pdf_pages(file_path)
    except Exception as e:
        print(f"OCR extraction failed: {e}")
        raise Exception(f"Failed to extract text from PDF: {e}") from e


def _read_pdf_text_layer(file_path: str) -> str:
    """Return the stripped text PyMuPDF reads from the PDF's pages."""
    # The context manager closes the document even when a page fails to load.
    with fitz.open(file_path) as doc:
        page_texts = [
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        ]
    return "".join(page_texts).strip()


def _ocr_pdf_pages(file_path: str) -> str:
    """Render the PDF's pages to images and return the confidently OCR'd text."""
    # Imported locally because the file's import block does not provide numpy.
    import numpy as np

    images = convert_from_path(file_path)
    reader = easyocr.Reader(['en', 'hi'])

    fragments = []
    for image in images:
        # convert_from_path yields PIL images, while readtext documents
        # file paths, bytes and numpy arrays as its inputs -- so convert.
        results = reader.readtext(np.asarray(image))
        fragments.extend(
            fragment
            for _bbox, fragment, confidence in results
            if confidence > 0.5
        )
    return " ".join(fragments).strip()
|
|
|