project / models /ocr.py
venkataashok's picture
Upload 3 files
8f04bd0 verified
# models/ocr.py
import fitz # PyMuPDF
import easyocr
# Build the EasyOCR reader once, at import time, so that every call to
# extract_text() reuses this single instance instead of constructing a new
# reader per call. It is configured for English only, with GPU use disabled.
reader = easyocr.Reader(['en'], gpu=False)
def _extract_pdf_text(file_path):
    """Return the text of a PDF, running OCR only if it has no text layer."""
    # Open the document a single time; the context manager closes it even
    # when text extraction or OCR raises part-way through.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)

        # The PDF already has selectable text: no OCR needed.
        if text.strip():
            return text

        # Scanned PDF: render every page to PNG bytes and OCR the image.
        ocr_lines = []
        for page in doc:
            png_bytes = page.get_pixmap().tobytes("png")
            ocr_lines.extend(reader.readtext(png_bytes, detail=0))

    return " ".join(ocr_lines)


def extract_text(file_path):
    """
    Extract text from a PDF, TXT, or image file.

    The file type is chosen from the (case-insensitive) file extension:

    * ``.pdf`` - the embedded text of all pages is returned as-is; if the
      PDF contains no non-whitespace text, each page is rendered and OCR'd
      and the recognised pieces are joined with single spaces.
    * ``.txt`` - the file is read as UTF-8; a leading byte-order mark, if
      present, is dropped instead of being returned as ``\\ufeff``.
    * anything else - the path is handed to the OCR reader as an image and
      the recognised pieces are joined with single spaces.

    Args:
        file_path: Path to the input file (``str`` or any object whose
            ``str()`` is the path, e.g. ``pathlib.Path``).

    Returns:
        The extracted text as a single string (possibly empty).

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
        OSError: If a ``.txt`` file cannot be opened.
    """
    file_path = str(file_path)
    lowered = file_path.lower()

    # ------------------------------
    # PDF Handling
    # ------------------------------
    if lowered.endswith(".pdf"):
        return _extract_pdf_text(file_path)

    # ------------------------------
    # TXT Handling
    # ------------------------------
    if lowered.endswith(".txt"):
        # "utf-8-sig" decodes plain UTF-8 identically but strips a BOM.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read()

    # ------------------------------
    # Image Handling
    # ------------------------------
    result = reader.readtext(file_path, detail=0)
    return " ".join(result)