# models/ocr.py

import fitz  # PyMuPDF
import easyocr

# Build the EasyOCR reader a single time, at import, so that every call to
# extract_text() reuses this instance instead of constructing a new Reader
# per file. It is created with the ['en'] language list and gpu=False.
reader = easyocr.Reader(['en'], gpu=False)

def _extract_pdf_text(file_path):
    """Return the text of the PDF at *file_path*, running OCR only if needed.

    The embedded (selectable) text of all pages is concatenated first. If
    that is blank after stripping whitespace, each page is rendered to a PNG
    and passed to the shared ``reader``; the recognised fragments are joined
    with single spaces.
    """
    # A single handle serves both passes, and the ``with`` block closes it
    # even when text extraction, rendering or OCR raises.
    with fitz.open(file_path) as doc:
        embedded_text = "".join(page.get_text() for page in doc)

        # The PDF already has selectable text: skip the OCR pass.
        if embedded_text.strip():
            return embedded_text

        # No embedded text (e.g. a scanned PDF): OCR a rendering of each page.
        ocr_fragments = []
        for page in doc:
            png_bytes = page.get_pixmap().tobytes("png")
            ocr_fragments.extend(reader.readtext(png_bytes, detail=0))

    return " ".join(ocr_fragments)


def extract_text(file_path) -> str:
    """Extract text from a PDF, a TXT file, or an image.

    The handler is chosen from the (case-insensitive) file extension:

    * ``.pdf`` - return the embedded text of all pages; if there is none,
      fall back to OCR on a rendering of every page.
    * ``.txt`` - return the file contents decoded as UTF-8.
    * anything else - the path is handed to the OCR reader as an image.

    Args:
        file_path: Path of the file to read. It is converted with ``str()``,
            so ``pathlib.Path`` objects are accepted as well as strings.

    Returns:
        The extracted text. OCR results are the recognised fragments joined
        with single spaces (an empty string when nothing is recognised).

    Raises:
        OSError: If a ``.txt`` file cannot be opened.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
        Errors raised by PyMuPDF or EasyOCR for unreadable input propagate
        unchanged.
    """
    file_path = str(file_path)
    # Lower-case once so the extension checks are case-insensitive.
    lowered = file_path.lower()

    # ------------------------------
    # PDF Handling
    # ------------------------------
    if lowered.endswith(".pdf"):
        return _extract_pdf_text(file_path)

    # ------------------------------
    # TXT Handling
    # ------------------------------
    if lowered.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    # ------------------------------
    # Image Handling
    # ------------------------------
    # Every other extension is treated as an image and sent straight to OCR.
    return " ".join(reader.readtext(file_path, detail=0))