import os
import re
import shutil
import cv2
import pytesseract
import gradio as gr

from pdf2image import convert_from_path, pdfinfo_from_path
from tqdm import tqdm


# Scratch directory for the per-page PNG renders; process_pdf creates it
# and deletes it once OCR finishes.
TEMP_DIR = "temp_pages"
# Language code handed to Tesseract via the `lang` argument.
# NOTE(review): "ben" presumably selects the Bengali model for Assamese
# text -- confirm this choice is intentional.
TESS_LANG = "ben"


# Zero-width non-joiner (U+200C) and joiner (U+200D), removed from OCR output.
_ZERO_WIDTH_MARKS = str.maketrans("", "", "\u200c\u200d")
_WHITESPACE_RUN = re.compile(r"\s+")


def _clean_ocr_text(text):
    """Drop zero-width (non-)joiners and collapse whitespace runs to one space."""
    return _WHITESPACE_RUN.sub(" ", text.translate(_ZERO_WIDTH_MARKS)).strip()


def _binarize_page(img_path):
    """Load a page image and return its denoised, Otsu-thresholded version."""
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of deep inside cvtColor.
        raise RuntimeError(f"Could not read rendered page image: {img_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray)
    _, thresh = cv2.threshold(
        gray,
        0,
        255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU,
    )
    return thresh


def process_pdf(pdf_file):
    """OCR every page of a PDF and write the text to ``assamese_book.txt``.

    Each page is rendered at 300 DPI, saved as a PNG under ``TEMP_DIR``,
    denoised and binarized, and passed to Tesseract using ``TESS_LANG``.
    The text of each page is cleaned and the pages are joined with
    newlines, so the output file holds one line per page.

    Args:
        pdf_file: An upload object whose ``.name`` attribute is the path of
            the PDF, or the path itself (``str`` / ``os.PathLike``).

    Returns:
        The path of the written text file (relative to the working
        directory).

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
        RuntimeError: If a rendered page image cannot be read back.
    """
    # Validate before touching the filesystem so a missing upload leaves
    # nothing behind.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Check for path types first: pathlib.Path also has a ``.name``
    # attribute, but it holds only the basename.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    output_txt = "assamese_book.txt"

    total_pages = pdfinfo_from_path(pdf_path)["Pages"]
    all_text = []

    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        # Render one page at a time to keep memory bounded on large books.
        for page_num in tqdm(
            range(1, total_pages + 1),
            desc="PDF -> OCR",
            unit="page",
        ):
            pages = convert_from_path(
                pdf_path,
                dpi=300,
                first_page=page_num,
                last_page=page_num,
                fmt="png",
            )

            img_path = os.path.join(TEMP_DIR, f"page_{page_num}.png")
            pages[0].save(img_path, "PNG")

            text = pytesseract.image_to_string(
                _binarize_page(img_path),
                lang=TESS_LANG,
                config="--oem 1 --psm 3",
            )
            all_text.append(_clean_ocr_text(text))

            # Remove the page image right away to keep disk usage low.
            os.remove(img_path)
    finally:
        # Always clean up, even when rendering or OCR fails midway;
        # ignore_errors keeps a cleanup problem from masking the real error.
        shutil.rmtree(TEMP_DIR, ignore_errors=True)

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(all_text))

    return output_txt


# Gradio front end: one PDF upload in, the extracted text file out.
pdf_upload = gr.File(
    label='Input PDF: "Israel - Hem Barua.pdf"',
    file_types=[".pdf"],
)
text_download = gr.File(label="Download Extracted Text")

demo = gr.Interface(
    fn=process_pdf,
    inputs=pdf_upload,
    outputs=text_download,
    title="Assamese PDF OCR",
    description="Upload scanned Assamese PDFs and extract text.",
)

# Start the web UI as soon as this file is executed.
demo.launch()