import os
import subprocess
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from PyPDF2 import PdfWriter, PdfReader
from docx import Document
import gradio as gr
import io
import shutil

# Filesystem locations of the external OCR tools.
# POPPLER_PATH is a directory: it is handed to pdf2image as `poppler_path`,
# which looks for the Poppler executables inside it.
POPPLER_PATH = "/usr/bin"
# TESSERACT_PATH is the Tesseract executable itself: it is assigned to
# pytesseract's `tesseract_cmd` setting.
TESSERACT_PATH = "/usr/bin/tesseract"


def install_dependencies():
    """Make sure Poppler and Tesseract are available, installing them if needed.

    Each tool is probed on ``PATH`` with ``shutil.which``; a missing tool is
    installed through ``apt-get``. The package index is refreshed once, right
    before the first install, no matter which tool turned out to be missing.
    Finally pytesseract is pointed at the Tesseract executable that was
    actually found, falling back to ``TESSERACT_PATH`` when none is on
    ``PATH``.

    Raises:
        RuntimeError: If an ``apt-get`` command cannot be started or exits
            with a non-zero status. The original error is chained as the
            cause.
    """
    # (display name, executable used as the presence probe, apt package)
    required_tools = (
        ("Poppler", "pdfinfo", "poppler-utils"),
        ("Tesseract", "tesseract", "tesseract-ocr"),
    )

    index_refreshed = False
    for label, probe, package in required_tools:
        if shutil.which(probe):
            print(f"{label} is already installed.")
            continue

        print(f"{label} not found. Installing...")
        try:
            if not index_refreshed:
                # Refresh before the first install only; a stale index can
                # make `apt-get install` fail to locate the package.
                subprocess.run(["apt-get", "update"], check=True)
                index_refreshed = True
            subprocess.run(["apt-get", "install", "-y", package], check=True)
        except (subprocess.SubprocessError, OSError) as e:
            # OSError covers a missing or non-executable apt-get binary.
            raise RuntimeError(f"Error installing {label}: {e}") from e
        print(f"{label} installed successfully.")

    # Prefer the executable actually present on PATH so that installs living
    # outside the default location keep working.
    pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or TESSERACT_PATH


def convert_pdf_to_text(input_pdf):
    """Convert a scanned PDF into a searchable PDF and a Word document via OCR.

    Every page is rasterised with pdf2image and then passed through Tesseract
    twice: once for the plain text that goes into the Word document and once
    for a single-page searchable PDF (page image plus invisible text layer).
    The per-page PDFs are merged into one output document.

    Args:
        input_pdf: Path to the PDF (``str`` or ``os.PathLike``), or an object
            exposing the path as its ``name`` attribute (such as an uploaded
            file wrapper).

    Returns:
        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects, both
        rewound to position 0.

    Raises:
        ValueError: If ``input_pdf`` is ``None``.
        RuntimeError: If the PDF cannot be rasterised, or if installing the
            external tools fails.
    """
    if input_pdf is None:
        raise ValueError("No PDF file was provided.")

    install_dependencies()  # Ensure dependencies are installed

    # Accept both a plain path and a file wrapper carrying the path in `.name`.
    if isinstance(input_pdf, (str, os.PathLike)):
        input_pdf_path = input_pdf
    else:
        input_pdf_path = input_pdf.name

    # Convert PDF to images
    try:
        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
    except Exception as e:
        raise RuntimeError(f"Error during PDF to image conversion: {e}") from e

    # Control characters other than tab/newline/carriage return are not valid
    # in XML, and Tesseract terminates each page with a form feed, so they
    # are dropped before the text is handed to python-docx.
    xml_invalid = dict.fromkeys(c for c in range(32) if c not in (9, 10, 13))

    text_data = []
    pdf_writer = PdfWriter()
    for image in images:
        text_data.append(pytesseract.image_to_string(image).translate(xml_invalid))

        # Second OCR pass: Tesseract renders the page as a PDF whose text is
        # selectable and searchable, unlike the scanned original.
        page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
        # The BytesIO stays referenced by the reader, so page data is still
        # readable when the writer serialises the document below.
        for page in PdfReader(io.BytesIO(page_pdf)).pages:
            pdf_writer.add_page(page)

    # Combine text
    full_text = "\n".join(text_data)

    # Generate text-based PDF in memory
    pdf_writer.add_metadata({
        "/Title": "OCR Converted PDF",
        "/Author": "OCR Application"
    })
    pdf_buffer = io.BytesIO()
    pdf_writer.write(pdf_buffer)

    # Generate Word document in memory
    docx_buffer = io.BytesIO()
    doc = Document()
    doc.add_heading("OCR Converted Text", level=1)
    doc.add_paragraph(full_text)
    doc.save(docx_buffer)

    # Rewind buffers so callers can read them from the start
    pdf_buffer.seek(0)
    docx_buffer.seek(0)

    return pdf_buffer, docx_buffer


def gradio_interface(file):
    """Gradio callback: OCR the uploaded PDF and return downloadable files.

    ``convert_pdf_to_text`` produces in-memory buffers, but Gradio's ``File``
    output component expects paths to files on disk. The buffers are
    therefore written into a fresh temporary directory and the resulting
    paths are returned.

    Args:
        file: The upload value supplied by the ``gr.File`` input component.

    Returns:
        A ``(pdf_path, docx_path)`` tuple of filesystem paths.
    """
    # Local import keeps this block self-contained; tempfile is only used here.
    import tempfile

    pdf_output, docx_output = convert_pdf_to_text(file)

    # A dedicated directory per request keeps concurrent conversions from
    # overwriting each other while still giving the downloads stable names.
    # NOTE: the directory is not removed here; cleanup is left to the OS.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    pdf_path = os.path.join(output_dir, "ocr_output.pdf")
    docx_path = os.path.join(output_dir, "ocr_output.docx")

    for path, buffer in ((pdf_path, pdf_output), (docx_path, docx_output)):
        with open(path, "wb") as out_file:
            out_file.write(buffer.getvalue())

    return pdf_path, docx_path


# Web UI definition: one PDF upload in, two downloadable files out, wired to
# the gradio_interface callback.
iface = gr.Interface(
    title="OCR PDF Converter",
    description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),
    outputs=[
        gr.File(label="Download OCR-Processed PDF"),
        gr.File(label="Download OCR-Processed Word Document"),
    ],
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()