import gradio as gr
from PyPDF2 import PdfReader
from pdf2image import convert_from_bytes
import pytesseract
from PIL import Image

def extract_text_from_pdf(pdf_path):
    """Return the embedded text of a PDF, read with ``PdfReader``.

    The text of every page is collected (a page whose ``extract_text()``
    result is falsy contributes an empty string), the pages are joined
    with newlines, and surrounding whitespace is stripped.

    Args:
        pdf_path: Value handed straight to ``PdfReader``.

    Returns:
        The stripped, newline-joined page text. If anything raises while
        reading, a string of the form ``"[Error reading normal PDF] ..."``
        is returned instead of propagating the exception.
    """
    try:
        page_texts = []
        for pdf_page in PdfReader(pdf_path).pages:
            content = pdf_page.extract_text()
            page_texts.append(content if content else "")
        joined = "\n".join(page_texts)
        return joined.strip()
    except Exception as err:
        # Errors are reported as text; callers receive a str either way.
        return f"[Error reading normal PDF] {err}"

def extract_text_from_scanned(pdf_path):
    """Return OCR text for a PDF by rendering its pages to images.

    The file is read as bytes, converted to page images with
    ``convert_from_bytes``, and each image is passed through
    ``pytesseract.image_to_string``. The per-page results are
    concatenated in order (no separator is inserted) and stripped.

    Args:
        pdf_path: Filesystem path of the PDF to open in binary mode.

    Returns:
        The stripped OCR text. If anything raises (missing file,
        conversion or OCR failure, ...), a string of the form
        ``"[Error reading scanned PDF] ..."`` is returned instead of
        propagating the exception.
    """
    try:
        # Use a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        page_images = convert_from_bytes(pdf_bytes)
        ocr_text = "".join(
            pytesseract.image_to_string(image) for image in page_images
        )
        return ocr_text.strip()
    except Exception as e:
        # Errors are reported as text; callers receive a str either way.
        return f"[Error reading scanned PDF] {e}"

def process(pdf_file):
    """Extract text from an uploaded PDF, trying OCR if no text is found.

    Args:
        pdf_file: The uploaded file value; a falsy value means nothing
            was uploaded.

    Returns:
        ``"Please upload a PDF."`` when ``pdf_file`` is falsy; otherwise
        the result of ``extract_text_from_pdf`` if it contains any
        non-whitespace characters, else the result of
        ``extract_text_from_scanned``, else a "could not extract" message
        when that is empty too.
    """
    if pdf_file:
        direct_text = extract_text_from_pdf(pdf_file)
        if direct_text.strip():
            return direct_text

        # Nothing usable came back from direct extraction: fall back to OCR.
        ocr_text = extract_text_from_scanned(pdf_file)
        return ocr_text if ocr_text else "❌ Could not extract text."

    return "Please upload a PDF."

# Build the UI: a file picker and a button side by side, with an editable
# text box underneath that receives the extracted text.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Law PDF Formatter")
    with gr.Row():
        file = gr.File(file_types=[".pdf"], label="Upload PDF", type="filepath")
        btn = gr.Button("Extract Text")
    out = gr.Textbox(label="Extracted Text", lines=20, interactive=True)
    # Clicking the button runs process() on the uploaded file and writes
    # the returned string into the text box.
    btn.click(fn=process, inputs=file, outputs=out)

# Blocks.launch() does not accept `flagging_callback` (that is a
# gr.Interface constructor option); passing it raises TypeError before the
# server starts, so it is omitted here.
demo.launch(share=True)