"""Gradio app that extracts text from an uploaded PDF, with an OCR fallback."""

import gradio as gr
from PyPDF2 import PdfReader
from pdf2image import convert_from_bytes
import pytesseract
from PIL import Image


def _read_text_layer(pdf_path: str) -> str:
    """Return the embedded text of every page, newline-joined and stripped.

    Raises whatever ``PdfReader`` / ``extract_text`` raise; callers decide
    how to report the failure.
    """
    reader = PdfReader(pdf_path)
    # extract_text() may yield None/empty for a page; treat that as "".
    return "\n".join(page.extract_text() or "" for page in reader.pages).strip()


def _ocr_pages(pdf_path: str) -> str:
    """Render each page to an image, OCR it, and return the stripped text.

    Raises whatever ``open``, ``convert_from_bytes`` or ``pytesseract`` raise.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(pdf_path, "rb") as pdf_handle:
        pdf_bytes = pdf_handle.read()
    page_images = convert_from_bytes(pdf_bytes)
    return "".join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    ).strip()


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the embedded text layer of the PDF at *pdf_path*.

    Returns the stripped text, or a string starting with
    ``"[Error reading normal PDF]"`` if reading fails (never raises).
    """
    try:
        return _read_text_layer(pdf_path)
    except Exception as e:  # UI boundary: report the failure as text
        return f"[Error reading normal PDF] {e}"


def extract_text_from_scanned(pdf_path: str) -> str:
    """OCR the pages of the PDF at *pdf_path*.

    Returns the stripped OCR text, or a string starting with
    ``"[Error reading scanned PDF]"`` if OCR fails (never raises).
    """
    try:
        return _ocr_pages(pdf_path)
    except Exception as e:  # UI boundary: report the failure as text
        return f"[Error reading scanned PDF] {e}"


def process(pdf_file):
    """Return the text of the uploaded PDF for display in the UI.

    The embedded text layer is tried first. OCR is used when that yields no
    text *or fails*; previously a failure produced a non-empty error string,
    which suppressed the OCR fallback and was shown as the document text.

    Returns:
        The extracted text; otherwise the collected error message(s);
        otherwise a generic "could not extract" message. A falsy *pdf_file*
        returns a prompt to upload a file.
    """
    if not pdf_file:
        return "Please upload a PDF."

    errors = []

    try:
        text = _read_text_layer(pdf_file)
    except Exception as e:  # keep going: OCR may still succeed
        text = ""
        errors.append(f"[Error reading normal PDF] {e}")

    if not text:
        try:
            text = _ocr_pages(pdf_file)
        except Exception as e:
            text = ""
            errors.append(f"[Error reading scanned PDF] {e}")

    if text:
        return text
    if errors:
        return "\n".join(errors)
    return "❌ Could not extract text."


# NOTE(review): the nesting under gr.Row() is a best guess (upload + button
# side by side, output below) — confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Law PDF Formatter")
    with gr.Row():
        file = gr.File(file_types=[".pdf"], label="Upload PDF", type="filepath")
        btn = gr.Button("Extract Text")
    out = gr.Textbox(label="Extracted Text", lines=20, interactive=True)
    btn.click(fn=process, inputs=file, outputs=out)

# flagging_callback is a gr.Interface option, not a Blocks.launch() keyword;
# passing it to launch() raises TypeError, so it is omitted here.
demo.launch(share=True)