| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| from pdf2image import convert_from_bytes | |
| import pytesseract | |
| from PIL import Image | |
def extract_text_from_pdf(pdf_path):
    """Return the embedded text layer of a PDF as one stripped string.

    Each page's text is collected with ``PdfReader`` and the pages are
    joined with a newline. A page whose ``extract_text()`` yields nothing
    contributes an empty string.

    Any exception is caught and reported in the return value as
    ``"[Error reading normal PDF] <exception>"`` instead of being raised.
    """
    try:
        page_texts = []
        for pdf_page in PdfReader(pdf_path).pages:
            content = pdf_page.extract_text()
            # Pages without a text layer yield a falsy value; normalise to "".
            page_texts.append(content if content else "")
        combined = "\n".join(page_texts)
    except Exception as err:
        return f"[Error reading normal PDF] {err}"
    return combined.strip()
def extract_text_from_scanned(pdf_path) -> str:
    """OCR a PDF and return the recognised text as one stripped string.

    The file at ``pdf_path`` is read as bytes, converted to page images with
    ``convert_from_bytes``, and each image is passed to
    ``pytesseract.image_to_string``. Per-page results are concatenated
    with no separator added between them.

    Any exception (missing file, conversion failure, OCR failure) is caught
    and reported in the return value as
    ``"[Error reading scanned PDF] <exception>"`` instead of being raised.
    """
    try:
        # Use a context manager so the file handle is closed even when
        # reading or the later conversion fails.
        with open(pdf_path, "rb") as pdf_handle:
            pdf_bytes = pdf_handle.read()
        page_images = convert_from_bytes(pdf_bytes)
        recognised = "".join(
            pytesseract.image_to_string(page_image) for page_image in page_images
        )
        return recognised.strip()
    except Exception as e:
        return f"[Error reading scanned PDF] {e}"
def process(pdf_file) -> str:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    ``pdf_file`` is the value delivered by the upload component. A falsy
    value (nothing uploaded) produces a prompt to upload a file. Otherwise
    the embedded text layer is tried first; if that yields only whitespace,
    the file is run through OCR. If the final result is still empty, a
    fixed "could not extract" message is returned.

    Error strings produced by the extraction helpers are non-empty, so they
    are returned to the caller as-is.
    """
    if not pdf_file:
        return "Please upload a PDF."

    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        # No usable text layer: treat the document as a scan and OCR it.
        text = extract_text_from_scanned(pdf_file)

    # The original message began with a stray "β", the residue of a
    # mis-decoded multi-byte symbol; the glyph is unrecoverable, so the
    # message is kept as plain text.
    return text or "Could not extract text."
# UI: an upload widget and a button side by side, with an editable text box
# that receives the extracted text when the button is clicked.
# NOTE(review): the source's indentation was lost; placing the text box below
# the row (rather than inside it) is an assumption. Confirm the intended layout.
with gr.Blocks() as demo:
    # Heading had a stray "π" (mis-decoded emoji residue); removed.
    gr.Markdown("## Law PDF Formatter")
    with gr.Row():
        # type="filepath" hands `process` a path on disk, not file contents.
        file = gr.File(file_types=[".pdf"], label="Upload PDF", type="filepath")
        btn = gr.Button("Extract Text")
    out = gr.Textbox(label="Extracted Text", lines=20, interactive=True)
    # Event listeners must be registered inside the Blocks context.
    btn.click(fn=process, inputs=file, outputs=out)

# `flagging_callback` is a gr.Interface constructor option, not a launch()
# keyword; passing it here raised TypeError, so it is dropped.
demo.launch(share=True)