# lawdocker / app.py
# iamnew123
# Update app.py
# 26f3eec (verified)
import gradio as gr
from PyPDF2 import PdfReader
from pdf2image import convert_from_bytes
import pytesseract
from PIL import Image
def extract_text_from_pdf(pdf_path):
    """Pull the embedded text layer out of a PDF.

    Args:
        pdf_path: The PDF to open, passed straight to ``PdfReader``.

    Returns:
        The text of every page joined by newlines, with surrounding
        whitespace stripped. Pages for which ``extract_text()`` returns a
        falsy value contribute an empty string. On any failure the function
        does not raise; it returns a string beginning with
        ``"[Error reading normal PDF]"`` followed by the exception message.
    """
    try:
        document = PdfReader(pdf_path)
        page_texts = []
        for page in document.pages:
            content = page.extract_text()
            page_texts.append(content if content else "")
        combined = "\n".join(page_texts)
    except Exception as e:
        return f"[Error reading normal PDF] {e}"
    else:
        return combined.strip()
def extract_text_from_scanned(pdf_path):
    """Extract text from a scanned (image-only) PDF using OCR.

    The file is read into memory, converted to images with
    ``convert_from_bytes``, and each image is passed through
    ``pytesseract.image_to_string``.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The OCR output of all images concatenated in order, with surrounding
        whitespace stripped. On any failure the function does not raise; it
        returns a string beginning with ``"[Error reading scanned PDF]"``
        followed by the exception message.
    """
    try:
        # Read inside a context manager so the file handle is closed
        # deterministically, even if the read or the conversion fails.
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
        images = convert_from_bytes(pdf_bytes)
        # join() avoids repeated string reallocation on many-page documents.
        text = "".join(pytesseract.image_to_string(image) for image in images)
        return text.strip()
    except Exception as e:
        return f"[Error reading scanned PDF] {e}"
def process(pdf_file):
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Direct text extraction is tried first. OCR is attempted when that yields
    no text or reports a failure.

    Args:
        pdf_file: Path of the uploaded PDF. Any falsy value (``None``, ``""``)
            is treated as "nothing uploaded".

    Returns:
        The extracted text; or an ``"[Error ...]"`` string produced by one of
        the extractors; or a user-facing notice when no file was given or no
        text could be found.
    """
    if not pdf_file:
        return "Please upload a PDF."

    text = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf reports failures as a truthy "[Error ...]" string,
    # so a failed read has to be detected explicitly; otherwise the emptiness
    # check below never triggers the OCR fallback for unreadable PDFs.
    direct_failed = text.startswith("[Error reading normal PDF]")
    if direct_failed or not text.strip():
        ocr_text = extract_text_from_scanned(pdf_file)
        ocr_failed = ocr_text.startswith("[Error reading scanned PDF]")
        if direct_failed and (ocr_failed or not ocr_text):
            # OCR did not help: keep reporting the original parser error.
            return text
        text = ocr_text
    return text or "❌ Could not extract text."
# Build the UI: a PDF upload and a button side by side, with the extracted
# text shown in an editable box underneath.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Law PDF Formatter")
    with gr.Row():
        # type="filepath" is what lets process() open the upload by path.
        file = gr.File(file_types=[".pdf"], label="Upload PDF", type="filepath")
        btn = gr.Button("Extract Text")
    out = gr.Textbox(label="Extracted Text", lines=20, interactive=True)
    btn.click(fn=process, inputs=file, outputs=out)

# flagging_callback is a gr.Interface constructor option, not a parameter of
# Blocks.launch(); passing it here raises TypeError before the app starts.
demo.launch(share=True)