Spaces:

lolhaha002
/

Pdf-Extractor

Sleeping

lolhaha002 committed on Jul 28, 2025

Commit

5c23bea

verified ·

1 Parent(s): 83cd0d2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,30 +2,25 @@ import gradio as gr
 from pdf2image import convert_from_path
 from PIL import Image
 import pytesseract
-import os
-# Set Gujarati as OCR language
 OCR_LANG = "guj"
 def extract_gujarati_text(pdf_file, page_number):
-    # Convert selected page to image
     images = convert_from_path(pdf_file.name, first_page=page_number, last_page=page_number)
     image = images[0]
-    image_path = f"/tmp/page_{page_number}.png"
-    image.save(image_path)
-    # Run OCR with Gujarati
-    text = pytesseract.image_to_string(Image.open(image_path), lang=OCR_LANG)
-    return text
 with gr.Blocks() as demo:
-    gr.Markdown("## 📚 Gujarati OCR from PDF (Tesseract-powered)")
     pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
     page = gr.Number(label="📄 Page Number", minimum=1, value=1, step=1)
     button = gr.Button("🔍 Extract Text")
-    output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)
-    button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=output)
 demo.launch()

 from pdf2image import convert_from_path
 from PIL import Image
 import pytesseract
 OCR_LANG = "guj"
 def extract_gujarati_text(pdf_file, page_number):
     """Run Gujarati OCR on a single page of an uploaded PDF.

     Args:
         pdf_file: The uploaded file. Either an object whose ``name``
             attribute is the path of the PDF on disk, or the path itself.
         page_number: 1-based page to render. Whole-number floats such as
             ``1.0`` are accepted and converted to ``int``.

     Returns:
         A ``(text, image)`` tuple: the string produced by
         ``pytesseract.image_to_string`` and the rendered page image that
         was fed to it.

     Raises:
         ValueError: If no file was supplied, if ``page_number`` is not a
             whole number >= 1, or if rendering produced no image for the
             requested page.
     """
     # Validate everything up front so the user gets a readable message
     # instead of an AttributeError / IndexError from deeper in the stack.
     if pdf_file is None:
         raise ValueError("No PDF uploaded: please choose a file first.")

     try:
         page_index = int(page_number)
     except (TypeError, ValueError, OverflowError):
         raise ValueError(f"Invalid page number: {page_number!r}") from None
     if page_index != page_number or page_index < 1:
         raise ValueError(
             f"Page number must be a whole number >= 1, got {page_number!r}"
         )

     # Accept both file-like upload objects (path in ``.name``) and plain paths.
     pdf_path = getattr(pdf_file, "name", pdf_file)

     # Render only the requested page; an int is passed so the renderer
     # never sees a float-formatted page number.
     images = convert_from_path(
         pdf_path, first_page=page_index, last_page=page_index
     )
     if not images:
         raise ValueError(f"The PDF has no page {page_index}.")

     image = images[0]
     text = pytesseract.image_to_string(image, lang=OCR_LANG)
     return text, image  # Returning both OCR text and snapshot
 # Gradio UI: upload a PDF, pick a page, and show the rendered page next to
 # the text that OCR extracted from it.
 with gr.Blocks() as demo:
     gr.Markdown("## 📚 Gujarati OCR from PDF (with Page Snapshot)")
     pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
     # precision=0 makes Gradio round the value and hand it to the callback
     # as an int, so the page number is never delivered as a float.
     page = gr.Number(
         label="📄 Page Number", minimum=1, value=1, step=1, precision=0
     )
     button = gr.Button("🔍 Extract Text")
     with gr.Row():
         image_output = gr.Image(label="🖼️ PDF Page Snapshot")
         text_output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)
     # The outputs list follows the (text, image) order returned by
     # extract_gujarati_text, not the left-to-right layout of the row.
     button.click(
         fn=extract_gujarati_text,
         inputs=[pdf, page],
         outputs=[text_output, image_output],
     )
 demo.launch()