Spaces:

lolhaha002
/

Pdf-Extractor

Sleeping

App Files Community

lolhaha002 committed on Feb 26

Commit

6752cd0

verified ·

1 Parent(s): 5c23bea

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -12

app.py CHANGED Viewed

@@ -3,23 +3,44 @@ from pdf2image import convert_from_path
 from PIL import Image
 import pytesseract
-OCR_LANG = "guj"
 def extract_gujarati_text(pdf_file, page_number):
-    images = convert_from_path(pdf_file.name, first_page=page_number, last_page=page_number)
-    image = images[0]
-    text = pytesseract.image_to_string(image, lang=OCR_LANG)
-    return text, image  # Returning both OCR text and snapshot
 with gr.Blocks() as demo:
-    gr.Markdown("## 📚 Gujarati OCR from PDF (with Page Snapshot)")
-    pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
-    page = gr.Number(label="📄 Page Number", minimum=1, value=1, step=1)
-    button = gr.Button("🔍 Extract Text")
     with gr.Row():
-        image_output = gr.Image(label="🖼️ PDF Page Snapshot")
-        text_output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)
     button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])

 from PIL import Image
 import pytesseract
+# UPDATED: Added 'eng' to support mixed words (e.g., "Statistics", "Mean")
+# The '+' operator tells Tesseract to look for both languages.
+OCR_LANG = "guj+eng"
 def extract_gujarati_text(pdf_file, page_number):
+    try:
+        # Convert PDF page to image
+        images = convert_from_path(pdf_file.name, first_page=page_number, last_page=page_number)
+        image = images[0]
+        # UPDATED: added configuration
+        # --psm 3: Fully automatic page segmentation, but no OSD. (Good for mixed layouts)
+        # --oem 3: Default OCR Engine Mode.
+        custom_config = r'--oem 3 --psm 3'
+        text = pytesseract.image_to_string(image, lang=OCR_LANG, config=custom_config)
+        return text, image
+    except Exception as e:
+        return f"Error: {str(e)}", None
 with gr.Blocks() as demo:
+    gr.Markdown("## 📚 Gujarati + English OCR (Textbook Friendly)")
+    gr.Markdown("Upload your Maths/Stats textbook page. This tool now supports English words mixed with Gujarati.")
     with gr.Row():
+        with gr.Column():
+            pdf = gr.File(label="📤 Upload PDF", file_types=[".pdf"])
+            page = gr.Number(label="📄 Page Number", minimum=1, value=1, step=1)
+            button = gr.Button("🔍 Extract Text", variant="primary")
+        with gr.Column():
+            image_output = gr.Image(label="🖼️ Page Preview", type="pil")
+            text_output = gr.Textbox(
+                label="📝 Extracted Text (Copyable)",
+                lines=20,
+                show_copy_button=True # Added a copy button for ease
+            )
     button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])