Spaces:

SathvikGanta
/

Scaned_doc_typed

Sleeping

App Files Files Community

SathvikGanta committed on Dec 1, 2024

Commit

c6c2ea3

verified ·

1 Parent(s): 47b34cb

Create app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import gradio as gr
+import pytesseract
+from pdf2image import convert_from_path
+from PyPDF2 import PdfWriter
+from PIL import Image
+import os
+import tempfile
+# Define Tesseract path (ensure Tesseract is installed on the environment)
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # Adjust path for Hugging Face Spaces
+def convert_pdf_to_text(input_pdf):
+    """Convert scanned PDF to text-based PDF using OCR."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Save the uploaded file
+        input_pdf_path = os.path.join(temp_dir, "input.pdf")
+        with open(input_pdf_path, "wb") as f:
+            f.write(input_pdf.read())
+        # Convert PDF to images
+        print("Converting PDF to images...")
+        images = convert_from_path(input_pdf_path)
+        # Extract text from each image using OCR
+        print("Extracting text from images...")
+        text_data = []
+        for i, image in enumerate(images):
+            print(f"Processing page {i + 1}...")
+            text = pytesseract.image_to_string(image)
+            text_data.append(text)
+        # Create a text-based PDF
+        output_pdf_path = os.path.join(temp_dir, "output.pdf")
+        pdf_writer = PdfWriter()
+        for text in text_data:
+            pdf_writer.add_blank_page()  # Add pages (text storage is skipped here)
+        with open(output_pdf_path, "wb") as output_file:
+            pdf_writer.write(output_file)
+        # Read the generated file for download
+        with open(output_pdf_path, "rb") as f:
+            output_pdf = f.read()
+    return output_pdf
+# Gradio Interface
+def gradio_interface(file):
+    """Wrapper for OCR conversion with Gradio input and output."""
+    output_pdf = convert_pdf_to_text(file)
+    return output_pdf
+# Gradio UI
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.File(label="Upload Scanned PDF"),  # File input for the user
+    outputs=gr.File(label="Download Text-Based PDF"),  # File output
+    title="OCR PDF Converter",
+    description="Upload a scanned PDF and convert it into a text-based PDF using OCR.",
+    theme="compact"  # Optional: Compact theme for UI
+)
+if __name__ == "__main__":
+    iface.launch(share=True)