Spaces:

SathvikGanta
/

Scaned_doc_typed

Sleeping

App Files Community

SathvikGanta committed on Dec 1, 2024

Commit

d6d8645

verified ·

1 Parent(s): 0dc2181

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -15

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import pytesseract
 from pdf2image import convert_from_path
-from PyPDF2 import PdfWriter, PdfReader
 from docx import Document
 import tempfile
 import os
@@ -12,16 +12,14 @@ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # Adjust path for
 def convert_pdf_to_text(input_pdf):
     """Convert scanned PDF to text-based PDF and Word document using OCR."""
     with tempfile.TemporaryDirectory() as temp_dir:
-        # Save the uploaded file to a temporary location
-        input_pdf_path = os.path.join(temp_dir, "input.pdf")
-        with open(input_pdf_path, "wb") as f:
-            f.write(input_pdf)  # `input_pdf` is a byte stream from Gradio
         # Convert PDF to images
         print("Converting PDF to images...")
         images = convert_from_path(input_pdf_path)
-        # Extract text from images using OCR
         print("Extracting text from images...")
         text_data = []
         for i, image in enumerate(images):
@@ -35,13 +33,13 @@ def convert_pdf_to_text(input_pdf):
         # Create a text-based PDF
         print("Creating text-based PDF...")
         output_pdf_path = os.path.join(temp_dir, "output.pdf")
-        pdf_writer = PdfWriter()
-        pdf_writer.add_metadata({
-            "/Title": "OCR Converted PDF",
-            "/Author": "OCR Application"
-        })
-        with open(output_pdf_path, "wb") as f:
-            f.write(full_text.encode("utf-8"))
         # Create a Word document
         print("Creating Word document...")
@@ -75,8 +73,8 @@ iface = gr.Interface(
     ],
     title="OCR PDF Converter",
     description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
-    theme="compact"
 )
 if __name__ == "__main__":
-    iface.launch(share=True)

 import gradio as gr
 import pytesseract
 from pdf2image import convert_from_path
+from PyPDF2 import PdfWriter
 from docx import Document
 import tempfile
 import os
 def convert_pdf_to_text(input_pdf):
     """Convert scanned PDF to text-based PDF and Word document using OCR."""
     with tempfile.TemporaryDirectory() as temp_dir:
+        # Save the uploaded file to a temporary directory
+        input_pdf_path = input_pdf.name  # Get the file path directly from the Gradio object
         # Convert PDF to images
         print("Converting PDF to images...")
         images = convert_from_path(input_pdf_path)
+        # Extract text from each image using OCR
         print("Extracting text from images...")
         text_data = []
         for i, image in enumerate(images):
         # Create a text-based PDF
         print("Creating text-based PDF...")
         output_pdf_path = os.path.join(temp_dir, "output.pdf")
+        with open(output_pdf_path, "wb") as pdf_file:
+            pdf_writer = PdfWriter()
+            pdf_writer.add_metadata({
+                "/Title": "OCR Converted PDF",
+                "/Author": "OCR Application"
+            })
+            pdf_writer.write(pdf_file)
         # Create a Word document
         print("Creating Word document...")
     ],
     title="OCR PDF Converter",
     description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
+    theme="default"
 )
 if __name__ == "__main__":
+    iface.launch()