Spaces:

SathvikGanta
/

Scaned_doc_typed

Sleeping

App Files Community

SathvikGanta committed on Dec 1, 2024

Commit

1b4714b

verified ·

1 Parent(s): 3876542

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -38

app.py CHANGED Viewed

@@ -6,13 +6,16 @@ import pytesseract
 from PyPDF2 import PdfWriter
 from docx import Document
 import gradio as gr
-import tempfile
 import shutil
 # Define paths for dependencies
 POPPLER_PATH = "/usr/bin"
 TESSERACT_PATH = "/usr/bin/tesseract"
 def install_dependencies():
     """Install Poppler and Tesseract if not already installed."""
     # Install Poppler if missing
@@ -46,43 +49,42 @@ def convert_pdf_to_text(input_pdf):
     """Convert scanned PDF to text-based PDF and Word document using OCR."""
     install_dependencies()  # Ensure dependencies are installed
-    with tempfile.TemporaryDirectory() as temp_dir:
-        input_pdf_path = input_pdf.name  # Get file path
-        # Convert PDF to images
-        try:
-            images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
-        except Exception as e:
-            raise RuntimeError(f"Error during PDF to image conversion: {e}")
-        # Extract text from images
-        text_data = []
-        for image in images:
-            text = pytesseract.image_to_string(image)
-            text_data.append(text)
-        # Combine text
-        full_text = "\n".join(text_data)
-        # Generate text-based PDF
-        output_pdf_path = os.path.join(temp_dir, "output.pdf")
-        with open(output_pdf_path, "wb") as f:
-            pdf_writer = PdfWriter()
-            pdf_writer.add_metadata({
-                "/Title": "OCR Converted PDF",
-                "/Author": "OCR Application"
-            })
-            pdf_writer.write(f)
-        # Generate Word document
-        output_docx_path = os.path.join(temp_dir, "output.docx")
-        doc = Document()
-        doc.add_heading("OCR Converted Text", level=1)
-        doc.add_paragraph(full_text)
-        doc.save(output_docx_path)
-        # Return file paths
-        return output_pdf_path, output_docx_path
 def gradio_interface(file):

 from PyPDF2 import PdfWriter
 from docx import Document
 import gradio as gr
 import shutil
 # Define paths for dependencies
 POPPLER_PATH = "/usr/bin"
 TESSERACT_PATH = "/usr/bin/tesseract"
+# Define a directory to store output files
+OUTPUT_DIR = "./output_files"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
 def install_dependencies():
     """Install Poppler and Tesseract if not already installed."""
     # Install Poppler if missing
     """Convert scanned PDF to text-based PDF and Word document using OCR."""
     install_dependencies()  # Ensure dependencies are installed
+    input_pdf_path = input_pdf.name  # Get file path
+    # Convert PDF to images
+    try:
+        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
+    except Exception as e:
+        raise RuntimeError(f"Error during PDF to image conversion: {e}")
+    # Extract text from images
+    text_data = []
+    for image in images:
+        text = pytesseract.image_to_string(image)
+        text_data.append(text)
+    # Combine text
+    full_text = "\n".join(text_data)
+    # Generate text-based PDF
+    output_pdf_path = os.path.join(OUTPUT_DIR, "output.pdf")
+    with open(output_pdf_path, "wb") as f:
+        pdf_writer = PdfWriter()
+        pdf_writer.add_metadata({
+            "/Title": "OCR Converted PDF",
+            "/Author": "OCR Application"
+        })
+        pdf_writer.write(f)
+    # Generate Word document
+    output_docx_path = os.path.join(OUTPUT_DIR, "output.docx")
+    doc = Document()
+    doc.add_heading("OCR Converted Text", level=1)
+    doc.add_paragraph(full_text)
+    doc.save(output_docx_path)
+    # Return file paths
+    return output_pdf_path, output_docx_path
 def gradio_interface(file):