Spaces:

crowles
/

PDFToTxT

Sleeping

crowles committed on Feb 26, 2025

Commit

9e29afb

verified ·

1 Parent(s): 5d9aa39

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+# Install dependencies
+!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-eng
+import os
+import subprocess
+import tempfile
+
+import gradio as gr  # Assuming this is the input type (for example)
+def process_pdf(file):
+  # Get the uploaded PDF filename (Gradio File object)
+  input_pdf = file.name
+  os.system(f'pdftoppm -png "{input_pdf}" img')
+  # Perform OCR using Tesseract on each PNG image (only English)
+  for image in os.listdir():
+      if image.startswith('img') and image.endswith('.png'):
+          output_txt = f"ocr_{image}.txt"
+          os.system(f'tesseract "{image}" "{output_txt[:-4]}"')
+  # Combine all OCR text files into one
+  output_txt_file = f"{input_pdf[:-4]}.txt"
+  with open(output_txt_file, 'w') as output_file:
+      for text_file in os.listdir():
+          if text_file.startswith('ocr_img') and text_file.endswith('.txt'):
+              with open(text_file, 'r') as f:
+                  output_file.write(f.read())
+                  output_file.write("\n")  # Optional: add newline between text files
+  # Optional: Clean up intermediate PNG and text files
+  for file in os.listdir():
+      if file.startswith('img') or file.startswith('ocr_img'):
+          os.remove(file)
+  return output_txt_file
+# Example Gradio Interface
+interface = gr.Interface(
+    fn=process_pdf,
+    inputs=gr.File(),
+    outputs=gr.File(),
+    title="PDF to Text with OCR",
+    description="Upload a PDF, perform OCR on it."
+)
+# Launch the interface
+interface.launch(debug=True)