SathvikGanta committed on
Commit
c6c2ea3
·
verified ·
1 Parent(s): 47b34cb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ from PyPDF2 import PdfWriter
5
+ from PIL import Image
6
+ import os
7
+ import tempfile
8
+
9
# Location of the Tesseract binary used by pytesseract. Tesseract itself must
# be installed in the environment. The default matches the usual Debian/Ubuntu
# install location (e.g. on Hugging Face Spaces); set the TESSERACT_CMD
# environment variable to point at a different binary without editing code.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", "/usr/bin/tesseract"
)
11
+
12
def convert_pdf_to_text(input_pdf):
    """Convert a scanned PDF into a searchable, text-based PDF using OCR.

    Every page of the input is rendered to an image, passed through
    Tesseract, and the resulting single-page searchable PDFs (page image plus
    recognised text layer) are merged into one document.

    Args:
        input_pdf: The PDF to convert. Accepted forms:
            * a filesystem path (``str`` or ``os.PathLike``),
            * raw PDF content as ``bytes`` / ``bytearray``,
            * an upload wrapper whose ``name`` attribute is a path on disk,
            * any binary file-like object exposing ``read()``.

    Returns:
        bytes: The content of the generated PDF.

    Raises:
        ValueError: If ``input_pdf`` is ``None``.
        FileNotFoundError: If a path is given that does not name a file.
        TypeError: If ``input_pdf`` is of an unsupported type.
    """
    if input_pdf is None:
        raise ValueError("No PDF file was provided.")

    with tempfile.TemporaryDirectory() as temp_dir:
        # Resolve the input to a path on disk that pdf2image can open.
        input_pdf_path = os.path.join(temp_dir, "input.pdf")
        upload_name = getattr(input_pdf, "name", None)
        if isinstance(input_pdf, (bytes, bytearray)):
            with open(input_pdf_path, "wb") as f:
                f.write(input_pdf)
        elif isinstance(input_pdf, (str, os.PathLike)):
            source_path = os.fspath(input_pdf)
            if not os.path.isfile(source_path):
                raise FileNotFoundError(f"PDF file not found: {source_path}")
            input_pdf_path = source_path
        elif isinstance(upload_name, str) and os.path.isfile(upload_name):
            # Upload wrappers already live on disk; use that file directly
            # instead of relying on the handle being open for reading.
            input_pdf_path = upload_name
        elif callable(getattr(input_pdf, "read", None)):
            with open(input_pdf_path, "wb") as f:
                f.write(input_pdf.read())
        else:
            raise TypeError(
                "input_pdf must be a path, bytes, or a file-like object, "
                f"not {type(input_pdf).__name__}"
            )

        # Imported here so the block is self-contained; PdfReader ships in
        # the same PyPDF2 releases as the PdfWriter the file already uses.
        import io

        from PyPDF2 import PdfReader

        # Convert PDF to images
        print("Converting PDF to images...")
        images = convert_from_path(input_pdf_path)

        # OCR each page straight into a one-page searchable PDF and merge it.
        # Blank pages cannot be used here: they carry no text, and
        # add_blank_page() without dimensions fails on an empty writer.
        print("Extracting text from images...")
        pdf_writer = PdfWriter()
        for i, image in enumerate(images):
            print(f"Processing page {i + 1}...")
            page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
            for page in PdfReader(io.BytesIO(page_pdf)).pages:
                pdf_writer.add_page(page)

        # Serialise in memory; the temporary directory is removed on exit.
        buffer = io.BytesIO()
        pdf_writer.write(buffer)
        return buffer.getvalue()
45
+
46
+ # Gradio Interface
47
def gradio_interface(file):
    """Run the OCR conversion for the Gradio UI and return a downloadable file.

    Args:
        file: The value Gradio passes from the upload component, or ``None``
            when the user submitted without uploading anything.

    Returns:
        The path of the generated PDF for the ``gr.File`` output, or ``None``
        when no file was uploaded.
    """
    if file is None:
        # Nothing uploaded: leave the output component empty instead of crashing.
        return None

    output_pdf = convert_pdf_to_text(file)
    if not isinstance(output_pdf, (bytes, bytearray)):
        # Not raw content (e.g. already a path); hand it to Gradio unchanged.
        return output_pdf

    # gr.File outputs are served from a path on disk, not from raw bytes, so
    # persist the PDF in a temp file that outlives this call (delete=False).
    with tempfile.NamedTemporaryFile(
        prefix="ocr_output_", suffix=".pdf", delete=False
    ) as out_file:
        out_file.write(output_pdf)
    return out_file.name
51
+
52
+ # Gradio UI
53
# Assemble the web UI: a single PDF upload in, a single PDF download out,
# with gradio_interface doing the work in between.
iface = gr.Interface(
    title="OCR PDF Converter",
    description="Upload a scanned PDF and convert it into a text-based PDF using OCR.",
    theme="compact",  # optional: compact theme for the UI
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),  # file the user uploads
    outputs=gr.File(label="Download Text-Based PDF"),  # file offered for download
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch(share=True)