crowles committed on
Commit
9e29afb
·
verified ·
1 Parent(s): 5d9aa39

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# System dependencies (these are OS packages, not Python packages):
#   poppler-utils                      -> provides the `pdftoppm` command
#   tesseract-ocr, tesseract-ocr-eng   -> provide `tesseract` and its English model
#
# A leading "!" is IPython/notebook shell-escape syntax and is a SyntaxError in
# a plain .py module, so the install command is kept here as documentation
# only. On Hugging Face Spaces, list the three packages in `packages.txt`;
# on other Debian/Ubuntu hosts run:
#   apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-eng
3
+
4
+ import os
5
+ import gradio as gr # Assuming this is the input type (for example)
6
+
7
def process_pdf(file):
    """Run OCR over every page of an uploaded PDF and return a text file path.

    The PDF is rasterised to one PNG per page with ``pdftoppm``, each page is
    recognised with ``tesseract`` (English), and the per-page text is
    concatenated in page order into ``<pdf path without extension>.txt``.

    Args:
        file: The upload handed over by ``gr.File``: either an object whose
            ``name`` attribute is the path of the PDF on disk, or the path
            itself (``str`` / ``os.PathLike``).

    Returns:
        Path of the combined text file.

    Raises:
        ValueError: If no file was provided.
        FileNotFoundError: If ``pdftoppm`` or ``tesseract`` is not installed.
        subprocess.CalledProcessError: If either tool exits with an error.
    """
    # Imported locally so this function only relies on the module-level
    # ``import os`` that the file already has.
    import subprocess
    import tempfile

    if file is None:
        raise ValueError("No PDF file was provided.")

    # Path-like inputs are used directly; anything else is treated as an
    # upload object exposing the on-disk path as ``.name``.
    if isinstance(file, (str, os.PathLike)):
        input_pdf = os.fspath(file)
    else:
        input_pdf = file.name

    # splitext copes with any extension length (or none), unlike slicing
    # off a fixed four characters.
    output_txt_file = os.path.splitext(input_pdf)[0] + ".txt"

    # A private scratch directory keeps concurrent requests from seeing each
    # other's pages and guarantees cleanup even when a tool fails; nothing
    # outside this directory is ever deleted.
    with tempfile.TemporaryDirectory(prefix="ocr_") as work_dir:
        # Argument lists (no shell) so characters in the uploaded file name
        # can never be interpreted as shell syntax. The absolute path also
        # prevents a name starting with "-" from being parsed as an option.
        subprocess.run(
            [
                "pdftoppm",
                "-png",
                os.path.abspath(input_pdf),
                os.path.join(work_dir, "img"),
            ],
            check=True,
        )

        page_images = [
            entry
            for entry in os.listdir(work_dir)
            if entry.startswith("img") and entry.endswith(".png")
        ]
        # os.listdir order is arbitrary. Sorting by (length, name) yields
        # page order whether or not the page numbers are zero-padded.
        page_images.sort(key=lambda entry: (len(entry), entry))

        page_texts = []
        for image_name in page_images:
            # tesseract appends ".txt" to the output base itself.
            output_base = os.path.join(work_dir, f"ocr_{image_name}")
            subprocess.run(
                [
                    "tesseract",
                    os.path.join(work_dir, image_name),
                    output_base,
                    "-l",
                    "eng",
                ],
                check=True,
            )
            page_texts.append(output_base + ".txt")

        # Combine all per-page OCR text files into one, a newline after each.
        with open(output_txt_file, "w", encoding="utf-8") as combined:
            for text_path in page_texts:
                with open(text_path, "r", encoding="utf-8") as page_text:
                    combined.write(page_text.read())
                combined.write("\n")

    return output_txt_file
34
+
35
+
36
+
37
# Gradio front end: one file upload in, one downloadable file out, with
# process_pdf doing the conversion in between.
interface = gr.Interface(
    process_pdf,
    gr.File(),
    gr.File(),
    title="PDF to Text with OCR",
    description="Upload a PDF, perform OCR on it.",
)

# Start serving the interface (debug mode enabled).
interface.launch(debug=True)