SathvikGanta committed on
Commit
36fa47a
·
verified ·
1 Parent(s): 1b4714b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -21
app.py CHANGED
@@ -3,18 +3,16 @@ import subprocess
3
  from pdf2image import convert_from_path
4
  from PIL import Image
5
  import pytesseract
6
- from PyPDF2 import PdfWriter
7
  from docx import Document
8
  import gradio as gr
 
9
  import shutil
10
 
11
  # Define paths for dependencies
12
  POPPLER_PATH = "/usr/bin"
13
  TESSERACT_PATH = "/usr/bin/tesseract"
14
 
15
- # Define a directory to store output files
16
- OUTPUT_DIR = "./output_files"
17
- os.makedirs(OUTPUT_DIR, exist_ok=True)
18
 
19
  def install_dependencies():
20
  """Install Poppler and Tesseract if not already installed."""
@@ -66,30 +64,36 @@ def convert_pdf_to_text(input_pdf):
66
  # Combine text
67
  full_text = "\n".join(text_data)
68
 
69
- # Generate text-based PDF
70
- output_pdf_path = os.path.join(OUTPUT_DIR, "output.pdf")
71
- with open(output_pdf_path, "wb") as f:
72
- pdf_writer = PdfWriter()
73
- pdf_writer.add_metadata({
74
- "/Title": "OCR Converted PDF",
75
- "/Author": "OCR Application"
76
- })
77
- pdf_writer.write(f)
78
-
79
- # Generate Word document
80
- output_docx_path = os.path.join(OUTPUT_DIR, "output.docx")
 
 
 
81
  doc = Document()
82
  doc.add_heading("OCR Converted Text", level=1)
83
  doc.add_paragraph(full_text)
84
- doc.save(output_docx_path)
85
 
86
- # Return file paths
87
- return output_pdf_path, output_docx_path
 
 
 
88
 
89
 
90
  def gradio_interface(file):
91
- pdf_output_path, docx_output_path = convert_pdf_to_text(file)
92
- return pdf_output_path, docx_output_path
93
 
94
 
95
  iface = gr.Interface(
 
3
  from pdf2image import convert_from_path
4
  from PIL import Image
5
  import pytesseract
6
+ from PyPDF2 import PdfWriter, PdfReader
7
  from docx import Document
8
  import gradio as gr
9
+ import io
10
  import shutil
11
 
12
  # Define paths for dependencies
13
  POPPLER_PATH = "/usr/bin"
14
  TESSERACT_PATH = "/usr/bin/tesseract"
15
 
 
 
 
16
 
17
  def install_dependencies():
18
  """Install Poppler and Tesseract if not already installed."""
 
64
  # Combine text
65
  full_text = "\n".join(text_data)
66
 
67
+ # Generate text-based PDF in memory
68
+ pdf_buffer = io.BytesIO()
69
+ pdf_writer = PdfWriter()
70
+ pdf_writer.add_metadata({
71
+ "/Title": "OCR Converted PDF",
72
+ "/Author": "OCR Application"
73
+ })
74
+ with open(input_pdf_path, "rb") as reader_file:
75
+ reader = PdfReader(reader_file)
76
+ for page in reader.pages:
77
+ pdf_writer.add_page(page)
78
+ pdf_writer.write(pdf_buffer)
79
+
80
+ # Generate Word document in memory
81
+ docx_buffer = io.BytesIO()
82
  doc = Document()
83
  doc.add_heading("OCR Converted Text", level=1)
84
  doc.add_paragraph(full_text)
85
+ doc.save(docx_buffer)
86
 
87
+ # Rewind buffers
88
+ pdf_buffer.seek(0)
89
+ docx_buffer.seek(0)
90
+
91
+ return pdf_buffer, docx_buffer
92
 
93
 
94
  def gradio_interface(file):
95
+ pdf_output, docx_output = convert_pdf_to_text(file)
96
+ return pdf_output, docx_output
97
 
98
 
99
  iface = gr.Interface(