SathvikGanta committed on
Commit
d6d8645
·
verified ·
1 Parent(s): 0dc2181

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -15
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
- from PyPDF2 import PdfWriter, PdfReader
5
  from docx import Document
6
  import tempfile
7
  import os
@@ -12,16 +12,14 @@ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # Adjust path for
12
  def convert_pdf_to_text(input_pdf):
13
  """Convert scanned PDF to text-based PDF and Word document using OCR."""
14
  with tempfile.TemporaryDirectory() as temp_dir:
15
- # Save the uploaded file to a temporary location
16
- input_pdf_path = os.path.join(temp_dir, "input.pdf")
17
- with open(input_pdf_path, "wb") as f:
18
- f.write(input_pdf) # `input_pdf` is a byte stream from Gradio
19
 
20
  # Convert PDF to images
21
  print("Converting PDF to images...")
22
  images = convert_from_path(input_pdf_path)
23
 
24
- # Extract text from images using OCR
25
  print("Extracting text from images...")
26
  text_data = []
27
  for i, image in enumerate(images):
@@ -35,13 +33,13 @@ def convert_pdf_to_text(input_pdf):
35
  # Create a text-based PDF
36
  print("Creating text-based PDF...")
37
  output_pdf_path = os.path.join(temp_dir, "output.pdf")
38
- pdf_writer = PdfWriter()
39
- pdf_writer.add_metadata({
40
- "/Title": "OCR Converted PDF",
41
- "/Author": "OCR Application"
42
- })
43
- with open(output_pdf_path, "wb") as f:
44
- f.write(full_text.encode("utf-8"))
45
 
46
  # Create a Word document
47
  print("Creating Word document...")
@@ -75,8 +73,8 @@ iface = gr.Interface(
75
  ],
76
  title="OCR PDF Converter",
77
  description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
78
- theme="compact"
79
  )
80
 
81
  if __name__ == "__main__":
82
- iface.launch(share=True)
 
1
  import gradio as gr
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
+ from PyPDF2 import PdfWriter
5
  from docx import Document
6
  import tempfile
7
  import os
 
12
  def convert_pdf_to_text(input_pdf):
13
  """Convert scanned PDF to text-based PDF and Word document using OCR."""
14
  with tempfile.TemporaryDirectory() as temp_dir:
15
+ # Use the uploaded file's existing path (nothing is copied into temp_dir)
16
+ input_pdf_path = input_pdf.name # Get the file path directly from the Gradio object
 
 
17
 
18
  # Convert PDF to images
19
  print("Converting PDF to images...")
20
  images = convert_from_path(input_pdf_path)
21
 
22
+ # Extract text from each image using OCR
23
  print("Extracting text from images...")
24
  text_data = []
25
  for i, image in enumerate(images):
 
33
  # Create a text-based PDF
34
  print("Creating text-based PDF...")
35
  output_pdf_path = os.path.join(temp_dir, "output.pdf")
36
+ with open(output_pdf_path, "wb") as pdf_file:
37
+ pdf_writer = PdfWriter()
38
+ pdf_writer.add_metadata({
39
+ "/Title": "OCR Converted PDF",
40
+ "/Author": "OCR Application"
41
+ })
42
+ pdf_writer.write(pdf_file)
43
 
44
  # Create a Word document
45
  print("Creating Word document...")
 
73
  ],
74
  title="OCR PDF Converter",
75
  description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
76
+ theme="default"
77
  )
78
 
79
  if __name__ == "__main__":
80
+ iface.launch()