SathvikGanta committed on
Commit
1b4714b
·
verified ·
1 Parent(s): 3876542

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -38
app.py CHANGED
@@ -6,13 +6,16 @@ import pytesseract
6
  from PyPDF2 import PdfWriter
7
  from docx import Document
8
  import gradio as gr
9
- import tempfile
10
  import shutil
11
 
12
  # Define paths for dependencies
13
  POPPLER_PATH = "/usr/bin"
14
  TESSERACT_PATH = "/usr/bin/tesseract"
15
 
 
 
 
 
16
  def install_dependencies():
17
  """Install Poppler and Tesseract if not already installed."""
18
  # Install Poppler if missing
@@ -46,43 +49,42 @@ def convert_pdf_to_text(input_pdf):
46
  """Convert scanned PDF to text-based PDF and Word document using OCR."""
47
  install_dependencies() # Ensure dependencies are installed
48
 
49
- with tempfile.TemporaryDirectory() as temp_dir:
50
- input_pdf_path = input_pdf.name # Get file path
51
-
52
- # Convert PDF to images
53
- try:
54
- images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
55
- except Exception as e:
56
- raise RuntimeError(f"Error during PDF to image conversion: {e}")
57
-
58
- # Extract text from images
59
- text_data = []
60
- for image in images:
61
- text = pytesseract.image_to_string(image)
62
- text_data.append(text)
63
-
64
- # Combine text
65
- full_text = "\n".join(text_data)
66
-
67
- # Generate text-based PDF
68
- output_pdf_path = os.path.join(temp_dir, "output.pdf")
69
- with open(output_pdf_path, "wb") as f:
70
- pdf_writer = PdfWriter()
71
- pdf_writer.add_metadata({
72
- "/Title": "OCR Converted PDF",
73
- "/Author": "OCR Application"
74
- })
75
- pdf_writer.write(f)
76
-
77
- # Generate Word document
78
- output_docx_path = os.path.join(temp_dir, "output.docx")
79
- doc = Document()
80
- doc.add_heading("OCR Converted Text", level=1)
81
- doc.add_paragraph(full_text)
82
- doc.save(output_docx_path)
83
-
84
- # Return file paths
85
- return output_pdf_path, output_docx_path
86
 
87
 
88
  def gradio_interface(file):
 
6
  from PyPDF2 import PdfWriter
7
  from docx import Document
8
  import gradio as gr
 
9
  import shutil
10
 
11
  # Define paths for dependencies
12
  POPPLER_PATH = "/usr/bin"
13
  TESSERACT_PATH = "/usr/bin/tesseract"
14
 
15
+ # Define a directory to store output files
16
+ OUTPUT_DIR = "./output_files"
17
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
18
+
19
  def install_dependencies():
20
  """Install Poppler and Tesseract if not already installed."""
21
  # Install Poppler if missing
 
49
  """Convert scanned PDF to text-based PDF and Word document using OCR."""
50
  install_dependencies() # Ensure dependencies are installed
51
 
52
+ input_pdf_path = input_pdf.name # Get file path
53
+
54
+ # Convert PDF to images
55
+ try:
56
+ images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
57
+ except Exception as e:
58
+ raise RuntimeError(f"Error during PDF to image conversion: {e}")
59
+
60
+ # Extract text from images
61
+ text_data = []
62
+ for image in images:
63
+ text = pytesseract.image_to_string(image)
64
+ text_data.append(text)
65
+
66
+ # Combine text
67
+ full_text = "\n".join(text_data)
68
+
69
+ # Generate text-based PDF
70
+ output_pdf_path = os.path.join(OUTPUT_DIR, "output.pdf")
71
+ with open(output_pdf_path, "wb") as f:
72
+ pdf_writer = PdfWriter()
73
+ pdf_writer.add_metadata({
74
+ "/Title": "OCR Converted PDF",
75
+ "/Author": "OCR Application"
76
+ })
77
+ pdf_writer.write(f)
78
+
79
+ # Generate Word document
80
+ output_docx_path = os.path.join(OUTPUT_DIR, "output.docx")
81
+ doc = Document()
82
+ doc.add_heading("OCR Converted Text", level=1)
83
+ doc.add_paragraph(full_text)
84
+ doc.save(output_docx_path)
85
+
86
+ # Return file paths
87
+ return output_pdf_path, output_docx_path
 
88
 
89
 
90
  def gradio_interface(file):