# app.py — commit 36fa47a (verified) by SathvikGanta: "Update app.py"
import os
import subprocess
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from PyPDF2 import PdfWriter, PdfReader
from docx import Document
import gradio as gr
import io
import shutil
# Locations of the external tools used for OCR.
# POPPLER_PATH is handed to pdf2image's convert_from_path() as `poppler_path`;
# TESSERACT_PATH is assigned to `pytesseract.pytesseract.tesseract_cmd`.
POPPLER_PATH = "/usr/bin"
TESSERACT_PATH = "/usr/bin/tesseract"
def install_dependencies():
    """Install Poppler and Tesseract with apt-get if they are not on PATH.

    Each tool is detected by looking up one of its executables with
    ``shutil.which`` (``pdfinfo`` for Poppler, ``tesseract`` for Tesseract).
    A missing tool is installed with ``apt-get install -y``. The package
    index is refreshed with ``apt-get update`` once, before the first
    installation, regardless of which tool is the one that is missing.

    Raises:
        RuntimeError: If an ``apt-get`` command cannot be started or exits
            with a non-zero status. The underlying error is chained as the
            cause.
    """
    # (display name, executable used for detection, apt package name)
    tools = (
        ("Poppler", "pdfinfo", "poppler-utils"),
        ("Tesseract", "tesseract", "tesseract-ocr"),
    )
    index_refreshed = False

    for label, executable, package in tools:
        if shutil.which(executable):
            print(f"{label} is already installed.")
            continue

        print(f"{label} not found. Installing...")
        try:
            # Refresh the index before the first install only; previously this
            # happened on the Poppler path alone, so installing just Tesseract
            # ran against whatever index happened to be present.
            if not index_refreshed:
                subprocess.run(["apt-get", "update"], check=True)
                index_refreshed = True
            subprocess.run(["apt-get", "install", "-y", package], check=True)
        except (subprocess.SubprocessError, OSError) as e:
            raise RuntimeError(f"Error installing {label}: {e}") from e
        print(f"{label} installed successfully.")
# Tell pytesseract which Tesseract executable to run (TESSERACT_PATH).
# NOTE(review): indentation was lost in this copy of the file; this statement
# may originally have been the last line of install_dependencies() — confirm.
pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
def convert_pdf_to_text(input_pdf):
    """Convert a scanned PDF into a searchable PDF and a Word document via OCR.

    Every page is rendered to an image with pdf2image. Tesseract is then run
    on each image twice: once for the plain text, which is collected into the
    Word document, and once to produce a single-page PDF carrying the
    recognised text, which is appended to the output PDF.

    Args:
        input_pdf: Path of the PDF (``str`` or ``os.PathLike``), or an
            uploaded-file object that exposes the path as ``.name``.

    Returns:
        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects, both
        rewound to position 0.

    Raises:
        ValueError: If ``input_pdf`` is ``None`` (no file supplied).
        RuntimeError: If a dependency cannot be installed or the PDF cannot
            be rendered to images.
    """
    if input_pdf is None:
        raise ValueError("No PDF file was provided.")

    install_dependencies()  # Ensure dependencies are installed

    # Accept both a plain path and a file wrapper exposing the path as `.name`.
    if isinstance(input_pdf, (str, os.PathLike)):
        input_pdf_path = os.fspath(input_pdf)
    else:
        input_pdf_path = input_pdf.name

    # Convert PDF to images
    try:
        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
    except Exception as e:
        raise RuntimeError(f"Error during PDF to image conversion: {e}") from e

    pdf_writer = PdfWriter()
    pdf_writer.add_metadata({
        "/Title": "OCR Converted PDF",
        "/Author": "OCR Application"
    })

    text_data = []
    # Keep every per-page reader referenced until the writer has serialised
    # its pages, so the page objects never outlive their source stream.
    page_readers = []
    for image in images:
        text_data.append(pytesseract.image_to_string(image))

        # Build the output PDF from OCR results. Copying the pages of the
        # input file (as was done before) just reproduced the scanned PDF.
        page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
        page_reader = PdfReader(io.BytesIO(page_pdf))
        page_readers.append(page_reader)
        for page in page_reader.pages:
            pdf_writer.add_page(page)

    # Combine text
    full_text = "\n".join(text_data)

    # Generate text-based PDF in memory
    pdf_buffer = io.BytesIO()
    pdf_writer.write(pdf_buffer)

    # OCR output can contain control characters (e.g. a form feed between
    # pages) that are not valid in the XML python-docx writes; drop everything
    # below U+0020 except tab, newline and carriage return.
    docx_text = "".join(
        ch for ch in full_text if ch in "\t\n\r" or ord(ch) >= 0x20
    )

    # Generate Word document in memory
    docx_buffer = io.BytesIO()
    doc = Document()
    doc.add_heading("OCR Converted Text", level=1)
    doc.add_paragraph(docx_text)
    doc.save(docx_buffer)

    # Rewind buffers so callers can read them from the start
    pdf_buffer.seek(0)
    docx_buffer.seek(0)
    return pdf_buffer, docx_buffer
def gradio_interface(file):
    """Gradio callback: OCR the uploaded PDF and return two downloadable files.

    ``convert_pdf_to_text`` produces in-memory buffers, whereas the
    ``gr.File`` output components are given file paths. Each buffer is
    therefore written to a temporary file and the path is returned instead.

    Args:
        file: The upload passed in by the ``gr.File`` input component.

    Returns:
        A ``(pdf_path, docx_path)`` tuple with the paths of the temporary
        files holding the OCR-processed PDF and the Word document.
    """
    import tempfile  # local import: only this callback needs it

    pdf_output, docx_output = convert_pdf_to_text(file)

    paths = []
    for buffer, suffix in ((pdf_output, ".pdf"), (docx_output, ".docx")):
        # delete=False: the file has to outlive this function so that it can
        # still be read when the download is served.
        with tempfile.NamedTemporaryFile(
            prefix="ocr_output_", suffix=suffix, delete=False
        ) as tmp:
            tmp.write(buffer.getvalue())
        paths.append(tmp.name)
    return tuple(paths)
# Gradio UI: one PDF upload in, two downloadable files out.
iface = gr.Interface(
    title="OCR PDF Converter",
    description=(
        "Upload a scanned PDF, and this app will convert it into a "
        "text-based PDF and Word document using OCR."
    ),
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),
    outputs=[
        gr.File(label="Download OCR-Processed PDF"),
        gr.File(label="Download OCR-Processed Word Document"),
    ],
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()