import io
import os
import shutil
import subprocess
import tempfile

import gradio as gr
import pytesseract
from docx import Document
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter

# Fallback locations for the external binaries, used only when they cannot be
# located on PATH.
POPPLER_PATH = "/usr/bin"
TESSERACT_PATH = "/usr/bin/tesseract"

# Control characters that XML 1.0 forbids (everything below 0x20 except tab,
# newline and carriage return). OCR output may contain some of them (for
# example a form feed as page separator), and python-docx refuses to store
# them in a paragraph.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _apt_get(*args):
    """Run ``apt-get`` with *args*, raising if the command fails."""
    subprocess.run(["apt-get", *args], check=True)


def _ensure_package(binary, package, label, index_refreshed):
    """Install *package* with apt-get when *binary* is not on PATH.

    Args:
        binary: Executable name whose presence proves the package is there.
        package: apt package to install when the executable is missing.
        label: Human-readable name used in progress and error messages.
        index_refreshed: Whether ``apt-get update`` already ran in this pass.

    Returns:
        True if the package index has been refreshed (now or earlier).

    Raises:
        RuntimeError: If apt-get cannot be run or exits with an error.
    """
    if shutil.which(binary):
        print(f"{label} is already installed.")
        return index_refreshed

    print(f"{label} not found. Installing...")
    try:
        # Refresh the index before the first install of this pass, whichever
        # package that turns out to be.
        if not index_refreshed:
            _apt_get("update")
            index_refreshed = True
        _apt_get("install", "-y", package)
    except (subprocess.SubprocessError, OSError) as e:
        raise RuntimeError(f"Error installing {label}: {e}") from e
    print(f"{label} installed successfully.")
    return index_refreshed


def install_dependencies():
    """Install Poppler and Tesseract if not already installed.

    Afterwards pytesseract is pointed at the Tesseract executable found on
    PATH, falling back to ``TESSERACT_PATH`` when it cannot be located.

    Raises:
        RuntimeError: If installing either package fails.
    """
    index_refreshed = _ensure_package(
        "pdfinfo", "poppler-utils", "Poppler", False
    )
    _ensure_package("tesseract", "tesseract-ocr", "Tesseract", index_refreshed)

    # Ensure pytesseract uses the binary that is actually installed.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which("tesseract") or TESSERACT_PATH
    )


def _resolve_input_path(input_pdf):
    """Return the filesystem path for an uploaded file or a plain path.

    Gradio passes either a path string or a temp-file wrapper exposing
    ``.name``, depending on version and component settings.
    """
    if input_pdf is None:
        raise ValueError("No PDF file was provided.")
    # Path-like objects are checked first: pathlib.Path also has a ``.name``
    # attribute, but it holds only the final path component.
    if isinstance(input_pdf, (str, os.PathLike)):
        return os.fspath(input_pdf)
    return input_pdf.name


def convert_pdf_to_text(input_pdf):
    """Convert scanned PDF to text-based PDF and Word document using OCR.

    Each page is rendered to an image and run through Tesseract twice: once
    for the plain text that goes into the Word document, and once to build a
    PDF page that carries a text layer.

    Args:
        input_pdf: Path to the PDF, or an uploaded-file object whose ``.name``
            attribute is that path.

    Returns:
        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects, both
        rewound to position 0.

    Raises:
        ValueError: If *input_pdf* is None.
        RuntimeError: If dependency installation or page rendering fails.
    """
    input_pdf_path = _resolve_input_path(input_pdf)
    install_dependencies()  # Ensure dependencies are installed

    # Use the directory Poppler was actually found in; keep the configured
    # path as fallback.
    pdfinfo = shutil.which("pdfinfo")
    poppler_dir = os.path.dirname(pdfinfo) if pdfinfo else POPPLER_PATH

    # Convert PDF to images
    try:
        images = convert_from_path(input_pdf_path, poppler_path=poppler_dir)
    except Exception as e:
        raise RuntimeError(
            f"Error during PDF to image conversion: {e}"
        ) from e

    pdf_writer = PdfWriter()
    pdf_writer.add_metadata({
        "/Title": "OCR Converted PDF",
        "/Author": "OCR Application",
    })

    # OCR every page: collect its text and append its searchable PDF page.
    text_data = []
    page_readers = []  # keep the readers alive until the writer has written
    for image in images:
        text_data.append(pytesseract.image_to_string(image))
        page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
        reader = PdfReader(io.BytesIO(page_pdf))
        page_readers.append(reader)
        for page in reader.pages:
            pdf_writer.add_page(page)

    # Generate text-based PDF in memory
    pdf_buffer = io.BytesIO()
    pdf_writer.write(pdf_buffer)

    # Combine text, dropping characters a .docx paragraph cannot contain
    full_text = "\n".join(text_data).translate(_XML_ILLEGAL_CHARS)

    # Generate Word document in memory
    docx_buffer = io.BytesIO()
    doc = Document()
    doc.add_heading("OCR Converted Text", level=1)
    doc.add_paragraph(full_text)
    doc.save(docx_buffer)

    # Rewind buffers
    pdf_buffer.seek(0)
    docx_buffer.seek(0)
    return pdf_buffer, docx_buffer


def gradio_interface(file):
    """Run the OCR conversion and return download paths for Gradio.

    The ``gr.File`` outputs take file paths, so the in-memory results are
    written into a fresh temporary directory first.

    Args:
        file: The upload handed over by the ``gr.File`` input.

    Returns:
        A ``(pdf_path, docx_path)`` tuple of paths to the generated files.
    """
    pdf_output, docx_output = convert_pdf_to_text(file)

    # The files must outlive this call so they can be served for download,
    # which is why the directory is not removed here.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    pdf_path = os.path.join(output_dir, "ocr_output.pdf")
    docx_path = os.path.join(output_dir, "ocr_output.docx")
    for path, buffer in ((pdf_path, pdf_output), (docx_path, docx_output)):
        with open(path, "wb") as out_file:
            out_file.write(buffer.getvalue())
    return pdf_path, docx_path


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),
    outputs=[
        gr.File(label="Download OCR-Processed PDF"),
        gr.File(label="Download OCR-Processed Word Document"),
    ],
    title="OCR PDF Converter",
    description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
)

if __name__ == "__main__":
    iface.launch()