Spaces:
Sleeping
Sleeping
| import os | |
| import subprocess | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import pytesseract | |
| from PyPDF2 import PdfWriter, PdfReader | |
| from docx import Document | |
| import gradio as gr | |
| import io | |
| import shutil | |
# Locations of the external command-line tools used by the OCR pipeline.
# TESSERACT_PATH is the executable handed to pytesseract as ``tesseract_cmd``;
# POPPLER_PATH is the directory given to pdf2image as ``poppler_path``.
TESSERACT_PATH = "/usr/bin/tesseract"
POPPLER_PATH = "/usr/bin"
def _apt_install(package, label, refresh_index=True):
    """Install *package* through apt-get, printing progress under *label*.

    Args:
        package: Name of the apt package to install.
        label: Human-readable tool name used in the printed messages and
            in the error text.
        refresh_index: When true, run ``apt-get update`` first so that the
            install does not rely on a stale or missing package index.

    Raises:
        RuntimeError: If apt-get cannot be started or exits with a
            non-zero status. The underlying error is chained as the cause.
    """
    print(f"{label} not found. Installing...")
    try:
        if refresh_index:
            subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", package], check=True)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing or non-executable apt-get binary.
        raise RuntimeError(f"Error installing {label}: {e}") from e
    print(f"{label} installed successfully.")


def install_dependencies():
    """Install Poppler and Tesseract if not already installed.

    Each tool is looked up on PATH by a representative executable
    (``pdfinfo`` for Poppler, ``tesseract`` for Tesseract) and installed
    with apt-get when absent. Afterwards pytesseract is pointed at the
    tesseract executable.

    Raises:
        RuntimeError: If installing either package fails.
    """
    required_tools = (
        ("pdfinfo", "poppler-utils", "Poppler"),
        ("tesseract", "tesseract-ocr", "Tesseract"),
    )

    index_refreshed = False
    for executable, package, label in required_tools:
        if shutil.which(executable):
            print(f"{label} is already installed.")
            continue
        # Refresh the package index before the first install only; this also
        # covers the case where Poppler is present but Tesseract is missing.
        _apt_install(package, label, refresh_index=not index_refreshed)
        index_refreshed = True

    # Prefer the executable actually found on PATH; fall back to the
    # configured default when the lookup finds nothing.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which("tesseract") or TESSERACT_PATH
    )
# Code points that XML 1.0 does not allow in text: every C0 control character
# except tab (9), line feed (10) and carriage return (13). Mapping them to
# None makes ``str.translate`` delete them.
_XML_INVALID_CONTROL_CHARS = {
    code: None for code in range(32) if code not in (9, 10, 13)
}


def convert_pdf_to_text(input_pdf):
    """Convert scanned PDF to text-based PDF and Word document using OCR.

    Every page of the input is rendered to an image and run through
    Tesseract twice: once for plain text (collected into the Word document)
    and once for a PDF page carrying the recognised text, so that the
    returned PDF really contains the OCR result rather than a copy of the
    scanned pages.

    Args:
        input_pdf: Either a filesystem path (``str`` or ``os.PathLike``) or
            an object whose ``name`` attribute is the path of the PDF.

    Returns:
        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects,
        both rewound to position 0.

    Raises:
        ValueError: If ``input_pdf`` is ``None``.
        RuntimeError: If dependency installation or the PDF-to-image
            conversion fails.
    """
    install_dependencies()  # Ensure dependencies are installed

    if input_pdf is None:
        raise ValueError("No PDF file was provided.")

    # Accept a bare path as well as a file wrapper exposing ``.name``.
    # Paths are checked first because ``pathlib.Path.name`` is only the
    # basename, not the full path.
    if isinstance(input_pdf, (str, os.PathLike)):
        input_pdf_path = os.fspath(input_pdf)
    else:
        input_pdf_path = input_pdf.name

    # Convert PDF to images
    try:
        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
    except Exception as e:
        raise RuntimeError(f"Error during PDF to image conversion: {e}") from e

    pdf_writer = PdfWriter()
    text_data = []
    for image in images:
        # Plain text for the Word document. Control characters (Tesseract
        # may emit a form feed as a page separator) are removed because
        # they are not valid in the XML a .docx file is made of.
        page_text = pytesseract.image_to_string(image)
        text_data.append(page_text.translate(_XML_INVALID_CONTROL_CHARS))

        # PDF rendition of the same page with the recognised text embedded.
        page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
        for page in PdfReader(io.BytesIO(page_pdf)).pages:
            pdf_writer.add_page(page)

    pdf_writer.add_metadata({
        "/Title": "OCR Converted PDF",
        "/Author": "OCR Application",
    })

    # Combine text
    full_text = "\n".join(text_data)

    # Generate text-based PDF in memory
    pdf_buffer = io.BytesIO()
    pdf_writer.write(pdf_buffer)

    # Generate Word document in memory
    docx_buffer = io.BytesIO()
    doc = Document()
    doc.add_heading("OCR Converted Text", level=1)
    doc.add_paragraph(full_text)
    doc.save(docx_buffer)

    # Rewind buffers so callers can read them from the start
    pdf_buffer.seek(0)
    docx_buffer.seek(0)
    return pdf_buffer, docx_buffer
def gradio_interface(file):
    """Run the OCR conversion and return downloadable file paths.

    ``convert_pdf_to_text`` produces in-memory buffers, while the ``gr.File``
    output components this function feeds expect filesystem paths. The
    buffers are therefore written into a fresh temporary directory and the
    resulting paths are returned.

    Args:
        file: The uploaded PDF as supplied by the ``gr.File`` input.

    Returns:
        A ``(pdf_path, docx_path)`` tuple of paths to the generated files.
    """
    # Local import: tempfile is not part of this file's import block.
    import tempfile

    pdf_output, docx_output = convert_pdf_to_text(file)

    # A dedicated directory per request keeps concurrent conversions from
    # overwriting each other's files while allowing stable file names.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    pdf_path = os.path.join(output_dir, "ocr_output.pdf")
    docx_path = os.path.join(output_dir, "ocr_output.docx")

    for path, buffer in ((pdf_path, pdf_output), (docx_path, docx_output)):
        with open(path, "wb") as handle:
            handle.write(buffer.getvalue())

    return pdf_path, docx_path
# Web UI definition: one uploaded file goes to gradio_interface, whose two
# results are bound, in order, to the PDF and Word download components.
iface = gr.Interface(
    title="OCR PDF Converter",
    description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),
    outputs=[
        gr.File(label="Download OCR-Processed PDF"),
        gr.File(label="Download OCR-Processed Word Document"),
    ],
)

# Launch the app only when this file is executed as a script, so that
# importing the module does not start a server.
if __name__ == "__main__":
    iface.launch()