File size: 3,376 Bytes
d228197
271ed4d
c6c2ea3
d228197
 
36fa47a
fbbe150
d228197
36fa47a
271ed4d
 
6018995
 
 
271ed4d
1b4714b
6018995
 
 
271ed4d
 
 
 
 
 
 
 
 
 
c6c2ea3
6018995
 
 
 
 
 
 
 
 
 
 
 
 
 
c6c2ea3
 
fbbe150
6018995
271ed4d
1b4714b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36fa47a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b4714b
 
 
36fa47a
1b4714b
36fa47a
 
 
 
 
c6c2ea3
271ed4d
c6c2ea3
36fa47a
 
c6c2ea3
271ed4d
c6c2ea3
 
d228197
fbbe150
 
 
 
c6c2ea3
d228197
c6c2ea3
 
 
d6d8645
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import io
import os
import shutil
import subprocess
import tempfile

import gradio as gr
import pytesseract
from docx import Document
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfWriter, PdfReader

# Filesystem locations handed to the PDF/OCR libraries.
# POPPLER_PATH is passed to pdf2image's convert_from_path() as poppler_path
# (a directory, not a single executable).
POPPLER_PATH = "/usr/bin"
# TESSERACT_PATH is used by install_dependencies() when it sets
# pytesseract.pytesseract.tesseract_cmd (full path to the executable).
TESSERACT_PATH = "/usr/bin/tesseract"


def install_dependencies():
    """Install Poppler and Tesseract with apt-get when they are not on PATH.

    Poppler is detected through its ``pdfinfo`` executable and Tesseract
    through ``tesseract``. The package index is refreshed once, right before
    the first package that actually has to be installed, so a missing
    Tesseract alone also gets a fresh index. Finally pytesseract is pointed
    at the Tesseract executable.

    Raises:
        RuntimeError: if an apt-get command fails or cannot be started. The
            underlying exception is attached as ``__cause__``.
    """
    index_refreshed = False
    for label, probe, package in (
        ("Poppler", "pdfinfo", "poppler-utils"),
        ("Tesseract", "tesseract", "tesseract-ocr"),
    ):
        if shutil.which(probe):
            print(f"{label} is already installed.")
            continue

        print(f"{label} not found. Installing...")
        try:
            if not index_refreshed:
                subprocess.run(["apt-get", "update"], check=True)
                index_refreshed = True
            subprocess.run(["apt-get", "install", "-y", package], check=True)
        except (subprocess.SubprocessError, OSError) as e:
            # OSError covers a missing apt-get binary; SubprocessError covers
            # a non-zero exit status reported through check=True.
            raise RuntimeError(f"Error installing {label}: {e}") from e
        print(f"{label} installed successfully.")

    # Prefer the executable actually found on PATH; fall back to the
    # configured default only when the lookup finds nothing.
    pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or TESSERACT_PATH


def convert_pdf_to_text(input_pdf):
    """Run OCR over a scanned PDF and build a searchable PDF and a Word file.

    Each page is rasterised with pdf2image, then passed to Tesseract twice:
    once for plain text (used for the Word document) and once for a
    single-page PDF that carries the recognised text, which is appended to
    the output PDF.

    Args:
        input_pdf: A filesystem path (``str`` or ``os.PathLike``) or an
            object exposing the path as ``.name`` (presumably the upload
            wrapper Gradio passes for a ``gr.File`` input; verify against
            the installed Gradio version).

    Returns:
        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects, both
        rewound to position 0.

    Raises:
        RuntimeError: if no file was supplied, if installing the external
            tools fails, or if the PDF cannot be converted to images.
    """
    if input_pdf is None:
        raise RuntimeError("No PDF file was provided.")

    install_dependencies()  # Ensure dependencies are installed

    # Path-like inputs are used as-is; taking ``.name`` of a pathlib.Path
    # would silently reduce it to the bare file name.
    if isinstance(input_pdf, (str, os.PathLike)):
        input_pdf_path = os.fspath(input_pdf)
    else:
        input_pdf_path = input_pdf.name

    # Convert PDF to images
    try:
        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
    except Exception as e:
        raise RuntimeError(f"Error during PDF to image conversion: {e}") from e

    # XML 1.0 allows no control characters except tab, LF and CR. Tesseract's
    # default page separator is a form feed, which would otherwise end up in
    # the Word document's XML.
    xml_illegal = dict.fromkeys(c for c in range(32) if c not in (9, 10, 13))

    pdf_writer = PdfWriter()
    pdf_writer.add_metadata({
        "/Title": "OCR Converted PDF",
        "/Author": "OCR Application"
    })

    page_texts = []
    for image in images:
        page_texts.append(pytesseract.image_to_string(image).translate(xml_illegal))

        # Tesseract renders the page as a PDF that contains the recognised
        # text, so the combined output is actually text-based. Reading from
        # an in-memory buffer also avoids depending on a file handle that is
        # closed before the writer serialises its pages.
        page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
        for page in PdfReader(io.BytesIO(page_pdf)).pages:
            pdf_writer.add_page(page)

    # Combine text
    full_text = "\n".join(page_texts)

    # Generate text-based PDF in memory
    pdf_buffer = io.BytesIO()
    pdf_writer.write(pdf_buffer)

    # Generate Word document in memory
    docx_buffer = io.BytesIO()
    doc = Document()
    doc.add_heading("OCR Converted Text", level=1)
    doc.add_paragraph(full_text)
    doc.save(docx_buffer)

    # Rewind buffers so callers can read them from the start
    pdf_buffer.seek(0)
    docx_buffer.seek(0)

    return pdf_buffer, docx_buffer


def gradio_interface(file):
    """Gradio handler: OCR the uploaded PDF and return downloadable files.

    ``convert_pdf_to_text`` produces in-memory buffers, but the ``gr.File``
    output components are given file paths here, so each buffer is written
    into a fresh temporary directory first.

    Args:
        file: The value Gradio supplies for the ``gr.File`` input; passed to
            ``convert_pdf_to_text`` unchanged.

    Returns:
        A ``(pdf_path, docx_path)`` tuple of paths to the generated files.
    """
    pdf_output, docx_output = convert_pdf_to_text(file)

    # A dedicated directory per request keeps stable, readable download
    # names without clashing between concurrent conversions.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    saved_paths = []
    for filename, buffer in (
        ("ocr_output.pdf", pdf_output),
        ("ocr_output.docx", docx_output),
    ):
        path = os.path.join(output_dir, filename)
        with open(path, "wb") as out_file:
            # getvalue() returns the whole buffer regardless of its position.
            out_file.write(buffer.getvalue())
        saved_paths.append(path)

    return tuple(saved_paths)


# Web UI: one file upload in, two downloadable files out, all handled by
# gradio_interface.
iface = gr.Interface(
    title="OCR PDF Converter",
    description=(
        "Upload a scanned PDF, and this app will convert it into a "
        "text-based PDF and Word document using OCR."
    ),
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),
    outputs=[
        gr.File(label=caption)
        for caption in (
            "Download OCR-Processed PDF",
            "Download OCR-Processed Word Document",
        )
    ],
)

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()