Spaces:
Sleeping
Sleeping
File size: 3,376 Bytes
d228197 271ed4d c6c2ea3 d228197 36fa47a fbbe150 d228197 36fa47a 271ed4d 6018995 271ed4d 1b4714b 6018995 271ed4d c6c2ea3 6018995 c6c2ea3 fbbe150 6018995 271ed4d 1b4714b 36fa47a 1b4714b 36fa47a 1b4714b 36fa47a c6c2ea3 271ed4d c6c2ea3 36fa47a c6c2ea3 271ed4d c6c2ea3 d228197 fbbe150 c6c2ea3 d228197 c6c2ea3 d6d8645 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import os
import subprocess
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from PyPDF2 import PdfWriter, PdfReader
from docx import Document
import gradio as gr
import io
import shutil
# Filesystem locations of the external OCR tools.
# POPPLER_PATH is the directory passed to pdf2image as ``poppler_path``;
# TESSERACT_PATH is the tesseract executable handed to pytesseract.
# NOTE(review): both assume a /usr/bin install (e.g. via apt-get) — confirm
# for other hosts.
POPPLER_PATH = "/usr/bin"
TESSERACT_PATH = "/usr/bin/tesseract"
def _apt_install(package, label, *, refresh_index=True):
    """Install *package* with apt-get, printing progress under the name *label*.

    Args:
        package: Name of the apt package to install.
        label: Human-readable tool name used in progress and error messages.
        refresh_index: Run ``apt-get update`` first. Needed at least once per
            process, because ``apt-get install`` can fail to locate packages
            when the local package index is empty or stale.

    Raises:
        RuntimeError: If apt-get is missing or exits with a non-zero status.
    """
    print(f"{label} not found. Installing...")
    try:
        if refresh_index:
            subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", package], check=True)
    except (subprocess.SubprocessError, OSError) as e:
        # Chain the cause so the underlying apt-get failure stays visible.
        raise RuntimeError(f"Error installing {label}: {e}") from e
    print(f"{label} installed successfully.")


def install_dependencies():
    """Install Poppler and Tesseract if not already installed.

    Each tool is looked up on ``PATH`` (``pdfinfo`` for Poppler, ``tesseract``
    for Tesseract) and installed with apt-get only when missing. Afterwards
    pytesseract is pointed at the tesseract executable.

    Raises:
        RuntimeError: If installing either tool fails.
    """
    index_refreshed = False

    # Install Poppler if missing
    if shutil.which("pdfinfo"):
        print("Poppler is already installed.")
    else:
        _apt_install("poppler-utils", "Poppler")
        index_refreshed = True

    # Install Tesseract if missing. The package index is refreshed here too
    # unless the Poppler branch already did it, so that a host with Poppler
    # but without Tesseract does not install from an empty index.
    if shutil.which("tesseract"):
        print("Tesseract is already installed.")
    else:
        _apt_install(
            "tesseract-ocr", "Tesseract", refresh_index=not index_refreshed
        )

    # Ensure pytesseract uses the correct path: prefer the executable actually
    # found on PATH and fall back to the configured default location.
    pytesseract.pytesseract.tesseract_cmd = (
        shutil.which("tesseract") or TESSERACT_PATH
    )
def convert_pdf_to_text(input_pdf):
    """Convert a scanned PDF into a searchable PDF and a Word document via OCR.

    Every page is rasterised with pdf2image and run through Tesseract twice:
    once for plain text (collected into the Word document) and once for a
    single-page searchable PDF (image plus text layer), which are merged into
    the output PDF.

    Args:
        input_pdf: The uploaded PDF, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path
            through a ``.name`` attribute.

    Returns:
        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects, both
        rewound to offset 0.

    Raises:
        ValueError: If ``input_pdf`` is ``None``.
        RuntimeError: If a dependency cannot be installed or the PDF cannot
            be converted to images.
    """
    if input_pdf is None:
        raise ValueError("No PDF file was provided.")

    install_dependencies()  # Ensure dependencies are installed

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    if isinstance(input_pdf, (str, os.PathLike)):
        input_pdf_path = os.fspath(input_pdf)
    else:
        input_pdf_path = input_pdf.name

    # Convert PDF to images
    try:
        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
    except Exception as e:
        raise RuntimeError(f"Error during PDF to image conversion: {e}") from e

    pdf_writer = PdfWriter()
    pdf_writer.add_metadata({
        "/Title": "OCR Converted PDF",
        "/Author": "OCR Application"
    })

    # OCR each page: plain text for the Word document, searchable PDF page
    # for the output PDF. Copying the input pages instead would reproduce the
    # scan without any text layer.
    text_data = []
    page_readers = []  # keep readers referenced until the writer has written
    for image in images:
        text_data.append(pytesseract.image_to_string(image))
        page_pdf = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
        page_reader = PdfReader(io.BytesIO(page_pdf))
        page_readers.append(page_reader)
        for page in page_reader.pages:
            pdf_writer.add_page(page)

    # Generate text-based PDF in memory
    pdf_buffer = io.BytesIO()
    pdf_writer.write(pdf_buffer)

    # Combine text and drop control characters that XML 1.0 forbids (all
    # below 0x20 except tab, LF and CR). Tesseract output commonly carries a
    # form feed per page, which the Word document writer cannot serialise.
    xml_illegal = {
        code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    }
    full_text = "\n".join(text_data).translate(xml_illegal)

    # Generate Word document in memory
    docx_buffer = io.BytesIO()
    doc = Document()
    doc.add_heading("OCR Converted Text", level=1)
    doc.add_paragraph(full_text)
    doc.save(docx_buffer)

    # Rewind buffers
    pdf_buffer.seek(0)
    docx_buffer.seek(0)
    return pdf_buffer, docx_buffer
def gradio_interface(file):
    """Gradio callback: OCR the uploaded PDF and return downloadable files.

    Args:
        file: The upload delivered by the ``gr.File`` input; forwarded
            unchanged to ``convert_pdf_to_text``.

    Returns:
        A ``(pdf_path, docx_path)`` tuple of filesystem paths. The ``gr.File``
        output components take file paths rather than in-memory buffers, so
        the converted documents are written to a fresh temporary directory.
    """
    import tempfile  # local import: only this callback needs it

    pdf_output, docx_output = convert_pdf_to_text(file)

    # A dedicated directory per request keeps concurrent conversions from
    # overwriting each other's output files.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    pdf_path = os.path.join(output_dir, "ocr_output.pdf")
    docx_path = os.path.join(output_dir, "ocr_output.docx")

    for path, buffer in ((pdf_path, pdf_output), (docx_path, docx_output)):
        with open(path, "wb") as out_file:
            out_file.write(buffer.getvalue())

    return pdf_path, docx_path
# Gradio UI definition: a single file upload as input and two file downloads
# as outputs (the processed PDF and the Word document), served by
# gradio_interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload Scanned PDF"),
    outputs=[
        gr.File(label="Download OCR-Processed PDF"),
        gr.File(label="Download OCR-Processed Word Document")
    ],
    title="OCR PDF Converter",
    description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR."
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|