Spaces:

SathvikGanta
/

Scaned_doc_typed

Sleeping

App Files Files Community

Scaned_doc_typed / app.py

SathvikGanta

Update app.py

36fa47a verified about 1 year ago

raw

history blame contribute delete

3.38 kB

	import os
	import subprocess
	from pdf2image import convert_from_path
	from PIL import Image
	import pytesseract
	from PyPDF2 import PdfWriter, PdfReader
	from docx import Document
	import gradio as gr
	import io
	import shutil

	# Filesystem locations of the external command-line tools used by this app.
	# POPPLER_PATH: directory passed to pdf2image's convert_from_path() as `poppler_path`.
	POPPLER_PATH = "/usr/bin"
	# TESSERACT_PATH: Tesseract executable path used when configuring pytesseract.
	TESSERACT_PATH = "/usr/bin/tesseract"


	def install_dependencies():
	    """Ensure the Poppler and Tesseract command-line tools are available.

	    Each tool is looked up on ``PATH`` with ``shutil.which`` and installed via
	    ``apt-get`` only when it is missing. The package index is refreshed once,
	    before the first install of the call, so that installing Tesseract alone
	    does not run against a stale or empty index.

	    Afterwards pytesseract is pointed at the Tesseract executable found on
	    ``PATH``, falling back to ``TESSERACT_PATH`` when none is found.

	    Raises:
	        RuntimeError: If an ``apt-get`` command cannot be started or exits
	            with a non-zero status.
	    """
	    index_refreshed = False

	    # (display name, executable probed on PATH, apt package providing it)
	    required_tools = (
	        ("Poppler", "pdfinfo", "poppler-utils"),
	        ("Tesseract", "tesseract", "tesseract-ocr"),
	    )

	    for label, executable, package in required_tools:
	        if shutil.which(executable):
	            print(f"{label} is already installed.")
	            continue

	        print(f"{label} not found. Installing...")
	        try:
	            if not index_refreshed:
	                subprocess.run(["apt-get", "update"], check=True)
	                index_refreshed = True
	            subprocess.run(["apt-get", "install", "-y", package], check=True)
	        except (OSError, subprocess.SubprocessError) as e:
	            # OSError: apt-get itself could not be executed;
	            # SubprocessError: it ran but reported failure.
	            raise RuntimeError(f"Error installing {label}: {e}") from e
	        print(f"{label} installed successfully.")

	    # Prefer the binary actually present on PATH; the hard-coded location is
	    # only a fallback for environments where the lookup finds nothing.
	    pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or TESSERACT_PATH


	def _strip_xml_illegal_chars(text):
	    """Remove control characters that XML 1.0 (and therefore .docx) cannot hold."""
	    # Tab, newline and carriage return are the only control characters kept.
	    illegal = {code: None for code in range(32) if code not in (9, 10, 13)}
	    return text.translate(illegal)


	def convert_pdf_to_text(input_pdf):
	    """Convert a scanned PDF into a searchable PDF and a Word document via OCR.

	    Every page of the input is rendered to an image and passed through
	    Tesseract twice: once for plain text (used for the Word document) and once
	    for a single-page searchable PDF (merged into the output PDF).

	    Args:
	        input_pdf: Either a filesystem path (``str`` / ``os.PathLike``) or an
	            uploaded-file object exposing the path as its ``name`` attribute.

	    Returns:
	        A ``(pdf_buffer, docx_buffer)`` tuple of ``io.BytesIO`` objects, both
	        rewound to position 0.

	    Raises:
	        ValueError: If ``input_pdf`` is ``None``.
	        RuntimeError: If dependency installation or the PDF-to-image
	            conversion fails.
	    """
	    if input_pdf is None:
	        raise ValueError("No PDF file was provided.")

	    install_dependencies()  # Ensure dependencies are installed

	    # Uploads may arrive as a plain path or as a file wrapper whose `.name`
	    # is the path, so both forms are accepted.
	    if isinstance(input_pdf, (str, os.PathLike)):
	        input_pdf_path = os.fspath(input_pdf)
	    else:
	        input_pdf_path = input_pdf.name

	    # Convert PDF to images
	    try:
	        images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
	    except Exception as e:
	        raise RuntimeError(f"Error during PDF to image conversion: {e}") from e

	    pdf_writer = PdfWriter()
	    pdf_writer.add_metadata({
	        "/Title": "OCR Converted PDF",
	        "/Author": "OCR Application",
	    })

	    # OCR each page: collect its text and append its searchable-PDF rendering,
	    # so the output PDF carries the recognized text rather than being a
	    # page-for-page copy of the scanned input.
	    page_texts = []
	    for image in images:
	        page_texts.append(pytesseract.image_to_string(image))
	        page_pdf_bytes = pytesseract.image_to_pdf_or_hocr(image, extension="pdf")
	        for page in PdfReader(io.BytesIO(page_pdf_bytes)).pages:
	            pdf_writer.add_page(page)

	    pdf_buffer = io.BytesIO()
	    pdf_writer.write(pdf_buffer)

	    # OCR output can contain control characters (e.g. form feeds) that the
	    # XML-based .docx format cannot store, so they are dropped before writing.
	    full_text = _strip_xml_illegal_chars("\n".join(page_texts))

	    # Generate Word document in memory
	    docx_buffer = io.BytesIO()
	    doc = Document()
	    doc.add_heading("OCR Converted Text", level=1)
	    doc.add_paragraph(full_text)
	    doc.save(docx_buffer)

	    # Rewind buffers so callers can read them from the start
	    pdf_buffer.seek(0)
	    docx_buffer.seek(0)

	    return pdf_buffer, docx_buffer


	def _write_buffer_to_temp_file(buffer, suffix):
	    """Persist an in-memory buffer to a named temporary file and return its path."""
	    import tempfile  # local import: tempfile is not among the file-level imports

	    # delete=False keeps the file on disk after the handle closes so it can be
	    # served for download; the files are not cleaned up here.
	    with tempfile.NamedTemporaryFile(
	        prefix="ocr_output_", suffix=suffix, delete=False
	    ) as handle:
	        handle.write(buffer.getvalue())
	    return handle.name


	def gradio_interface(file):
	    """Gradio callback: run OCR on the uploaded PDF and return downloadable files.

	    ``gr.File`` output components expect file paths rather than in-memory
	    buffers, so the buffers produced by ``convert_pdf_to_text`` are written to
	    temporary files first.

	    Args:
	        file: The uploaded PDF as supplied by the ``gr.File`` input component.

	    Returns:
	        A ``(pdf_path, docx_path)`` tuple of temporary-file paths.
	    """
	    pdf_output, docx_output = convert_pdf_to_text(file)
	    return (
	        _write_buffer_to_temp_file(pdf_output, ".pdf"),
	        _write_buffer_to_temp_file(docx_output, ".docx"),
	    )


	# Web UI definition: a single PDF upload goes in, two downloadable files
	# (processed PDF and Word document) come out of gradio_interface.
	iface = gr.Interface(
	    title="OCR PDF Converter",
	    description="Upload a scanned PDF, and this app will convert it into a text-based PDF and Word document using OCR.",
	    fn=gradio_interface,
	    inputs=gr.File(label="Upload Scanned PDF"),
	    outputs=[
	        gr.File(label="Download OCR-Processed PDF"),
	        gr.File(label="Download OCR-Processed Word Document"),
	    ],
	)

	# Launch the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    iface.launch()