Spaces:

Danielzapirtan
/

ocr_pdf

Build error

App Files Files Community

ocr_pdf / app.py

Danielzapirtan

Update app.py

48027c9 verified 3 months ago

raw

history blame contribute delete

3.64 kB

	import gradio as gr
	import PyPDF2
	from pdf2image import convert_from_path
	import pytesseract
	from PIL import Image
	import io
	import tempfile
	import os

	def _read_pdf_bytes(pdf_file):
	    """Return the raw PDF bytes for an upload in any supported form.

	    Accepts ``bytes``/``bytearray`` (what a ``gr.File(type="binary")``
	    input hands to the callback), a filesystem path, or a readable
	    file-like object.
	    """
	    if isinstance(pdf_file, (bytes, bytearray)):
	        return bytes(pdf_file)
	    if isinstance(pdf_file, (str, os.PathLike)):
	        with open(pdf_file, "rb") as handle:
	            return handle.read()
	    # File-like object: rewind first so an earlier read does not leave us
	    # at EOF and silently produce an empty document.
	    if hasattr(pdf_file, "seek"):
	        pdf_file.seek(0)
	    return pdf_file.read()


	def extract_text_from_pdf(pdf_file):
	    """Extract text from a PDF, falling back to OCR for image-based files.

	    Direct text extraction is attempted first. If that yields nothing but
	    whitespace, every page is rendered to an image and run through
	    Tesseract.

	    Args:
	        pdf_file: The uploaded PDF as raw bytes, a path, or a file-like
	            object; ``None`` when nothing was uploaded.

	    Returns:
	        A ``(file, status)`` tuple. ``file`` is whatever ``create_txt_file``
	        returns for the extracted text, or ``None`` when there was no
	        upload or processing failed. ``status`` is a human-readable
	        message describing what happened.
	    """
	    if pdf_file is None:
	        return None, "Please upload a PDF file first."

	    try:
	        # Read the upload once; both the direct and the OCR path reuse it.
	        pdf_bytes = _read_pdf_bytes(pdf_file)

	        # Try direct text extraction first
	        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
	        page_count = len(pdf_reader.pages)
	        # extract_text() may return None for pages without a text layer.
	        extracted_text = "".join(
	            page.extract_text() or "" for page in pdf_reader.pages
	        )

	        # Check if we got meaningful text (more than just whitespace)
	        if extracted_text.strip():
	            status = f"✓ Text extracted directly from PDF ({page_count} pages)\nNo OCR needed - PDF contains searchable text."
	            return create_txt_file(extracted_text), status

	        # If no text found, use OCR
	        status_msg = "⚠ PDF appears to be image-based. Running OCR...\n"

	        # pdf2image's convert_from_path needs a real file on disk.
	        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
	        tmp_path = tmp_file.name
	        try:
	            with tmp_file:
	                tmp_file.write(pdf_bytes)

	            # Convert PDF pages to images
	            images = convert_from_path(tmp_path)

	            # Perform OCR on each page
	            ocr_text = "".join(
	                f"\n--- Page {number} ---\n{pytesseract.image_to_string(image)}\n"
	                for number, image in enumerate(images, start=1)
	            )
	        finally:
	            # Remove the temp file even when conversion or OCR fails.
	            os.unlink(tmp_path)

	        status_msg += f"✓ OCR completed on {len(images)} pages"
	        return create_txt_file(ocr_text), status_msg

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of
	        # letting the callback crash.
	        return None, f"Error processing PDF: {str(e)}"

	def create_txt_file(text):
	    """Write the extracted text to a temporary ``.txt`` file for download.

	    The result of this function is passed to a ``gr.File`` output, which
	    takes a file path rather than raw bytes, so the text is persisted to
	    disk and the path is returned.

	    Args:
	        text: The text to store; it is encoded as UTF-8.

	    Returns:
	        The path (``str``) of the newly created ``.txt`` file.
	    """
	    # delete=False: the file must outlive this call so it can be served.
	    with tempfile.NamedTemporaryFile(
	        mode="wb", suffix=".txt", delete=False
	    ) as handle:
	        handle.write(text.encode('utf-8'))
	    return handle.name

	# Create Gradio interface
	# NOTE(review): the nesting below was rebuilt from statement order (two
	# columns side by side in one row) - confirm against the intended layout.
	with gr.Blocks(title="PDF to Text Converter") as demo:
	    gr.Markdown(
	        """
	        # 📄 PDF to Text Converter with Smart OCR

	        Upload a PDF file and get downloadable text. The app automatically:
	        - Extracts text directly if the PDF contains searchable text
	        - Uses OCR (Optical Character Recognition) only for image-based PDFs
	        """
	    )

	    with gr.Row():
	        # Left column: upload widget and the button that starts conversion.
	        with gr.Column():
	            pdf_input = gr.File(
	                label="Upload PDF File",
	                file_types=[".pdf"],
	                type="binary"
	            )
	            convert_btn = gr.Button("Convert to Text", variant="primary")

	        # Right column: status message and the resulting download.
	        with gr.Column():
	            status_output = gr.Textbox(
	                label="Status",
	                lines=3,
	                interactive=False
	            )
	            txt_output = gr.File(
	                label="Download Text File",
	                type="binary"
	            )

	    gr.Markdown(
	        """
	        ### How it works:
	        1. Upload your PDF file
	        2. Click "Convert to Text"
	        3. The app will extract text directly if possible, or use OCR if needed
	        4. Download the resulting .txt file

	        Note: OCR may take longer for large PDFs with many pages.
	        """
	    )

	    # The handler returns (file, status), matching the order of `outputs`.
	    convert_btn.click(
	        fn=extract_text_from_pdf,
	        inputs=pdf_input,
	        outputs=[txt_output, status_output]
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()