# ocr_pdf / app.py
# Danielzapirtan's picture
# Update app.py
# 48027c9 verified
import gradio as gr
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import io
import tempfile
import os
def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of *pdf_file*, whatever form the upload arrived in."""
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            return handle.read()
    # File-like object: rewind first so a previous reader cannot leave us
    # with an empty or truncated payload.
    if hasattr(pdf_file, "seek"):
        pdf_file.seek(0)
    return pdf_file.read()


def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF, falling back to OCR for image-based documents.

    Direct text extraction through PyPDF2 is attempted first. Only when that
    yields nothing but whitespace are the pages rendered to images and passed
    through Tesseract.

    Args:
        pdf_file: The uploaded PDF as raw bytes, a filesystem path, or a
            binary file-like object. ``None`` means nothing was uploaded.

    Returns:
        A ``(result, status)`` tuple. ``result`` is the value produced by
        ``create_txt_file`` for the extracted text, or ``None`` when no file
        was supplied or processing failed. ``status`` is a human-readable
        message describing what happened.
    """
    if pdf_file is None:
        return None, "Please upload a PDF file first."

    try:
        # Read the upload exactly once; both the PyPDF2 pass and the OCR pass
        # work from this in-memory copy.
        pdf_bytes = _read_pdf_bytes(pdf_file)

        # Try direct text extraction first.
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        # `or ""` guards against a page yielding None instead of a string.
        extracted_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

        # Check if we got meaningful text (more than just whitespace).
        if extracted_text.strip():
            status = f"✓ Text extracted directly from PDF ({len(pdf_reader.pages)} pages)\nNo OCR needed - PDF contains searchable text."
            return create_txt_file(extracted_text), status

        # No text layer found: fall back to OCR.
        status_msg = "⚠ PDF appears to be image-based. Running OCR...\n"

        tmp_path = None
        try:
            # convert_from_path needs a real file on disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_path = tmp_file.name
                tmp_file.write(pdf_bytes)

            # Convert PDF pages to images, then OCR each page in order.
            images = convert_from_path(tmp_path)
            ocr_text = "".join(
                f"\n--- Page {page_no} ---\n{pytesseract.image_to_string(image)}\n"
                for page_no, image in enumerate(images, start=1)
            )
        finally:
            # Remove the temp file even when conversion or OCR fails.
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

        status_msg += f"✓ OCR completed on {len(images)} pages"
        return create_txt_file(ocr_text), status_msg
    except Exception as e:
        # Top-level UI boundary: report the failure in the status box
        # instead of letting the exception escape.
        return None, f"Error processing PDF: {str(e)}"
def create_txt_file(text):
    """
    Create a downloadable text file from the extracted text.

    The text is encoded as UTF-8 and written to a new temporary ``.txt``
    file. A real file on disk is produced (rather than raw bytes) so the
    result can be handed to a file-download component.

    Args:
        text: The extracted text to store.

    Returns:
        The filesystem path of the newly written ``.txt`` file. The file is
        created with ``delete=False`` and is therefore left on disk for the
        caller or the hosting framework to serve and clean up.
    """
    txt_bytes = text.encode('utf-8')
    # Binary mode keeps the bytes exactly as encoded (no newline translation).
    with tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".txt"
    ) as txt_file:
        txt_file.write(txt_bytes)
        return txt_file.name
# Markdown shown above the upload/convert controls.
_INTRO_MARKDOWN = """
# 📄 PDF to Text Converter with Smart OCR
Upload a PDF file and get downloadable text. The app automatically:
- Extracts text directly if the PDF contains searchable text
- Uses OCR (Optical Character Recognition) only for image-based PDFs
"""

# Markdown shown below the controls.
_USAGE_MARKDOWN = """
### How it works:
1. Upload your PDF file
2. Click "Convert to Text"
3. The app will extract text directly if possible, or use OCR if needed
4. Download the resulting .txt file
**Note:** OCR may take longer for large PDFs with many pages.
"""

# Build the Gradio UI: upload + button on the left, status + download on the
# right, with explanatory Markdown above and below.
# NOTE(review): the nesting below is reconstructed from statement order —
# confirm it matches the intended layout.
with gr.Blocks(title="PDF to Text Converter") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Input side.
        with gr.Column():
            pdf_input = gr.File(
                type="binary",
                file_types=[".pdf"],
                label="Upload PDF File",
            )
            convert_btn = gr.Button("Convert to Text", variant="primary")

        # Output side.
        with gr.Column():
            status_output = gr.Textbox(
                interactive=False,
                lines=3,
                label="Status",
            )
            txt_output = gr.File(
                type="binary",
                label="Download Text File",
            )

    gr.Markdown(_USAGE_MARKDOWN)

    # The handler returns (file, status), matching the order of `outputs`.
    convert_btn.click(
        fn=extract_text_from_pdf,
        outputs=[txt_output, status_output],
        inputs=pdf_input,
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()