# Image2OcrPdf / app.py
# arasuezofis: "Update app.py" (commit ba1c3af, verified)
import streamlit as st
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
import io
import os
from PyPDF2 import PdfReader, PdfWriter
# --- Tesseract language data location ---------------------------------
# Exported before any OCR call so Tesseract looks for its language data
# in this directory.
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"

# --- Page chrome ------------------------------------------------------
# set_page_config is issued before any other Streamlit command.
st.set_page_config(page_title="Image/PDF → Searchable PDF", layout="centered")
st.title("📄 Image / PDF to Searchable PDF (OCR)")
st.write("Upload an image (PNG/JPG/JPEG) or a PDF. The app will convert it into a searchable PDF using OCR.")
st.write("Supports English (eng), Hindi (hin), or both.")

# --- OCR language picker ----------------------------------------------
# Maps the label shown in the select box to the Tesseract language code.
language_options = {
    "English": "eng",
    "Hindi": "hin",
    "English + Hindi": "eng+hin",
}
selected_lang = st.selectbox("Select OCR Language", list(language_options))
lang_code = language_options[selected_lang]
# -----------------------
# Helper functions
# -----------------------
def image_to_searchable_pdf(image_obj: Image.Image, lang: str):
    """Run Tesseract OCR on a PIL image and return the searchable-PDF output.

    Args:
        image_obj: The image to recognize.
        lang: Tesseract language code(s), e.g. "eng" or "eng+hin".

    Returns:
        Whatever pytesseract produces for ``extension="pdf"`` (the PDF data).
    """
    searchable_pdf = pytesseract.image_to_pdf_or_hocr(
        image_obj,
        lang=lang,
        extension="pdf",
    )
    return searchable_pdf
def pdf_to_searchable_pdf(pdf_bytes: bytes, lang: str, dpi: int = 200) -> bytes:
    """Convert a (possibly multi-page) PDF into a single searchable PDF.

    Every page is rasterized, OCR'd by Tesseract into its own PDF, and the
    per-page results are merged in their original order.

    Args:
        pdf_bytes: Raw bytes of the input PDF.
        lang: Tesseract language code(s), e.g. "eng" or "eng+hin".
        dpi: Resolution used to rasterize the pages. The same value is
            forwarded to Tesseract so the generated pages keep the physical
            size of the source pages. Defaults to 200, which is the value
            pdf2image uses when no resolution is given.

    Returns:
        The merged searchable PDF as bytes.

    Raises:
        ValueError: If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    pages = convert_from_bytes(pdf_bytes, dpi=dpi)

    # Tell Tesseract the resolution the pages were rendered at. Without an
    # explicit value it may fall back to a guessed DPI, which skews the page
    # dimensions of the PDF it emits.
    tesseract_config = f"--dpi {dpi}"

    pdf_writer = PdfWriter()
    for page in pages:
        # OCR one page image into a standalone PDF.
        ocred_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
            page,
            extension="pdf",
            lang=lang,
            config=tesseract_config,
        )
        reader = PdfReader(io.BytesIO(ocred_pdf_bytes))
        # Append whatever pages the OCR output contains to the merged file.
        for ocr_page in reader.pages:
            pdf_writer.add_page(ocr_page)

    final_pdf = io.BytesIO()
    pdf_writer.write(final_pdf)
    return final_pdf.getvalue()
# -----------------------
# File uploader
# -----------------------
uploaded_file = st.file_uploader(
    "Upload Image or PDF", type=["png", "jpg", "jpeg", "pdf"]
)

# NOTE: Streamlit reruns this script on widget interactions, so the OCR below
# is executed again on each rerun while a file is still uploaded.
if uploaded_file is not None:
    file_bytes = uploaded_file.getvalue()
    # A missing MIME type is treated as an unsupported file rather than
    # letting the attribute access below fail.
    mime_type = uploaded_file.type or ""
    is_image = mime_type.startswith("image")
    is_pdf = mime_type == "application/pdf"

    # Reject unsupported uploads before any OCR work starts.
    if not (is_image or is_pdf):
        st.error("Unsupported file type")
        st.stop()

    try:
        # The spinner disappears once OCR finishes, so no stale "processing"
        # message is left next to the final status.
        with st.spinner("Processing file… This may take a few seconds…"):
            if is_image:
                # Close the image as soon as OCR is done with it.
                with Image.open(io.BytesIO(file_bytes)) as img:
                    result_pdf = image_to_searchable_pdf(img, lang_code)
            else:
                result_pdf = pdf_to_searchable_pdf(file_bytes, lang_code)
    except Exception as e:
        # Top-level boundary: report any decoding/OCR failure to the user.
        st.error(f"🚨 OCR failed: {e}")
    else:
        # Kept outside the try body so that only decoding/OCR errors are
        # reported as OCR failures.
        st.success("✅ OCR Completed!")
        st.download_button(
            label="📥 Download Searchable PDF",
            data=result_pdf,
            file_name="searchable.pdf",
            mime="application/pdf",
        )