"""Streamlit app: convert an uploaded image or PDF into a searchable PDF via OCR."""

import hashlib
import io
import os

import pytesseract
import streamlit as st
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter

# -----------------------
# Set Tesseract data path
# -----------------------
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata/"

# -----------------------
# Streamlit page config
# -----------------------
# NOTE: set_page_config stays the first Streamlit call in the script.
st.set_page_config(page_title="Image/PDF → Searchable PDF", layout="centered")
st.title("📄 Image / PDF to Searchable PDF (OCR)")
st.write(
    "Upload an image (PNG/JPG/JPEG) or a PDF. The app will convert it into a searchable PDF using OCR."
)
st.write("Supports English (eng), Hindi (hin), or both.")

# -----------------------
# Language selection mapping
# -----------------------
# Maps the label shown in the select box to the Tesseract language code.
language_options = {
    "English": "eng",
    "Hindi": "hin",
    "English + Hindi": "eng+hin",
}
selected_lang = st.selectbox("Select OCR Language", list(language_options.keys()))
lang_code = language_options[selected_lang]

# Session-state slot holding ``(cache_key, pdf_bytes)`` for the most recent
# successful conversion. Only the latest result is kept.
_RESULT_SLOT = "ocr_result"


# -----------------------
# Helper functions
# -----------------------
def image_to_searchable_pdf(image_obj: Image.Image, lang: str):
    """Convert a PIL image into a searchable PDF.

    Args:
        image_obj: The image to run OCR on.
        lang: Tesseract language code, e.g. ``"eng"`` or ``"eng+hin"``.

    Returns:
        The PDF produced by ``pytesseract.image_to_pdf_or_hocr``.
    """
    return pytesseract.image_to_pdf_or_hocr(image_obj, extension="pdf", lang=lang)


def pdf_to_searchable_pdf(pdf_bytes: bytes, lang: str):
    """Convert a (possibly multi-page) PDF into a single searchable PDF.

    Each page is rasterized with ``convert_from_bytes``, OCRed on its own, and
    the per-page results are merged into one document.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        lang: Tesseract language code, e.g. ``"eng"`` or ``"eng+hin"``.

    Returns:
        The merged searchable PDF as bytes.

    Raises:
        ValueError: If rasterizing the PDF yields no pages.
    """
    pages = convert_from_bytes(pdf_bytes)
    if not pages:
        # Without this guard the writer would emit a page-less PDF.
        raise ValueError("The uploaded PDF contains no pages.")

    pdf_writer = PdfWriter()
    for page in pages:
        # OCR each page, reusing the single-image helper.
        reader = PdfReader(io.BytesIO(image_to_searchable_pdf(page, lang)))
        for ocr_page in reader.pages:
            pdf_writer.add_page(ocr_page)

    with io.BytesIO() as final_pdf:
        pdf_writer.write(final_pdf)
        return final_pdf.getvalue()


# -----------------------
# File uploader
# -----------------------
uploaded_file = st.file_uploader(
    "Upload Image or PDF",
    type=["png", "jpg", "jpeg", "pdf"],
)

if uploaded_file:
    file_bytes = uploaded_file.getvalue()
    # Treat a missing MIME type as unsupported rather than failing on None.
    mime_type = uploaded_file.type or ""
    is_image = mime_type.startswith("image")

    if not is_image and mime_type != "application/pdf":
        st.error("Unsupported file type")
        st.stop()

    # Streamlit reruns the whole script on every interaction, including a
    # click on the download button. Key the result by content, type and
    # language so an unchanged request does not repeat the OCR work.
    cache_key = (hashlib.sha256(file_bytes).hexdigest(), mime_type, lang_code)
    cached = st.session_state.get(_RESULT_SLOT)
    result_pdf = cached[1] if cached is not None and cached[0] == cache_key else None

    if result_pdf is None:
        try:
            # The spinner disappears when the work finishes, so no stale
            # "processing" banner is left beside the success message.
            with st.spinner("Processing file… This may take a few seconds…"):
                if is_image:
                    with Image.open(io.BytesIO(file_bytes)) as img:
                        result_pdf = image_to_searchable_pdf(img, lang_code)
                else:
                    result_pdf = pdf_to_searchable_pdf(file_bytes, lang_code)
        except Exception as e:  # top-level UI boundary: report, don't crash
            result_pdf = None
            st.error(f"🚨 OCR failed: {e}")
        else:
            st.session_state[_RESULT_SLOT] = (cache_key, result_pdf)

    if result_pdf is not None:
        st.success("✅ OCR Completed!")
        st.download_button(
            label="📥 Download Searchable PDF",
            data=result_pdf,
            file_name="searchable.pdf",
            mime="application/pdf",
        )