# Spaces: Sleeping  (page residue captured with the source; not part of the program)
import hashlib
import io
import os

import pytesseract
import streamlit as st
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter

# -----------------------
# Tesseract data location
# -----------------------
# Unconditionally point TESSDATA_PREFIX at the Tesseract 5 data directory;
# any value already present in the environment is overwritten.
os.environ.update({"TESSDATA_PREFIX": "/usr/share/tesseract-ocr/5/tessdata/"})

# -----------------------
# Page setup and introductory text
# -----------------------
st.set_page_config(
    layout="centered",
    page_title="Image/PDF → Searchable PDF",
)
st.title("📄 Image / PDF to Searchable PDF (OCR)")

# Each entry is rendered as its own paragraph, in order.
for intro_line in (
    "Upload an image (PNG/JPG/JPEG) or a PDF. The app will convert it into a searchable PDF using OCR.",
    "Supports English (eng), Hindi (hin), or both.",
):
    st.write(intro_line)

# -----------------------
# OCR language choice
# -----------------------
# Maps the label shown in the select box to the language code handed to
# Tesseract; "+" joins several languages into one code.
language_options = {
    "English": "eng",
    "Hindi": "hin",
    "English + Hindi": "eng+hin",
}

# Iterating a dict yields its keys, so this lists the labels in the order
# they are declared above.
selected_lang = st.selectbox("Select OCR Language", list(language_options))
lang_code = language_options[selected_lang]

# -----------------------
# OCR helpers
# -----------------------
def image_to_searchable_pdf(image_obj: Image.Image, lang: str):
    """Run OCR on a single PIL image and return it as a searchable PDF.

    Args:
        image_obj: The image to recognize.
        lang: Tesseract language code(s), e.g. "eng" or "eng+hin".

    Returns:
        The output of ``pytesseract.image_to_pdf_or_hocr`` when asked for
        the "pdf" extension.
    """
    searchable_pdf = pytesseract.image_to_pdf_or_hocr(
        image_obj,
        lang=lang,
        extension="pdf",
    )
    return searchable_pdf


def pdf_to_searchable_pdf(pdf_bytes: bytes, lang: str, dpi: int = 200) -> bytes:
    """Convert a (possibly multi-page) PDF into one searchable PDF.

    Every page is rasterized, run through Tesseract OCR, and the per-page
    OCR PDFs are concatenated in their original order.

    Args:
        pdf_bytes: Raw content of the input PDF.
        lang: Tesseract language code(s), e.g. "eng" or "eng+hin".
        dpi: Resolution used when rasterizing the pages. The default of 200
            is the value pdf2image uses when none is given, so callers that
            omit it get the same rasterization as before.

    Returns:
        The merged searchable PDF as bytes.

    Raises:
        ValueError: If rasterization yields no pages.
    """
    pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    if not pages:
        # Without this guard an empty writer would be serialized and the
        # caller would report success for a PDF that has no pages at all.
        raise ValueError("The PDF contains no pages to OCR")

    pdf_writer = PdfWriter()
    for page in pages:
        # OCR one rasterized page; the result is itself a small PDF.
        ocred_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
            page, extension="pdf", lang=lang
        )
        for ocred_page in PdfReader(io.BytesIO(ocred_pdf_bytes)).pages:
            pdf_writer.add_page(ocred_page)

    final_pdf = io.BytesIO()
    pdf_writer.write(final_pdf)
    return final_pdf.getvalue()


# -----------------------
# Upload widget
# -----------------------
# File extensions the uploader accepts: three image formats plus PDF.
accepted_extensions = ["png", "jpg", "jpeg", "pdf"]
uploaded_file = st.file_uploader("Upload Image or PDF", type=accepted_extensions)

# -----------------------
# OCR the upload and offer the result for download
# -----------------------
if uploaded_file:
    file_bytes = uploaded_file.getvalue()

    # Streamlit re-executes the script on widget interaction, which includes
    # a click on the download button below. Keep the last OCR result in the
    # session, keyed by file content and language, so an unchanged upload is
    # not OCRed again on such a rerun.
    ocr_key = (hashlib.sha256(file_bytes).hexdigest(), lang_code)
    cached = st.session_state.get("ocr_cache")
    result_pdf = cached[1] if cached and cached[0] == ocr_key else None

    if result_pdf is None:
        mime_type = uploaded_file.type
        is_image = mime_type.startswith("image")
        if not is_image and mime_type != "application/pdf":
            # Handled outside the try block so that halting the script is
            # never reported as an OCR failure.
            st.error("Unsupported file type")
            st.stop()

        try:
            # The spinner is removed when the block exits, so the progress
            # message does not linger next to the success message.
            with st.spinner("Processing file… This may take a few seconds…"):
                if is_image:
                    with Image.open(io.BytesIO(file_bytes)) as img:
                        result_pdf = image_to_searchable_pdf(img, lang_code)
                else:
                    result_pdf = pdf_to_searchable_pdf(file_bytes, lang_code)
        except Exception as e:
            # Top-level boundary: show any OCR/conversion problem to the
            # user instead of crashing the app.
            result_pdf = None
            st.error(f"🚨 OCR failed: {e}")
        else:
            st.session_state["ocr_cache"] = (ocr_key, result_pdf)

    if result_pdf is not None:
        st.success("✅ OCR Completed!")
        st.download_button(
            label="📥 Download Searchable PDF",
            data=result_pdf,
            file_name="searchable.pdf",
            mime="application/pdf",
        )