"""Gradio app: OCR a single page of an uploaded PDF as Gujarati text."""

import os

import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Tesseract language code passed to pytesseract for recognition.
OCR_LANG = "guj"


def _resolve_pdf_path(pdf_file):
    """Return a filesystem path for the value delivered by ``gr.File``.

    The component may hand over either a plain path or a temp-file style
    object exposing the path as ``.name``; both are accepted.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def extract_gujarati_text(pdf_file, page_number):
    """Render one PDF page and run OCR on it.

    Args:
        pdf_file: Value from the ``gr.File`` input (path or object with
            a ``.name`` path attribute). ``None`` when nothing is uploaded.
        page_number: 1-based page number from the ``gr.Number`` input; may
            arrive as a float or ``None``.

    Returns:
        A ``(text, image)`` tuple: the OCR output string and the rendered
        page image that was fed to the OCR engine.

    Raises:
        gr.Error: If no file was uploaded, the page number is missing or
            below 1, or the requested page could not be rendered.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")
    if page_number is None:
        raise gr.Error("Please enter a page number.")

    # gr.Number can deliver a float; pdf2image expects integer page bounds.
    page_index = int(page_number)
    if page_index < 1:
        raise gr.Error("Page number must be 1 or greater.")

    images = convert_from_path(
        _resolve_pdf_path(pdf_file),
        first_page=page_index,
        last_page=page_index,
    )
    # No image is produced when the requested page is not in the document.
    if not images:
        raise gr.Error(f"Page {page_index} was not found in the PDF.")

    image = images[0]
    text = pytesseract.image_to_string(image, lang=OCR_LANG)
    return text, image  # Returning both OCR text and snapshot


with gr.Blocks() as demo:
    gr.Markdown("## 📚 Gujarati OCR from PDF (with Page Snapshot)")
    pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
    # precision=0 makes the component deliver whole numbers.
    page = gr.Number(
        label="📄 Page Number", minimum=1, value=1, step=1, precision=0
    )
    button = gr.Button("🔍 Extract Text")

    with gr.Row():
        image_output = gr.Image(label="🖼️ PDF Page Snapshot")
        text_output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)

    button.click(
        fn=extract_gujarati_text,
        inputs=[pdf, page],
        outputs=[text_output, image_output],
    )

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()