Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import pytesseract | |
OCR_LANG = "guj"  # language code handed to Tesseract via pytesseract (Gujarati)


def extract_gujarati_text(pdf_file, page_number):
    """Render one page of an uploaded PDF and run OCR on it.

    Args:
        pdf_file: The uploaded file. Either an object exposing the on-disk
            path as ``.name`` (what the original code relied on) or a plain
            path string.
        page_number: 1-based page to process. May arrive as an ``int`` or a
            whole-valued ``float`` (numeric UI inputs can deliver floats).

    Returns:
        A ``(text, image)`` tuple: the OCR text for the page and the rendered
        page image that the text was extracted from.

    Raises:
        ValueError: If no file was supplied, if ``page_number`` is missing,
            not a whole number or smaller than 1, or if the requested page
            could not be rendered (for example, it is past the end of the
            document).
    """
    if pdf_file is None:
        raise ValueError("Please upload a PDF file first.")

    # pdf2image forwards the page index to the renderer, so it has to be a
    # real int; reject None, NaN/inf and fractional values up front.
    try:
        page = int(page_number)
        is_whole = page == float(page_number)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Page number must be a whole number.") from None
    if not is_whole or page < 1:
        raise ValueError("Page number must be a whole number starting at 1.")

    # Accept both file-like wrappers (with .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    images = convert_from_path(pdf_path, first_page=page, last_page=page)
    if not images:
        # Nothing was rendered -- avoid an opaque IndexError on images[0].
        raise ValueError(f"Page {page} could not be rendered from this PDF.")

    image = images[0]
    text = pytesseract.image_to_string(image, lang=OCR_LANG)
    return text, image  # OCR text first, then the page snapshot
# Build the web UI: upload a PDF, pick a page, and show the page snapshot
# together with the text recognised on it.
# NOTE(review): the original indentation was not preserved; the nesting below
# (both outputs inside the row, click handler inside the Blocks context,
# launch at top level) is reconstructed -- confirm against the deployed app.
with gr.Blocks() as demo:
    gr.Markdown("## π Gujarati OCR from PDF (with Page Snapshot)")

    # Inputs
    pdf = gr.File(
        label="π€ Upload Gujarati PDF",
        file_types=[".pdf"],
    )
    page = gr.Number(
        label="π Page Number",
        minimum=1,
        value=1,
        step=1,
    )
    button = gr.Button("π Extract Text")

    # Outputs, shown side by side
    with gr.Row():
        image_output = gr.Image(label="πΌοΈ PDF Page Snapshot")
        text_output = gr.Textbox(
            label="π Extracted Gujarati Text",
            lines=20,
        )

    # The handler returns (text, image), so the outputs are listed in that order.
    button.click(
        fn=extract_gujarati_text,
        inputs=[pdf, page],
        outputs=[text_output, image_output],
    )

demo.launch()