File size: 981 Bytes
aab000f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Language code handed to pytesseract.image_to_string via its `lang` argument.
# NOTE(review): "guj" presumably selects Tesseract's Gujarati trained data,
# which must be installed on the host — confirm in the deployment image.
OCR_LANG = "guj"

def extract_gujarati_text(pdf_file, page_number):
    """Render one page of an uploaded PDF and run OCR on it.

    Args:
        pdf_file: The upload from the ``gr.File`` component. Either an object
            exposing the file's path as ``.name`` or a plain path string.
        page_number: 1-based page to process. Accepts int or float (the
            ``gr.Number`` component may deliver either); it is truncated
            to an int before use.

    Returns:
        A ``(text, image)`` tuple: the OCR text produced by pytesseract with
        ``OCR_LANG`` and the rendered page image the text was read from.

    Raises:
        gr.Error: If no file was uploaded, the page number is missing,
            not a number, or below 1, or the page does not exist in the PDF.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    try:
        page_index = int(page_number)
    except (TypeError, ValueError, OverflowError):
        # Covers an emptied number field (None) and non-finite values.
        raise gr.Error("Please enter a valid page number.") from None
    if page_index < 1:
        raise gr.Error("Page number must be 1 or greater.")

    # Uploads may arrive as a file-like wrapper (with .name) or as a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    images = convert_from_path(pdf_path, first_page=page_index, last_page=page_index)
    if not images:
        # Nothing rendered means the requested page lies past the document end.
        raise gr.Error(f"Page {page_index} does not exist in this PDF.")

    image = images[0]
    text = pytesseract.image_to_string(image, lang=OCR_LANG)
    return text, image  # Returning both OCR text and snapshot

# Build the UI: upload and page selector on top, then the page snapshot and
# the OCR text in a single row.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Gujarati OCR from PDF (with Page Snapshot)")
    pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
    # precision=0 makes the component hand the callback an integer page number.
    page = gr.Number(label="📄 Page Number", minimum=1, value=1, step=1, precision=0)
    button = gr.Button("🔍 Extract Text")

    with gr.Row():
        image_output = gr.Image(label="🖼️ PDF Page Snapshot")
        text_output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)

    # Output order must match the (text, image) tuple returned by the handler.
    button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])

demo.launch()