# Page header captured with this file: Spaces (Sleeping), file size 981 Bytes, aab000f
import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
# Tesseract language code passed as ``lang=`` to pytesseract.image_to_string.
OCR_LANG = "guj"
def extract_gujarati_text(pdf_file, page_number):
    """Run OCR on a single page of an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF, given either as a path string or as an
            object that exposes the path through a ``.name`` attribute.
        page_number: 1-based page to process. Whole-number floats such as
            ``2.0`` are accepted because numeric UI fields may deliver them.

    Returns:
        A ``(text, image)`` tuple: the string returned by
        ``pytesseract.image_to_string`` for the page, and the rendered page
        image the OCR was run on.

    Raises:
        ValueError: If no file was supplied, if ``page_number`` is not a
            positive whole number, or if the PDF has no such page.
    """
    if pdf_file is None:
        raise ValueError("Please upload a PDF file first.")
    # Uploads may arrive as a plain path string or as a file wrapper whose
    # path lives in ``.name``; support both.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    try:
        page = int(page_number)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(f"Invalid page number: {page_number!r}") from None
    # Reject 0, negatives and fractional values instead of silently truncating.
    if page < 1 or (isinstance(page_number, float) and page != page_number):
        raise ValueError(
            f"Page number must be a positive whole number, got {page_number!r}"
        )

    # Render only the requested page rather than the whole document.
    images = convert_from_path(pdf_path, first_page=page, last_page=page)
    if not images:
        # Nothing was rendered, i.e. the requested page is past the end.
        raise ValueError(f"Page {page} does not exist in the uploaded PDF.")

    image = images[0]
    text = pytesseract.image_to_string(image, lang=OCR_LANG)
    return text, image  # OCR text first, then the page snapshot
# UI definition: a PDF upload and page selector feed the OCR handler, whose
# results are shown as a page snapshot next to the extracted text.
# NOTE(review): the leading symbols in the labels below look like mis-decoded
# emoji; confirm against the original file's encoding before changing them.
with gr.Blocks() as demo:
    gr.Markdown("## π Gujarati OCR from PDF (with Page Snapshot)")
    pdf = gr.File(label="π€ Upload Gujarati PDF", file_types=[".pdf"])
    page = gr.Number(label="π Page Number", minimum=1, value=1, step=1)
    button = gr.Button("π Extract Text")
    with gr.Row():
        image_output = gr.Image(label="πΌοΈ PDF Page Snapshot")
        text_output = gr.Textbox(label="π Extracted Gujarati Text", lines=20)
    # Output order must match the (text, image) tuple the handler returns.
    button.click(
        fn=extract_gujarati_text,
        inputs=[pdf, page],
        outputs=[text_output, image_output],
    )

# Launch only when run as a script, so importing this module (for example to
# reuse ``demo``) does not start a server as a side effect.
if __name__ == "__main__":
    demo.launch()