Pdf-Extractor / app(backup).py
lolhaha002's picture
Upload app(backup).py
aab000f verified
raw
history blame contribute delete
981 Bytes
import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
# Tesseract language code handed to pytesseract as ``lang`` by
# extract_gujarati_text; "guj" selects Gujarati.
# NOTE(review): presumably requires the Gujarati traineddata to be installed
# alongside Tesseract -- confirm for the deployment environment.
OCR_LANG = "guj"
def extract_gujarati_text(pdf_file, page_number):
    """Render one page of an uploaded PDF and run OCR on it.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path string or an
            object exposing the path as ``.name``.
        page_number: 1-based page to extract. Integral floats such as
            ``2.0`` are accepted and converted to ``int``.

    Returns:
        A ``(text, image)`` tuple: the OCR text produced with ``OCR_LANG``
        and the rendered page image the text was read from.

    Raises:
        ValueError: If no file was supplied, the page number is missing,
            non-integral or smaller than 1, or the page does not exist in
            the document.
    """
    if pdf_file is None:
        raise ValueError("Please upload a PDF file first.")

    # NOTE(review): Gradio's File component appears to pass a plain path
    # string in newer releases and a temp-file wrapper (path in ``.name``)
    # in older ones -- accept both rather than assuming the wrapper.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Reject fractional values (and NaN/inf) instead of silently truncating.
    if isinstance(page_number, float) and not page_number.is_integer():
        raise ValueError(f"Page number must be a whole number, got {page_number!r}.")
    try:
        page = int(page_number)
    except (TypeError, ValueError):
        raise ValueError(f"Invalid page number: {page_number!r}.") from None
    if page < 1:
        raise ValueError(f"Page number must be 1 or greater, got {page}.")

    # Convert only the requested page; rendering the whole PDF is wasteful.
    images = convert_from_path(pdf_path, first_page=page, last_page=page)
    if not images:
        # pdf2image yields an empty list when the page is past the last one.
        raise ValueError(f"Page {page} does not exist in this PDF.")

    image = images[0]
    text = pytesseract.image_to_string(image, lang=OCR_LANG)
    return text, image  # OCR text first, then the page snapshot
# Gradio UI: upload a PDF, choose a page, and show the rendered page next to
# the text recognised on it.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Gujarati OCR from PDF (with Page Snapshot)")
    pdf = gr.File(label="📤 Upload Gujarati PDF", file_types=[".pdf"])
    # precision=0 makes the component round to a whole number and hand the
    # callback an int, which is what a page index needs.
    page = gr.Number(label="📄 Page Number", minimum=1, value=1, step=1, precision=0)
    button = gr.Button("🔍 Extract Text")
    with gr.Row():
        image_output = gr.Image(label="🖼️ PDF Page Snapshot")
        text_output = gr.Textbox(label="📝 Extracted Gujarati Text", lines=20)
    # The callback returns (text, image), so the outputs are listed in that
    # order even though the image is displayed first in the row.
    button.click(
        fn=extract_gujarati_text,
        inputs=[pdf, page],
        outputs=[text_output, image_output],
    )

# Start the server only when run as a script, not when the module is imported.
if __name__ == "__main__":
    demo.launch()