Pdf-Extractor / app.py
lolhaha002's picture
Update app.py
27dca6d verified
import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
# Tesseract language setting passed as `lang` to pytesseract.image_to_string.
# Gujarati ('guj') is combined with English ('eng') so that English words
# mixed into the text, such as "Mean" or "Statistics", are also recognised.
OCR_LANG = "guj+eng"
def extract_gujarati_text(pdf_file, page_number):
    """Run OCR on a single page of an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF, given either as a path string or as an
            object whose ``name`` attribute holds the path. ``None`` means
            nothing has been uploaded yet.
        page_number: 1-based page to process. Anything ``int()`` accepts is
            allowed (e.g. ``2`` or ``2.0``); fractional values are truncated.

    Returns:
        A ``(text, image)`` tuple. On success ``text`` is the recognised text
        and ``image`` is the rendered page. On any failure ``text`` is a
        human-readable message and ``image`` is ``None``.
    """
    if pdf_file is None:
        return "Please upload a PDF file first.", None

    # A numeric UI field can deliver a float or None (cleared field); the PDF
    # renderer needs a plain positive integer.
    try:
        page = int(page_number)
    except (TypeError, ValueError, OverflowError):
        return "Error: Page number must be a whole number.", None
    if page < 1:
        return "Error: Page number must be 1 or greater.", None

    # Accept both a bare path string and a file-like wrapper exposing `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # Render only the requested page to an image.
        images = convert_from_path(pdf_path, first_page=page, last_page=page)
        if not images:
            # Requested page lies beyond the end of the document.
            return f"Error: Page {page} was not found in the PDF.", None
        image = images[0]
        # OEM 3 = default engine, PSM 3 = fully automatic page segmentation,
        # which helps with textbook-style block layouts.
        custom_config = r"--oem 3 --psm 3"
        text = pytesseract.image_to_string(image, lang=OCR_LANG, config=custom_config)
    except Exception as e:
        # UI boundary: report the failure in the text box instead of crashing.
        return f"Error: {e}", None
    return text, image
# Custom CSS handed to gr.Blocks: pins the image output (#img_out) and the
# text area (#txt_out) to the same 80vh height for easy side-by-side comparison.
css = """
.gradio-container {min-height: 0px !important;}
#img_out {height: 80vh !important;}
#txt_out textarea {height: 80vh !important;}
"""
# Build the UI: inputs on top, page image and OCR text side by side below.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📚 Side-by-Side Gujarati OCR (Textbook Mode)")

    # Input section at the top
    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            pdf = gr.File(label="1. Upload PDF", file_types=[".pdf"])
        with gr.Column(scale=1):
            # precision=0 makes the component deliver an int page number
            # rather than a float.
            page = gr.Number(
                label="2. Page Number",
                minimum=1,
                value=1,
                step=1,
                precision=0,
            )

    button = gr.Button("3. Extract Text & Compare", variant="primary", size="lg")

    # Output section: side-by-side comparison
    with gr.Row():
        with gr.Column(scale=1):
            # Left: original page image
            image_output = gr.Image(
                label="📄 Original Page Snapshot",
                type="pil",
                elem_id="img_out",
            )
        with gr.Column(scale=1):
            # Right: extracted text, editable so OCR mistakes can be corrected
            text_output = gr.Textbox(
                label="📝 Extracted Text (Editable)",
                elem_id="txt_out",
                show_copy_button=True,
                interactive=True,
            )

    # The handler returns (text, image), so outputs are listed in that order.
    button.click(
        fn=extract_gujarati_text,
        inputs=[pdf, page],
        outputs=[text_output, image_output],
    )

demo.launch()