import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# UPDATED: 'eng' added for mixed words like "Mean", "Statistics"
OCR_LANG = "guj+eng"


def extract_gujarati_text(pdf_file, page_number):
    """Render one page of a PDF to an image and OCR it with Tesseract.

    Args:
        pdf_file: The uploaded PDF as delivered by the ``gr.File`` input.
            Either an object exposing the on-disk path as ``.name`` or a
            plain path string; ``None`` when nothing has been uploaded.
        page_number: 1-based page to process. The ``gr.Number`` input may
            hand this over as a float (e.g. ``1.0``) or ``None`` when the
            field is cleared, so it is validated and coerced to ``int``.

    Returns:
        A ``(text, image)`` tuple matching the ``[text_output, image_output]``
        wiring of the button. On success ``text`` is the OCR result and
        ``image`` is the rendered page. On any failure ``text`` is a
        user-facing message and ``image`` is ``None``.
    """
    if pdf_file is None:
        return "Please upload a PDF file first.", None

    # Coerce before calling pdf2image: the page numbers end up as
    # command-line arguments, so a float such as 1.0 must become 1.
    try:
        page = int(page_number)
    except (TypeError, ValueError, OverflowError):
        return "Error: Please enter a valid page number.", None
    if page < 1:
        return "Error: Page number must be 1 or greater.", None

    # Accept both a file-like object carrying ``.name`` and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Broad catch is deliberate: this is the UI boundary, and any failure
    # (missing poppler/tesseract, corrupt PDF, ...) should be shown in the
    # text box rather than crash the request.
    try:
        # Convert PDF page to image
        images = convert_from_path(pdf_path, first_page=page, last_page=page)
        if not images:
            # Nothing rendered: the requested page is not in the document.
            return f"Error: Page {page} was not found in the PDF.", None
        image = images[0]

        # Configuration for better block handling (helps with textbook layouts)
        custom_config = r'--oem 3 --psm 3'
        text = pytesseract.image_to_string(image, lang=OCR_LANG, config=custom_config)
        return text, image
    except Exception as e:
        return f"Error: {str(e)}", None


# CSS to ensure the image and text box are roughly the same height for easy comparison
css = (
    " .gradio-container {min-height: 0px !important;}"
    " #img_out {height: 80vh !important;}"
    " #txt_out textarea {height: 80vh !important;} "
)

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📚 Side-by-Side Gujarati OCR (Textbook Mode)")

    # Input Section at the top
    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            pdf = gr.File(label="1. Upload PDF", file_types=[".pdf"])
        with gr.Column(scale=1):
            page = gr.Number(label="2. Page Number", minimum=1, value=1, step=1)
            # NOTE(review): button placement inside this column is inferred
            # from the flattened source - confirm against the intended layout.
            button = gr.Button("3. Extract Text & Compare", variant="primary", size="lg")

    # Output Section: Side-by-Side Comparison
    with gr.Row():
        with gr.Column(scale=1):
            # Left: Original Image
            image_output = gr.Image(label="📄 Original Page Snapshot", type="pil", elem_id="img_out")
        with gr.Column(scale=1):
            # Right: Extracted Text
            text_output = gr.Textbox(
                label="📝 Extracted Text (Editable)",
                elem_id="txt_out",
                show_copy_button=True,
                interactive=True
            )

    # Event listeners must be registered inside the Blocks context.
    button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])

demo.launch()