Spaces:

lolhaha002
/

Pdf-Extractor

Sleeping

File size: 2,211 Bytes

3aa2d23
 
 
 
 
27dca6d
6752cd0
3aa2d23
 
27dca6d
 
 
6752cd0
 
 
 
 
27dca6d
6752cd0
 
 
 
 
 
 
3aa2d23
27dca6d
 
 
 
 
 
 
 
 
6752cd0
27dca6d
 
 
 
 
 
 
 
 
5c23bea
27dca6d
 
 
6752cd0
27dca6d
 
6752cd0
27dca6d
 
 
 
6752cd0
5c23bea
 
3aa2d23

import gradio as gr
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# Tesseract language spec handed to pytesseract as `lang=` in
# extract_gujarati_text. "guj" is the Gujarati model; 'eng' was added so that
# mixed-in English words like "Mean", "Statistics" are recognised as well.
OCR_LANG = "guj+eng"

def extract_gujarati_text(pdf_file, page_number):
    """Render one page of a PDF to an image and OCR it with Tesseract.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path string or an
            object exposing the path as ``.name``; ``None`` when nothing
            has been uploaded yet.
        page_number: 1-based page to extract. Anything ``int()`` accepts is
            allowed (e.g. ``2``, ``2.0``); fractional values are truncated.

    Returns:
        A ``(text, image)`` tuple. On success ``text`` is the OCR output and
        ``image`` is the rendered page. On any failure ``text`` is a
        human-readable message and ``image`` is ``None``; this function
        never raises, so the UI always has something to show.
    """
    if pdf_file is None:
        return "Please upload a PDF file first.", None

    # Validate the page number up front: the numeric input can arrive empty
    # (None) or as a float, and the PDF renderer needs a positive integer.
    try:
        page_index = int(page_number)
    except (TypeError, ValueError, OverflowError):
        return "Please enter a valid page number (1 or greater).", None
    if page_index < 1:
        return "Please enter a valid page number (1 or greater).", None

    # Accept both a plain path string and a file wrapper carrying `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # Convert only the requested PDF page to an image.
        images = convert_from_path(
            pdf_path, first_page=page_index, last_page=page_index
        )
        if not images:
            # An out-of-range page yields an empty list rather than an error;
            # report that clearly instead of failing on images[0].
            return f"Page {page_index} was not found in the PDF.", None
        image = images[0]

        # Configuration for better block handling (helps with textbook
        # layouts): default OCR engine, fully automatic page segmentation.
        custom_config = r'--oem 3 --psm 3'

        text = pytesseract.image_to_string(
            image, lang=OCR_LANG, config=custom_config
        )
    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output
        # box instead of propagating it to the caller.
        return f"Error: {e}", None

    return text, image

# CSS to ensure the image and text box are roughly the same height for easy comparison.
# The #img_out / #txt_out selectors target the elem_id values assigned to the
# gr.Image and gr.Textbox components in the layout; both are set to 80vh.
css = """
.gradio-container {min-height: 0px !important;}
#img_out {height: 80vh !important;} 
#txt_out textarea {height: 80vh !important;}
"""

# Build the UI: an input row (PDF upload + page selector + button) above an
# output row showing the rendered page next to the OCR text.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📚 Side-by-Side Gujarati OCR (Textbook Mode)")
    
    # Input Section at the top
    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            # Upload widget restricted to .pdf files
            pdf = gr.File(label="1. Upload PDF", file_types=[".pdf"])
        with gr.Column(scale=1):
            # Page selector: starts at 1, minimum 1, whole-number steps
            page = gr.Number(label="2. Page Number", minimum=1, value=1, step=1)
            button = gr.Button("3. Extract Text & Compare", variant="primary", size="lg")

    # Output Section: Side-by-Side Comparison
    with gr.Row():
        with gr.Column(scale=1):
            # Left: Original Image (elem_id ties it to the #img_out CSS rule)
            image_output = gr.Image(label="📄 Original Page Snapshot", type="pil", elem_id="img_out")
        
        with gr.Column(scale=1):
            # Right: Extracted Text (elem_id ties it to the #txt_out CSS rule);
            # interactive so the user can edit the OCR result
            text_output = gr.Textbox(
                label="📝 Extracted Text (Editable)", 
                elem_id="txt_out",
                show_copy_button=True,
                interactive=True 
            )

    # Wire the button: extract_gujarati_text returns (text, image), matching
    # the order of `outputs` here.
    button.click(fn=extract_gujarati_text, inputs=[pdf, page], outputs=[text_output, image_output])

# Start the Gradio app when this file is executed/imported.
demo.launch()