# Source: Hugging Face Space "Huzaifa424/OCR_DEMO" (status when captured: Runtime error)
# File size: 2,700 Bytes
# Commit hashes listed alongside the file on the source page: edf4939, 48b201c, e4e2cb3

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PyPDF2 import PdfReader
import tempfile
import os
from pdf2image import convert_from_path
# Hugging Face access token read from the environment (None when unset).
token = os.getenv("HF_TOKEN")

# Model and processor setup
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    token=token,  # `use_auth_token` is the deprecated spelling of this argument
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Automatically allocates the model across available devices
)

# The processor is fetched from the same repository as the model, so it is
# loaded with the same credentials rather than relying on ambient login state.
processor = AutoProcessor.from_pretrained(model_id, token=token)

def process_pdf(pdf_file):
    """Extract text from each page of a PDF.

    Every page is rendered to an image with ``convert_from_path`` and sent to
    the module-level ``model`` / ``processor`` together with an OCR
    instruction.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an upload
            object that exposes the path as its ``.name`` attribute.
            ``None`` means no file was provided.

    Returns:
        dict: Maps ``"Page N"`` (1-based, in page order) to the text the
        model generated for that page. Empty when ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return {}

    # Accept both a plain path and a file-like upload wrapper.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Read the PDF using pdf2image to convert pages to images
    images = convert_from_path(pdf_path)

    # The instruction is identical for every page, so build the prompt once
    # instead of re-rendering the chat template inside the loop.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all the text from this image:"}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    extracted_text = {}
    for page_number, page_image in enumerate(images, start=1):
        # Prepare the input
        inputs = processor(
            page_image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)

        # Generate the output
        output = model.generate(**inputs, max_new_tokens=1500)

        # The generated sequence starts with the prompt tokens; drop them so
        # only the model's answer is decoded, and strip special tokens so
        # chat-template markers do not leak into the extracted text.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_tokens = output[0][prompt_length:]
        page_text = processor.decode(generated_tokens, skip_special_tokens=True)

        extracted_text[f"Page {page_number}"] = page_text.strip()

    return extracted_text

def display_results(pdf_file):
    """Process the PDF and return values for the page selector and text box.

    The click handler in ``create_interface`` routes the two return values to
    a ``CheckboxGroup`` and a ``Textbox``, so they are shaped for those
    components: an update carrying the selectable page labels, and a single
    string.

    Args:
        pdf_file: The uploaded PDF, passed through to ``process_pdf``.

    Returns:
        tuple: ``(checkbox_update, text)``. ``checkbox_update`` is a
        ``gr.update`` that sets the choices to the page labels with nothing
        selected. ``text`` holds one ``"Page N:"`` section per page,
        separated by blank lines.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before extracting text.")

    extracted_text = process_pdf(pdf_file)

    # Populate the choices (one per page) and start with none ticked; a
    # {label: False} mapping is not a value the checkbox group can display.
    page_labels = list(extracted_text)
    checkbox_update = gr.update(choices=page_labels, value=[])

    # Keep the key-value presentation, but as text the Textbox can render.
    combined_text = "\n\n".join(
        f"{label}:\n{text}" for label, text in extracted_text.items()
    )

    return checkbox_update, combined_text

def create_interface():
    """Assemble and return the Gradio Blocks app.

    Layout: a title, a row holding the PDF upload and the trigger button, and
    a row holding the page checkbox group and the read-only text box.
    Clicking the button calls ``display_results`` with the uploaded file and
    sends its two return values to the checkbox group and the text box.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# PDF OCR Extractor with Key-Value Pairs")

        # Input row: file picker restricted to PDFs, plus the action button.
        with gr.Row():
            upload = gr.File(file_types=[".pdf"], label="Upload PDF")
            extract_btn = gr.Button("Extract Text")

        # Output row: page selector next to the extracted text.
        with gr.Row():
            page_selector = gr.CheckboxGroup(choices=[], label="Select Pages")
            text_box = gr.Textbox(interactive=False, lines=10, label="Extracted Text")

        result_targets = [page_selector, text_box]
        extract_btn.click(
            fn=display_results,
            inputs=[upload],
            outputs=result_targets,
        )

    return demo

if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    create_interface().launch()