File size: 2,700 Bytes
edf4939
 
 
 
 
 
 
48b201c
e4e2cb3
 
edf4939
 
 
 
 
 
48b201c
edf4939
 
 
 
 
 
 
e4e2cb3
 
edf4939
 
e4e2cb3
edf4939
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PyPDF2 import PdfReader
import tempfile
import os
from pdf2image import convert_from_path
# Hugging Face access token read from the environment (None when unset).
# NOTE(review): presumably required because the checkpoint is access-gated;
# confirm against the model card.
token = os.getenv("HF_TOKEN")

# Model and processor setup
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    token=token,  # `use_auth_token` is the deprecated spelling of this argument
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Automatically allocates the model across available devices
)
# Use the same credentials for the processor files as for the model weights.
processor = AutoProcessor.from_pretrained(model_id, token=token)

def process_pdf(pdf_file):
    """Extract text from each page of a PDF.

    Every page is rendered to an image with pdf2image and sent to the vision
    model together with a fixed OCR instruction.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing the path
            through a ``.name`` attribute. ``None`` (nothing uploaded)
            produces an empty result.

    Returns:
        A dict mapping ``"Page N"`` (1-based) to the text the model generated
        for that page, without the prompt and without special tokens.
    """
    if pdf_file is None:
        return {}

    # Accept both a plain path and a file-like wrapper carrying `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Read the PDF using pdf2image to convert pages to images
    images = convert_from_path(pdf_path)

    # The instruction is identical for every page, so build the prompt once.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all the text from this image:"}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    extracted_text = {}
    for page_number, page_image in enumerate(images, start=1):
        # Prepare the input
        inputs = processor(
            page_image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)

        # Generate the output
        output = model.generate(**inputs, max_new_tokens=1500)

        # `generate` returns prompt + completion; drop the prompt tokens so
        # only the newly generated text is decoded.
        prompt_length = inputs["input_ids"].shape[-1]
        page_text = processor.decode(
            output[0][prompt_length:],
            skip_special_tokens=True,
        )
        extracted_text[f"Page {page_number}"] = page_text.strip()

    return extracted_text

def display_results(pdf_file):
    """Process the PDF and return updates for the page selector and text box.

    Args:
        pdf_file: The value of the file-upload component; ``None`` when no
            file has been uploaded.

    Returns:
        A 2-tuple ``(checkbox_update, text)``:

        * ``checkbox_update`` replaces the checkbox group's choices with one
          entry per extracted page, all unchecked.
        * ``text`` is every page rendered as ``"Page N:\\n<text>"``, with
          pages separated by a blank line.
    """
    if pdf_file is None:
        # Nothing uploaded: clear both outputs instead of failing.
        return gr.update(choices=[], value=[]), ""

    extracted_text = process_pdf(pdf_file)

    # The checkbox group is created with no choices, so the page labels must
    # be supplied as new `choices`; a plain dict would not populate it.
    page_labels = list(extracted_text)

    # A text box shows a string; format the key-value pairs readably rather
    # than relying on the dict's repr.
    combined_text = "\n\n".join(
        f"{label}:\n{text}" for label, text in extracted_text.items()
    )

    return gr.update(choices=page_labels, value=[]), combined_text

def create_interface():
    """Assemble and return the Gradio Blocks application.

    The layout has a title, one row holding the PDF upload and the trigger
    button, and one row holding the page checkbox group and the read-only
    text box. Clicking the button runs ``display_results`` on the upload
    and routes its two return values to the checkbox group and text box.
    """
    demo = gr.Blocks()

    with demo:
        gr.Markdown("# PDF OCR Extractor with Key-Value Pairs")

        # Input row: file picker next to the action button.
        with gr.Row():
            upload = gr.File(label="Upload PDF", file_types=[".pdf"])
            extract_btn = gr.Button("Extract Text")

        # Output row: page selector next to the extracted text.
        with gr.Row():
            page_selector = gr.CheckboxGroup(label="Select Pages", choices=[])
            extracted_box = gr.Textbox(
                label="Extracted Text",
                lines=10,
                interactive=False,
            )

        # Wire the button to the processing callback.
        extract_btn.click(
            fn=display_results,
            inputs=[upload],
            outputs=[page_selector, extracted_box],
        )

    return demo

if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    create_interface().launch()