import os
import tempfile

import gradio as gr
import requests
import torch
from PIL import Image
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Hugging Face access token — required because the Llama 3.2 checkpoint is gated.
token = os.getenv("HF_TOKEN")

# Model and processor setup
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the vision-language model once at module import so every request reuses it.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    token=token,  # `use_auth_token` is deprecated in transformers; `token` replaces it
    torch_dtype=torch.bfloat16,
    device_map="auto",  # automatically allocates the model across available devices
)
# The processor must also authenticate, since the repo is gated.
processor = AutoProcessor.from_pretrained(model_id, token=token)


def process_pdf(pdf_file):
    """Run OCR on every page of a PDF using the vision model.

    Args:
        pdf_file: Uploaded file object exposing ``.name`` (a filesystem path),
            or a plain path string — gr.File yields either depending on version.

    Returns:
        dict mapping ``"Page N"`` to the text extracted from that page.
    """
    # Accept both a tempfile-like wrapper and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Render each PDF page to an image so the vision model can read it.
    # NOTE(review): requires the poppler binaries to be installed — confirm on deploy.
    images = convert_from_path(pdf_path)

    extracted_text = {}
    for i, page_image in enumerate(images):
        # Chat-style OCR instruction: one image placeholder plus the text prompt.
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": "Extract all the text from this image:"},
            ]}
        ]

        # Prepare the multimodal input and move it to the model's device.
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(
            page_image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

        # Generate the OCR output.
        output = model.generate(**inputs, max_new_tokens=1500)

        # Decode ONLY the newly generated tokens: slicing off the prompt length
        # keeps the instruction text and special tokens out of the OCR result.
        prompt_len = inputs["input_ids"].shape[-1]
        page_text = processor.decode(output[0][prompt_len:], skip_special_tokens=True)
        extracted_text[f"Page {i + 1}"] = page_text

    return extracted_text


def display_results(pdf_file):
    """Process the PDF and produce values for the two output widgets.

    Returns:
        A CheckboxGroup update carrying the page names as selectable choices,
        and a single formatted string (all pages) for the Textbox.
    """
    # Graceful no-op when the user clicks Extract without uploading a file.
    if pdf_file is None:
        return gr.update(choices=[]), ""

    extracted_text = process_pdf(pdf_file)
    page_names = list(extracted_text.keys())
    # A Textbox needs a string, not a dict: join pages under per-page headers.
    combined = "\n\n".join(
        f"{page}:\n{text}" for page, text in extracted_text.items()
    )
    # gr.update(choices=...) is how new choices are pushed into a CheckboxGroup;
    # returning a plain dict here would not populate the component.
    return gr.update(choices=page_names), combined


def create_interface():
    """Build the Gradio interface."""
    with gr.Blocks() as app:
        gr.Markdown("# PDF OCR Extractor with Key-Value Pairs")

        with gr.Row():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_button = gr.Button("Extract Text")

        with gr.Row():
            checkboxes_output = gr.CheckboxGroup(label="Select Pages", choices=[])
            text_output = gr.Textbox(
                label="Extracted Text", lines=10, interactive=False
            )

        submit_button.click(
            display_results,
            inputs=[pdf_input],
            outputs=[checkboxes_output, text_output],
        )
    return app


if __name__ == "__main__":
    interface = create_interface()
    interface.launch()