Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import requests | |
| import torch | |
| from PIL import Image | |
| from transformers import MllamaForConditionalGeneration, AutoProcessor | |
| from PyPDF2 import PdfReader | |
| import tempfile | |
| import os | |
| from pdf2image import convert_from_path | |
# Access token read from the HF_TOKEN environment variable; it is passed to
# both downloads below so they authenticate the same way.
token = os.getenv("HF_TOKEN")

# Model and processor setup
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model. `token=` is the current keyword; `use_auth_token=` is
# deprecated in transformers.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    token=token,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Automatically allocates the model across available devices
)

# Load the matching processor with the same credentials as the model.
processor = AutoProcessor.from_pretrained(model_id, token=token)
def process_pdf(pdf_file):
    """Extract text from each page of a PDF by prompting the vision model.

    Each page is rendered to an image with ``convert_from_path`` and sent to
    the model together with a fixed text-extraction instruction.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path or as an
            object whose ``name`` attribute holds the path. ``None`` (nothing
            uploaded) produces an empty result instead of an error.

    Returns:
        A dict mapping ``"Page N"`` (1-based, in page order) to the text the
        model generated for that page.
    """
    if pdf_file is None:
        return {}

    # Accept a plain path as well as a file wrapper that exposes `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Convert the PDF pages to images so they can be fed to the model.
    images = convert_from_path(pdf_path)

    # The instruction is identical for every page, so build the prompt once.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all the text from this image:"},
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    extracted_text = {}
    for page_number, page_image in enumerate(images, start=1):
        # Prepare the input
        inputs = processor(
            page_image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

        # Generate the output
        output = model.generate(**inputs, max_new_tokens=1500)

        # The generated sequence starts with the prompt tokens; drop them and
        # the special tokens so only the model's answer is kept.
        prompt_length = inputs["input_ids"].shape[-1]
        page_text = processor.decode(
            output[0][prompt_length:],
            skip_special_tokens=True,
        )
        extracted_text[f"Page {page_number}"] = page_text

    return extracted_text
def display_results(pdf_file):
    """Run the PDF through ``process_pdf`` and pair the text with page flags.

    Returns:
        A 2-tuple ``(flags, pages)``: ``pages`` is the dict returned by
        ``process_pdf`` and ``flags`` maps each of its keys to ``False``.
    """
    pages = process_pdf(pdf_file)
    # Every page starts out unselected.
    flags = dict.fromkeys(pages, False)
    return flags, pages
def create_interface():
    """Build the Gradio interface.

    The layout is a PDF upload with an "Extract Text" button, above a checkbox
    group of pages and a read-only textbox with the extracted text.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """

    def _render_results(pdf_file):
        """Convert ``display_results`` output into values the widgets accept."""
        checkboxes, extracted_text = display_results(pdf_file)
        # The checkbox group is created with no choices, so the page names
        # must be pushed in as an update; its value is the list of ticked pages.
        selected = [page for page, checked in checkboxes.items() if checked]
        choices_update = gr.update(choices=list(checkboxes), value=selected)
        # The textbox needs a single string, so lay the pages out one after another.
        text = "\n\n".join(
            f"{page}:\n{page_text}" for page, page_text in extracted_text.items()
        )
        return choices_update, text

    with gr.Blocks() as app:
        gr.Markdown("# PDF OCR Extractor with Key-Value Pairs")
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_button = gr.Button("Extract Text")
        with gr.Row():
            checkboxes_output = gr.CheckboxGroup(label="Select Pages", choices=[])
            text_output = gr.Textbox(label="Extracted Text", lines=10, interactive=False)
        submit_button.click(
            _render_results,
            inputs=[pdf_input],
            outputs=[checkboxes_output, text_output],
        )
    return app
if __name__ == "__main__":
    # Script entry point: build the app and start serving it.
    create_interface().launch()