"""Gradio demo: document question answering.

Serves a small web UI where the user uploads a document image and asks a
natural-language question about its contents. Answers come from the
``impira/layoutlm-document-qa`` checkpoint via the Transformers
document-question-answering pipeline.
"""

import gradio as gr
import torch  # noqa: F401  -- kept from original file; torch backs the pipeline
from PIL import Image  # noqa: F401  -- kept from original file
from transformers import pipeline

# NOTE: this checkpoint is a LayoutLM (v1) fine-tune. Loading it through the
# LayoutLMv2 classes mismatches the architecture (and the repo ships no v2
# processor config). The dedicated pipeline handles OCR, tokenization, and
# answer-span decoding (including skip of special tokens and start<=end
# validation) correctly.
MODEL_NAME = "impira/layoutlm-document-qa"
qa_pipeline = pipeline("document-question-answering", model=MODEL_NAME)


def predict(image, question):
    """Answer *question* about the document *image*.

    Parameters
    ----------
    image : PIL.Image.Image
        The uploaded document page.
    question : str
        A natural-language question about the document.

    Returns
    -------
    str
        The best answer span found in the document, or an empty string
        when the model produces no candidates.
    """
    results = qa_pipeline(image=image, question=question)
    # The pipeline returns candidate answers sorted by score; take the best.
    return results[0]["answer"] if results else ""


# gr.inputs.* / gr.outputs.* were deprecated in Gradio 3 and removed in
# Gradio 4 -- use the top-level component classes instead.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Document image"),
        gr.Textbox(lines=1, placeholder="Enter your question", label="Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Document Question Answering with LayoutLM",
    description="Ask questions about the content of a document.",
)

# Launch only when run as a script so the module stays importable
# (e.g. for tests or for mounting inside another app).
if __name__ == "__main__":
    interface.launch()