"""Gradio demo: document question answering.

Serves a small web UI where the user uploads a document image and asks a
natural-language question about its contents. Answers come from the
``impira/layoutlm-document-qa`` checkpoint via the Transformers
document-question-answering pipeline.
"""

import gradio as gr
import torch  # noqa: F401  -- kept from original file; torch backs the pipeline
from PIL import Image  # noqa: F401  -- kept from original file
from transformers import pipeline

# NOTE: this checkpoint is a LayoutLM (v1) fine-tune. Loading it through the
# LayoutLMv2 classes mismatches the architecture (and the repo ships no v2
# processor config). The dedicated pipeline handles OCR, tokenization, and
# answer-span decoding (including skip of special tokens and start<=end
# validation) correctly.
MODEL_NAME = "impira/layoutlm-document-qa"
qa_pipeline = pipeline("document-question-answering", model=MODEL_NAME)


def predict(image, question):
    """Answer *question* about the document *image*.

    Parameters
    ----------
    image : PIL.Image.Image
        The uploaded document page.
    question : str
        A natural-language question about the document.

    Returns
    -------
    str
        The best answer span found in the document, or an empty string
        when the model produces no candidates.
    """
    results = qa_pipeline(image=image, question=question)
    # The pipeline returns candidate answers sorted by score; take the best.
    return results[0]["answer"] if results else ""


# gr.inputs.* / gr.outputs.* were deprecated in Gradio 3 and removed in
# Gradio 4 -- use the top-level component classes instead.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Document image"),
        gr.Textbox(lines=1, placeholder="Enter your question", label="Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Document Question Answering with LayoutLM",
    description="Ask questions about the content of a document.",
)

# Launch only when run as a script so the module stays importable
# (e.g. for tests or for mounting inside another app).
if __name__ == "__main__":
    interface.launch()