"""Gradio demo: answer a free-form question about an uploaded image (PNP-VQA)."""

import functools

import gradio as gr
import torch
from lavis.models import load_model_and_preprocess
from PIL import Image

# Size the uploaded image is resized to before it is handed to the LAVIS
# "eval" visual processor.
_INPUT_SIZE = (256, 256)


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the PNP-VQA model and its processors once and reuse them.

    The load is deferred until the first request (rather than done at import
    time) so importing this module stays side-effect free, and cached so that
    later requests do not pay the model-loading cost again.

    Returns:
        A ``(model, vis_processors, txt_processors, device)`` tuple.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="pnp_vqa", model_type="base", is_eval=True, device=device
    )
    return model, vis_processors, txt_processors, device


def process(input_image, prompt):
    """Answer ``prompt`` about ``input_image`` using the PNP-VQA model.

    Args:
        input_image: PIL image supplied by the Gradio image component.
        prompt: The question to ask about the image.

    Returns:
        The first predicted answer returned by ``model.predict_answers``.

    Raises:
        gr.Error: If no image was provided.
    """
    if input_image is None:
        # Gradio passes None when the user submits without uploading an image;
        # surface a readable message instead of an AttributeError on .resize.
        raise gr.Error('Please upload an image.')

    model, vis_processors, txt_processors, device = _load_model()

    resized = input_image.resize(_INPUT_SIZE)
    image = vis_processors["eval"](resized).unsqueeze(0).to(device)
    text_input = txt_processors["eval"](prompt)
    sample = {"image": image, "text_input": [text_input]}

    # predict_answers also returns the generated captions and the GradCAM
    # output; only the answers are used by this demo.
    pred_answers, _caption, _gradcam = model.predict_answers(
        sample, num_captions=50, num_patches=20
    )
    return pred_answers[0]


def main():
    """Build the Gradio interface and launch it."""
    # Top-level components replace the deprecated gr.inputs / gr.outputs
    # namespaces and match the gr.Textbox already used for the prompt.
    input_image = gr.Image(label='image', type='pil')
    prompt = gr.Textbox(label='Prompt')
    outputs = gr.Textbox(label='Answer')

    iface = gr.Interface(
        fn=process,
        inputs=[input_image, prompt],
        outputs=outputs,
        title='Image Question Answering',
        description='画像に関する質問に答えるモデルを使って、質問に答えます。画像をアップロードし、質問を入力してください。',
    )
    iface.launch()


if __name__ == '__main__':
    main()