|
|
import gradio as gr |
|
|
from transformers import ViltProcessor, ViltForQuestionAnswering |
|
|
import pandas as pd |
|
|
import torch |
|
|
|
|
|
# Prefer the GPU when one is present; everything below follows this choice.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ViLT checkpoint fine-tuned for visual question answering (VQAv2 labels).
_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"

# Processor prepares (image, question) pairs; model is moved to the
# selected device once, at startup.
processor = ViltProcessor.from_pretrained(_CHECKPOINT)
model = ViltForQuestionAnswering.from_pretrained(_CHECKPOINT).to(device)
|
|
|
|
|
|
|
|
def predict(img, prompt, return_topk):
    """Answer a natural-language question about an image.

    Args:
        img: PIL image to ask about.
        prompt: The question, as free text.
        return_topk: How many of the highest-probability answers to
            return. Gradio's Number component delivers this as a float,
            so it is coerced to int before use.

    Returns:
        pandas.DataFrame with columns "answer" (label text) and
        "probability" (sigmoid score), ordered from most to least likely.
    """
    # Move the encoded inputs to the model's device — the model was moved
    # to `device` at startup, and mixing cpu inputs with a cuda model raises.
    encoding = processor(img, prompt, return_tensors="pt").to(device)

    # Inference only: run the forward pass under no_grad so no autograd
    # graph is built (the original ran the forward pass outside the block).
    with torch.no_grad():
        outputs = model(**encoding)
        # This VQA head is multi-label, hence element-wise sigmoid
        # rather than a softmax over classes.
        probs = torch.sigmoid(outputs.logits)
        # torch.topk requires an int k; the Number widget may pass a float.
        topk_anss = torch.topk(probs, int(return_topk))

    # .cpu() before .numpy(): numpy conversion fails on CUDA tensors.
    indices = topk_anss.indices.flatten().cpu().numpy()
    values = topk_anss.values.flatten().cpu().numpy()

    return pd.DataFrame(
        {
            "answer": [model.config.id2label[key] for key in indices],
            "probability": values,
        }
    )
|
|
|
|
|
# Assemble the UI: an image, a question textbox, and a top-k count go in;
# a bar chart of answer probabilities comes out.
topk_count = gr.Number(value=4, minimum=1)
prob_chart = gr.BarPlot(
    x="answer",
    y="probability",
    title="Multi-class probabilities",
)

demo = gr.Interface(
    fn=predict,
    inputs=[gr.Image(type="pil"), "textbox", topk_count],
    outputs=prob_chart,
)

demo.launch()