e-p committed on
Commit
d471219
·
1 Parent(s): d3b7c3f
Files changed (2) hide show
  1. app.py +44 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import ViltProcessor, ViltForQuestionAnswering
3
+ import pandas as pd
4
+ import torch
5
+
6
# Run on GPU when one is available; the model weights are moved there once
# at startup so every request reuses the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ViLT checkpoint fine-tuned for visual question answering (VQAv2 labels).
checkpoint = "dandelin/vilt-b32-finetuned-vqa"
processor = ViltProcessor.from_pretrained(checkpoint)
model = ViltForQuestionAnswering.from_pretrained(checkpoint).to(device)
10
+
11
+
12
def predict(img, prompt, return_topk):
    """Answer a question about an image with ViLT VQA.

    Args:
        img: PIL image handed over by gradio (``type="pil"`` input).
        prompt: natural-language question about the image.
        return_topk: how many top answers to return (>= 1; gradio's
            ``gr.Number`` delivers this as a float, so it is coerced to int).

    Returns:
        pandas.DataFrame with columns ``answer`` and ``probability``,
        one row per top-k answer.
    """
    # Move the encoded inputs to the same device as the model — the model
    # was .to(device)'d at startup, so a CUDA run would otherwise crash
    # with a device mismatch.
    encoding = processor(img, prompt, return_tensors="pt").to(device)
    # no_grad must wrap the forward pass itself (the original only wrapped
    # the post-processing), otherwise gradients are tracked and memory is
    # wasted on every request.
    with torch.no_grad():
        outputs = model(**encoding)
        # The VQA head is multi-label, hence per-class sigmoid, not softmax.
        probs = torch.sigmoid(outputs.logits)
    # Coerce to int (gr.Number yields floats) and clamp to the label count
    # so an oversized request does not raise in torch.topk.
    k = max(1, min(int(return_topk), probs.shape[-1]))
    topk_anss = torch.topk(probs, k)
    # .cpu() before .numpy(): CUDA tensors cannot be converted directly.
    indices = topk_anss.indices.flatten().cpu().numpy()
    # Two columns: decoded class labels and their probabilities.
    return pd.DataFrame(
        {
            "answer": [model.config.id2label[key] for key in indices],
            "probability": topk_anss.values.flatten().cpu().numpy(),
        }
    )
29
+
30
# Wire the model into a web UI: an image, a free-text question, and a
# top-k count go in; a probability bar chart comes out.
demo = gr.Interface(
    fn=predict,
    inputs=[
        # we use the type='pil' parameter so that gradio passes to our function
        # a picture that is already in the PIL format,
        # see https://www.gradio.app/docs/gradio/image#description
        gr.Image(type="pil"),
        "textbox",
        # value is the default value, it can be lower than 1
        gr.Number(value=4, minimum=1),
    ],
    outputs=gr.BarPlot(
        x="answer",
        y="probability",
        title="Multi-class probabilities",
    ),
    # outputs="dataframe" # uncomment if the gradio interface is unresponsive
)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ pandas
3
+ transformers
4
+ gradio