Spaces:

MR
/

vilt-word-patch-alignment

Build error

App Files Files Community

MR committed on Oct 5, 2022

Commit

de36b67

1 Parent(s): d6ad5e3

Create new file

Browse files

Files changed (1) hide show

app.py +238 -0

app.py ADDED Viewed

	@@ -0,0 +1,238 @@

+import gradio as gr
+from transformers import ViltProcessor, ViltForQuestionAnswering
+import torch
+import gradio as gr
+import torch
+import copy
+import time
+import requests
+import io
+import numpy as np
+import re
+import ipdb
+from PIL import Image
+from vilt.config import ex
+from vilt.modules import ViLTransformerSS
+from vilt.modules.objectives import cost_matrix_cosine, ipot
+from vilt.transforms import pixelbert_transform
+from vilt.datamodules.datamodule_base import get_pretrained_tokenizer
+@ex.automain
+def main(_config):
+    _config = copy.deepcopy(_config)
+    loss_names = {
+        "itm": 0,
+        "mlm": 0.5,
+        "mpp": 0,
+        "vqa": 0,
+        "imgcls": 0,
+        "nlvr2": 0,
+        "irtr": 0,
+        "arc": 0,
+    }
+    tokenizer = get_pretrained_tokenizer(_config["tokenizer"])
+    _config.update(
+        {
+            "loss_names": loss_names,
+        }
+    )
+    model = ViLTransformerSS(_config)
+    model.setup("test")
+    model.eval()
+    device = "cuda:0" if _config["num_gpus"] > 0 else "cpu"
+    model.to(device)
+    def infer(url, mp_text, hidx):
+        try:
+            res = requests.get(url)
+            image = Image.open(io.BytesIO(res.content)).convert("RGB")
+            img = pixelbert_transform(size=384)(image)
+            img = img.unsqueeze(0).to(device)
+        except:
+            return False
+        batch = {"text": [""], "image": [None]}
+        tl = len(re.findall("\[MASK\]", mp_text))
+        inferred_token = [mp_text]
+        batch["image"][0] = img
+        with torch.no_grad():
+            for i in range(tl):
+                batch["text"] = inferred_token
+                encoded = tokenizer(inferred_token)
+                batch["text_ids"] = torch.tensor(encoded["input_ids"]).to(device)
+                batch["text_labels"] = torch.tensor(encoded["input_ids"]).to(device)
+                batch["text_masks"] = torch.tensor(encoded["attention_mask"]).to(device)
+                encoded = encoded["input_ids"][0][1:-1]
+                infer = model(batch)
+                mlm_logits = model.mlm_score(infer["text_feats"])[0, 1:-1]
+                mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
+                mlm_values[torch.tensor(encoded) != 103] = 0
+                select = mlm_values.argmax().item()
+                encoded[select] = mlm_ids[select].item()
+                inferred_token = [tokenizer.decode(encoded)]
+        selected_token = ""
+        encoded = tokenizer(inferred_token)
+        if hidx > 0 and hidx < len(encoded["input_ids"][0][:-1]):
+            with torch.no_grad():
+                batch["text"] = inferred_token
+                batch["text_ids"] = torch.tensor(encoded["input_ids"]).to(device)
+                batch["text_labels"] = torch.tensor(encoded["input_ids"]).to(device)
+                batch["text_masks"] = torch.tensor(encoded["attention_mask"]).to(device)
+                infer = model(batch)
+                txt_emb, img_emb = infer["text_feats"], infer["image_feats"]
+                txt_mask, img_mask = (
+                    infer["text_masks"].bool(),
+                    infer["image_masks"].bool(),
+                )
+                for i, _len in enumerate(txt_mask.sum(dim=1)):
+                    txt_mask[i, _len - 1] = False
+                txt_mask[:, 0] = False
+                img_mask[:, 0] = False
+                txt_pad, img_pad = ~txt_mask, ~img_mask
+                cost = cost_matrix_cosine(txt_emb.float(), img_emb.float())
+                joint_pad = txt_pad.unsqueeze(-1) | img_pad.unsqueeze(-2)
+                cost.masked_fill_(joint_pad, 0)
+                txt_len = (txt_pad.size(1) - txt_pad.sum(dim=1, keepdim=False)).to(
+                    dtype=cost.dtype
+                )
+                img_len = (img_pad.size(1) - img_pad.sum(dim=1, keepdim=False)).to(
+                    dtype=cost.dtype
+                )
+                T = ipot(
+                    cost.detach(),
+                    txt_len,
+                    txt_pad,
+                    img_len,
+                    img_pad,
+                    joint_pad,
+                    0.1,
+                    1000,
+                    1,
+                )
+                plan = T[0]
+                plan_single = plan * len(txt_emb)
+                cost_ = plan_single.t()
+                cost_ = cost_[hidx][1:].cpu()
+                patch_index, (H, W) = infer["patch_index"]
+                heatmap = torch.zeros(H, W)
+                for i, pidx in enumerate(patch_index[0]):
+                    h, w = pidx[0].item(), pidx[1].item()
+                    heatmap[h, w] = cost_[i]
+                heatmap = (heatmap - heatmap.mean()) / heatmap.std()
+                heatmap = np.clip(heatmap, 1.0, 3.0)
+                heatmap = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min())
+                _w, _h = image.size
+                overlay = Image.fromarray(np.uint8(heatmap * 255), "L").resize(
+                    (_w, _h), resample=Image.NEAREST
+                )
+                image_rgba = image.copy()
+                image_rgba.putalpha(overlay)
+                image = image_rgba
+                selected_token = tokenizer.convert_ids_to_tokens(
+                    encoded["input_ids"][0][hidx]
+                )
+        return [np.array(image), inferred_token[0], selected_token]
+    inputs = [
+        gr.inputs.Textbox(
+            label="Url of an image.",
+            lines=5,
+        ),
+        gr.inputs.Textbox(label="Caption with [MASK] tokens to be filled.", lines=5),
+        gr.inputs.Slider(
+            minimum=0,
+            maximum=38,
+            step=1,
+            label="Index of token for heatmap visualization (ignored if zero)",
+        ),
+    ]
+    outputs = [
+        gr.outputs.Image(label="Image"),
+        gr.outputs.Textbox(label="description"),
+        gr.outputs.Textbox(label="selected token"),
+    ]
+    interface = gr.Interface(
+        fn=infer,
+        inputs=inputs,
+        outputs=outputs,
+        server_name="0.0.0.0",
+        server_port=8888,
+        examples=[
+            [
+                "https://s3.geograph.org.uk/geophotos/06/21/24/6212487_1cca7f3f_1024x1024.jpg",
+                "a display of flowers growing out and over the [MASK] [MASK] in front of [MASK] on a [MASK] [MASK].",
+                0,
+            ],
+            [
+                "https://s3.geograph.org.uk/geophotos/06/21/24/6212487_1cca7f3f_1024x1024.jpg",
+                "a display of flowers growing out and over the retaining wall in front of cottages on a cloudy day.",
+                4,
+            ],
+            [
+                "https://s3.geograph.org.uk/geophotos/06/21/24/6212487_1cca7f3f_1024x1024.jpg",
+                "a display of flowers growing out and over the retaining wall in front of cottages on a cloudy day.",
+                11,
+            ],
+            [
+                "https://s3.geograph.org.uk/geophotos/06/21/24/6212487_1cca7f3f_1024x1024.jpg",
+                "a display of flowers growing out and over the retaining wall in front of cottages on a cloudy day.",
+                15,
+            ],
+            [
+                "https://s3.geograph.org.uk/geophotos/06/21/24/6212487_1cca7f3f_1024x1024.jpg",
+                "a display of flowers growing out and over the retaining wall in front of cottages on a cloudy day.",
+                18,
+            ],
+            [
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/40/Living_Room.jpg/800px-Living_Room.jpg",
+                "a room with a [MASK], a [MASK], a [MASK], and a [MASK].",
+                0,
+            ],
+            [
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/40/Living_Room.jpg/800px-Living_Room.jpg",
+                "a room with a rug, a chair, a painting, and a plant.",
+                5,
+            ],
+            [
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/40/Living_Room.jpg/800px-Living_Room.jpg",
+                "a room with a rug, a chair, a painting, and a plant.",
+                8,
+            ],
+            [
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/40/Living_Room.jpg/800px-Living_Room.jpg",
+                "a room with a rug, a chair, a painting, and a plant.",
+                11,
+            ],
+            [
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/40/Living_Room.jpg/800px-Living_Room.jpg",
+                "a room with a rug, a chair, a painting, and a plant.",
+                15,
+            ],
+        ],
+    )
+    interface.launch(debug=True)