Spaces:

WwYc
/

explain-ViT

Sleeping

App Files Community

WwYc committed on Mar 4, 2024

Commit

c35cecd

verified ·

1 Parent(s): 49ef971

Update ViT_DeiT/baselines/ViT/ViT_explanation_generator.py

Browse files

Files changed (1) hide show

ViT_DeiT/baselines/ViT/ViT_explanation_generator.py +39 -15

ViT_DeiT/baselines/ViT/ViT_explanation_generator.py CHANGED Viewed

@@ -1,28 +1,46 @@
 import argparse
-import torch
 import numpy as np
 from numpy import *
 # compute rollout between attention layers
 def compute_rollout_attention(all_layer_matrices, start_layer=0):
     # adding residual consideration- code adapted from https://github.com/samiraabnar/attention_flow
     num_tokens = all_layer_matrices[0].shape[1]
     batch_size = all_layer_matrices[0].shape[0]
-    eye = torch.eye(num_tokens).expand(batch_size, num_tokens, num_tokens).to(all_layer_matrices[0].device)
-    all_layer_matrices = [all_layer_matrices[i] + eye for i in range(len(all_layer_matrices))]
-    matrices_aug = [all_layer_matrices[i] / all_layer_matrices[i].sum(dim=-1, keepdim=True)
-                          for i in range(len(all_layer_matrices))]
     joint_attention = matrices_aug[start_layer]
-    for i in range(start_layer+1, len(matrices_aug)):
         joint_attention = matrices_aug[i].bmm(joint_attention)
     return joint_attention
 class LRP:
     def __init__(self, model):
         self.model = model
         self.model.eval()
-    def generate_LRP(self, input, index=None, method="transformer_attribution", is_ablation=False, start_layer=0):
         output = self.model(input)
         kwargs = {"alpha": 1}
         if index == None:
@@ -32,14 +50,18 @@ class LRP:
         one_hot[0, index] = 1
         one_hot_vector = one_hot
         one_hot = torch.from_numpy(one_hot).requires_grad_(True)
-        one_hot = torch.sum(one_hot.cuda() * output)
         self.model.zero_grad()
         one_hot.backward(retain_graph=True)
-        return self.model.relprop(torch.tensor(one_hot_vector).to(input.device), method=method, is_ablation=is_ablation,
-                                  start_layer=start_layer, **kwargs)
 class Baselines:
@@ -48,14 +70,14 @@ class Baselines:
         self.model.eval()
     def generate_cam_attn(self, input, index=None):
-        output = self.model(input.cuda(), register_hook=True)
         if index == None:
             index = np.argmax(output.cpu().data.numpy())
         one_hot = np.zeros((1, output.size()[-1]), dtype=np.float32)
         one_hot[0][index] = 1
         one_hot = torch.from_numpy(one_hot).requires_grad_(True)
-        one_hot = torch.sum(one_hot.cuda() * output)
         self.model.zero_grad()
         one_hot.backward(retain_graph=True)
@@ -79,5 +101,7 @@ class Baselines:
             attn_heads = blk.attn.get_attention_map()
             avg_heads = (attn_heads.sum(dim=1) / attn_heads.shape[1]).detach()
             all_layer_attentions.append(avg_heads)
-        rollout = compute_rollout_attention(all_layer_attentions, start_layer=start_layer)
-        return rollout[:,0, 1:]

 import argparse
 import numpy as np
+import torch
 from numpy import *
 # compute rollout between attention layers
 def compute_rollout_attention(all_layer_matrices, start_layer=0):
     # adding residual consideration- code adapted from https://github.com/samiraabnar/attention_flow
     num_tokens = all_layer_matrices[0].shape[1]
     batch_size = all_layer_matrices[0].shape[0]
+    eye = (
+        torch.eye(num_tokens)
+        .expand(batch_size, num_tokens, num_tokens)
+        .to(all_layer_matrices[0].device)
+    )
+    all_layer_matrices = [
+        all_layer_matrices[i] + eye for i in range(len(all_layer_matrices))
+    ]
+    matrices_aug = [
+        all_layer_matrices[i] / all_layer_matrices[i].sum(dim=-1, keepdim=True)
+        for i in range(len(all_layer_matrices))
+    ]
     joint_attention = matrices_aug[start_layer]
+    for i in range(start_layer + 1, len(matrices_aug)):
         joint_attention = matrices_aug[i].bmm(joint_attention)
     return joint_attention
 class LRP:
     def __init__(self, model):
         self.model = model
         self.model.eval()
+    def generate_LRP(
+        self,
+        input,
+        index=None,
+        method="transformer_attribution",
+        is_ablation=False,
+        start_layer=0,
+    ):
         output = self.model(input)
         kwargs = {"alpha": 1}
         if index == None:
         one_hot[0, index] = 1
         one_hot_vector = one_hot
         one_hot = torch.from_numpy(one_hot).requires_grad_(True)
+        one_hot = torch.sum(one_hot * output)
         self.model.zero_grad()
         one_hot.backward(retain_graph=True)
+        return self.model.relprop(
+            torch.tensor(one_hot_vector).to(input.device),
+            method=method,
+            is_ablation=is_ablation,
+            start_layer=start_layer,
+            **kwargs
+        )
 class Baselines:
         self.model.eval()
     def generate_cam_attn(self, input, index=None):
+        output = self.model(input, register_hook=True)
         if index == None:
             index = np.argmax(output.cpu().data.numpy())
         one_hot = np.zeros((1, output.size()[-1]), dtype=np.float32)
         one_hot[0][index] = 1
         one_hot = torch.from_numpy(one_hot).requires_grad_(True)
+        one_hot = torch.sum(one_hot * output)
         self.model.zero_grad()
         one_hot.backward(retain_graph=True)
             attn_heads = blk.attn.get_attention_map()
             avg_heads = (attn_heads.sum(dim=1) / attn_heads.shape[1]).detach()
             all_layer_attentions.append(avg_heads)
+        rollout = compute_rollout_attention(
+            all_layer_attentions, start_layer=start_layer
+        )
+        return rollout[:, 0, 1:]