Upload folder using huggingface_hub
Browse files- Qformer.py +1 -1
- basis_functions.py +266 -0
- long_term_attention_gibbs.py +315 -0
Qformer.py
CHANGED
|
@@ -41,7 +41,7 @@ from transformers.utils import logging
|
|
| 41 |
from transformers.models.bert.configuration_bert import BertConfig
|
| 42 |
|
| 43 |
from functools import partial
|
| 44 |
-
from .
|
| 45 |
|
| 46 |
logger = logging.get_logger(__name__)
|
| 47 |
|
|
|
|
| 41 |
from transformers.models.bert.configuration_bert import BertConfig
|
| 42 |
|
| 43 |
from functools import partial
|
| 44 |
+
from .long_term_attention_gibbs import LongTermAttention
|
| 45 |
|
| 46 |
logger = logging.get_logger(__name__)
|
| 47 |
|
basis_functions.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class BasisFunctions(object):
    """Abstract interface for a family of basis functions psi_1..psi_N.

    Subclasses must implement ``__len__``, ``evaluate`` and the three
    closed-form integrals used by continuous attention. The base class
    raises ``NotImplementedError`` instead of silently returning ``None``
    (the original ``pass`` bodies) so a missing override fails loudly at
    the call site rather than producing ``None`` downstream.
    """

    def __len__(self):
        """Number of basis functions."""
        raise NotImplementedError

    def evaluate(self, t):
        """Evaluate every basis function at position(s) t."""
        raise NotImplementedError

    def integrate_t2_times_psi(self, a, b):
        """Compute integral int_a^b (t**2) * psi(t)."""
        raise NotImplementedError

    def integrate_t_times_psi(self, a, b):
        """Compute integral int_a^b t * psi(t)."""
        raise NotImplementedError

    def integrate_psi(self, a, b):
        """Compute integral int_a^b psi(t)."""
        raise NotImplementedError
class PowerBasisFunctions(BasisFunctions):
|
| 30 |
+
"""Function phi(t) = t**degree."""
|
| 31 |
+
def __init__(self, degree):
|
| 32 |
+
self.degree = degree.unsqueeze(0)
|
| 33 |
+
|
| 34 |
+
def __len__(self):
|
| 35 |
+
"""Number of basis functions."""
|
| 36 |
+
return self.degree.size(1)
|
| 37 |
+
|
| 38 |
+
def evaluate(self, t):
|
| 39 |
+
return t**self.degree
|
| 40 |
+
|
| 41 |
+
def integrate_t2_times_psi(self, a, b):
|
| 42 |
+
"""Compute integral int_a^b (t**2) * psi(t)."""
|
| 43 |
+
return (b**(self.degree + 3) - a**(self.degree + 3)) / (self.degree + 3)
|
| 44 |
+
|
| 45 |
+
def integrate_t_times_psi(self, a, b):
|
| 46 |
+
"""Compute integral int_a^b t * psi(t)."""
|
| 47 |
+
return (b**(self.degree + 2) - a**(self.degree + 2)) / (self.degree + 2)
|
| 48 |
+
|
| 49 |
+
def integrate_psi(self, a, b):
|
| 50 |
+
"""Compute integral int_a^b psi(t)."""
|
| 51 |
+
return (b**(self.degree + 1) - a**(self.degree + 1)) / (self.degree + 1)
|
| 52 |
+
|
| 53 |
+
def __repr__(self):
|
| 54 |
+
return f"PowerBasisFunction(degree={self.degree})"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class SineBasisFunctions(BasisFunctions):
|
| 58 |
+
"""Function phi(t) = sin(omega*t)."""
|
| 59 |
+
def __init__(self, omega):
|
| 60 |
+
self.omega = omega.unsqueeze(0)
|
| 61 |
+
|
| 62 |
+
def __repr__(self):
|
| 63 |
+
return f"SineBasisFunction(omega={self.omega})"
|
| 64 |
+
|
| 65 |
+
def __len__(self):
|
| 66 |
+
"""Number of basis functions."""
|
| 67 |
+
return self.omega.size(1)
|
| 68 |
+
|
| 69 |
+
def evaluate(self, t):
|
| 70 |
+
return torch.sin(self.omega*t)
|
| 71 |
+
|
| 72 |
+
def integrate_t2_times_psi(self, a, b):
|
| 73 |
+
"""Compute integral int_a^b (t**2) * psi(t)."""
|
| 74 |
+
# The antiderivative of (t**2)*sin(omega*t) is
|
| 75 |
+
# ((2-(t**2)*(omega**2))*cos(omega*t) + 2*omega*t*sin(omega*t)) / omega**3. # noqa
|
| 76 |
+
return ((2-(b**2)*(self.omega**2))*torch.cos(self.omega*b)
|
| 77 |
+
+ 2*self.omega*b*torch.sin(self.omega*b)
|
| 78 |
+
- (2-(a**2)*(self.omega**2))*torch.cos(self.omega*a)
|
| 79 |
+
- 2*self.omega*a*torch.sin(self.omega*a)
|
| 80 |
+
) / (self.omega**3)
|
| 81 |
+
|
| 82 |
+
def integrate_t_times_psi(self, a, b):
|
| 83 |
+
"""Compute integral int_a^b t * psi(t)."""
|
| 84 |
+
# The antiderivative of t*sin(omega*t) is
|
| 85 |
+
# (sin(omega*t) - omega*t*cos(omega*t)) / omega**2.
|
| 86 |
+
return (torch.sin(self.omega*b) - self.omega*b*torch.cos(self.omega*b)
|
| 87 |
+
- torch.sin(self.omega*a) + self.omega*a*torch.cos(self.omega*a)
|
| 88 |
+
) / (self.omega**2)
|
| 89 |
+
|
| 90 |
+
def integrate_psi(self, a, b):
|
| 91 |
+
"""Compute integral int_a^b psi(t)."""
|
| 92 |
+
# The antiderivative of sin(omega*t) is -cos(omega*t)/omega.
|
| 93 |
+
return (-torch.cos(self.omega*b) + torch.cos(self.omega*a)) / self.omega
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class CosineBasisFunctions(BasisFunctions):
    """Function phi(t) = cos(omega*t)."""
    def __init__(self, omega):
        # Shape (1, num_basis): one cosine per frequency; the leading
        # singleton dimension lets evaluations broadcast over t.
        self.omega = omega.unsqueeze(0)

    def __repr__(self):
        return f"CosineBasisFunction(omega={self.omega})"

    def __len__(self):
        """Number of basis functions."""
        return self.omega.size(1)

    def evaluate(self, t):
        """Evaluate cos(omega*t) for every frequency (broadcasts over t)."""
        return torch.cos(self.omega*t)

    def integrate_t2_times_psi(self, a, b):
        """Compute integral int_a^b (t**2) * psi(t)."""
        # The antiderivative of (t**2)*cos(omega*t) is
        # (((t**2)*(omega**2)-2)*sin(omega*t) + 2*omega*t*cos(omega*t)) / omega**3, # noqa
        # which the expression below evaluates at b and at a.
        # (Comment fixed: an earlier note wrote cos for the first term,
        # but the code correctly uses sin.)
        return (((b**2)*(self.omega**2)-2)*torch.sin(self.omega*b)
                + 2*self.omega*b*torch.cos(self.omega*b)
                - ((a**2)*(self.omega**2)-2)*torch.sin(self.omega*a)
                - 2*self.omega*a*torch.cos(self.omega*a)
                ) / (self.omega**3)

    def integrate_t_times_psi(self, a, b):
        """Compute integral int_a^b t * psi(t)."""
        # The antiderivative of t*cos(omega*t) is
        # (cos(omega*t) + omega*t*sin(omega*t)) / omega**2.
        return (torch.cos(self.omega*b) + self.omega*b*torch.sin(self.omega*b)
                - torch.cos(self.omega*a) - self.omega*a*torch.sin(self.omega*a)
                ) / (self.omega**2)

    def integrate_psi(self, a, b):
        """Compute integral int_a^b psi(t)."""
        # The antiderivative of cos(omega*t) is sin(omega*t)/omega.
        return (torch.sin(self.omega*b) - torch.sin(self.omega*a)) / self.omega
class GaussianBasisFunctions(BasisFunctions):
|
| 136 |
+
"""Function phi(t) = Gaussian(t; mu, sigma_sq)."""
|
| 137 |
+
def __init__(self, mu, sigma):
|
| 138 |
+
self.mu = mu.unsqueeze(0)
|
| 139 |
+
self.sigma = sigma.unsqueeze(0)
|
| 140 |
+
|
| 141 |
+
def __repr__(self):
|
| 142 |
+
return f"GaussianBasisFunction(mu={self.mu}, sigma={self.sigma})"
|
| 143 |
+
|
| 144 |
+
def __len__(self):
|
| 145 |
+
"""Number of basis functions."""
|
| 146 |
+
return self.mu.size(1)
|
| 147 |
+
|
| 148 |
+
def _phi(self, t):
|
| 149 |
+
return 1. / math.sqrt(2 * math.pi) * torch.exp(-.5 * t**2)
|
| 150 |
+
|
| 151 |
+
def _Phi(self, t):
|
| 152 |
+
return .5 * (1 + torch.erf(t / math.sqrt(2)))
|
| 153 |
+
|
| 154 |
+
def _integrate_product_of_gaussians(self, mu, sigma_sq):
|
| 155 |
+
sigma = torch.sqrt(self.sigma ** 2 + sigma_sq)
|
| 156 |
+
return self._phi((mu - self.mu) / sigma) / sigma
|
| 157 |
+
|
| 158 |
+
def evaluate(self, t):
|
| 159 |
+
return self._phi((t - self.mu) / self.sigma) / self.sigma
|
| 160 |
+
|
| 161 |
+
def batch_evaluate(self, t):
|
| 162 |
+
t_ = t.repeat(self.mu.size(0),1) - self.mu.repeat(t.size(0),1).transpose(1,0)
|
| 163 |
+
t_ = t_ / self.sigma.repeat((t.size(0),1)).transpose(1,0)
|
| 164 |
+
return (self._phi(t_) / self.sigma.repeat((t.size(0),1)).transpose(1,0)).transpose(0,1)
|
| 165 |
+
|
| 166 |
+
def integrate_t2_times_psi(self, a, b):
|
| 167 |
+
"""Compute integral int_a^b (t**2) * psi(t)."""
|
| 168 |
+
return (self.mu**2 + self.sigma**2) * (
|
| 169 |
+
self._Phi((b - self.mu) / self.sigma) - self._Phi((a - self.mu) / self.sigma)
|
| 170 |
+
) - (
|
| 171 |
+
self.sigma * (b + self.mu) * self._phi((b - self.mu) / self.sigma)
|
| 172 |
+
) + (
|
| 173 |
+
self.sigma * (a + self.mu) * self._phi((a - self.mu) / self.sigma)
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
def integrate_t_times_psi(self, a, b):
|
| 177 |
+
"""Compute integral int_a^b t * psi(t)."""
|
| 178 |
+
return self.mu * (
|
| 179 |
+
self._Phi((b - self.mu) / self.sigma) - self._Phi((a - self.mu) / self.sigma)
|
| 180 |
+
) - self.sigma * (
|
| 181 |
+
self._phi((b - self.mu) / self.sigma) - self._phi((a - self.mu) / self.sigma)
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
def integrate_psi(self, a, b):
|
| 185 |
+
"""Compute integral int_a^b psi(t)."""
|
| 186 |
+
return self._Phi((b - self.mu) / self.sigma) - self._Phi((a - self.mu) / self.sigma)
|
| 187 |
+
|
| 188 |
+
def integrate_t2_times_psi_gaussian(self, mu, sigma_sq):
|
| 189 |
+
"""Compute integral int N(t; mu, sigma_sq) * t**2 * psi(t)."""
|
| 190 |
+
S_tilde = self._integrate_product_of_gaussians(mu, sigma_sq)
|
| 191 |
+
mu_tilde = (
|
| 192 |
+
self.mu * sigma_sq + mu * self.sigma ** 2
|
| 193 |
+
) / (
|
| 194 |
+
self.sigma ** 2 + sigma_sq
|
| 195 |
+
)
|
| 196 |
+
sigma_sq_tilde = ((self.sigma ** 2) * sigma_sq) / (self.sigma ** 2 + sigma_sq)
|
| 197 |
+
return S_tilde * (mu_tilde ** 2 + sigma_sq_tilde)
|
| 198 |
+
|
| 199 |
+
def integrate_t_times_psi_gaussian(self, mu, sigma_sq):
|
| 200 |
+
"""Compute integral int N(t; mu, sigma_sq) * t * psi(t)."""
|
| 201 |
+
S_tilde = self._integrate_product_of_gaussians(mu, sigma_sq)
|
| 202 |
+
mu_tilde = (
|
| 203 |
+
self.mu * sigma_sq + mu * self.sigma ** 2
|
| 204 |
+
) / (
|
| 205 |
+
self.sigma ** 2 + sigma_sq
|
| 206 |
+
)
|
| 207 |
+
return S_tilde * mu_tilde
|
| 208 |
+
|
| 209 |
+
def integrate_psi_gaussian(self, mu, sigma_sq):
|
| 210 |
+
"""Compute integral int N(t; mu, sigma_sq) * psi(t)."""
|
| 211 |
+
return self._integrate_product_of_gaussians(mu, sigma_sq)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
class RetangularBasisFunctions(BasisFunctions):
|
| 215 |
+
"""Function phi(t) = Gaussian(t; mu, sigma_sq)."""
|
| 216 |
+
def __init__(self, mu, sigma):
|
| 217 |
+
self.mu = mu.unsqueeze(0)
|
| 218 |
+
self.width = sigma.unsqueeze(0)
|
| 219 |
+
|
| 220 |
+
def __repr__(self):
|
| 221 |
+
return f"GaussianBasisFunction(mu={self.mu}, sigma={self.sigma})"
|
| 222 |
+
|
| 223 |
+
def __len__(self):
|
| 224 |
+
"""Number of basis functions."""
|
| 225 |
+
return self.mu.size(1)
|
| 226 |
+
|
| 227 |
+
def batch_evaluate(self, t):
|
| 228 |
+
"""
|
| 229 |
+
Evaluate multiple time points against all rectangular basis functions.
|
| 230 |
+
Args:
|
| 231 |
+
t: Tensor of time values to evaluate, shape (num_points,).
|
| 232 |
+
Returns:
|
| 233 |
+
Tensor of evaluations, shape (num_basis, num_points).
|
| 234 |
+
"""
|
| 235 |
+
t = t.repeat(self.mu.size(0),1) # Shape: (1, num_points)
|
| 236 |
+
mu = self.mu.repeat(t.size(0),1).transpose(1,0) # Shape: (num_basis, 1)
|
| 237 |
+
width = self.width.repeat(t.size(0),1).transpose(1,0) # Shape: (num_basis, 1)
|
| 238 |
+
return ((t >= (mu - width / 2)) & (t < (mu + width / 2))).float().transpose(0,1)
|
| 239 |
+
|
| 240 |
+
def _Phi(self, t):
|
| 241 |
+
"""
|
| 242 |
+
Compute the step function for a single value of t.
|
| 243 |
+
Args:
|
| 244 |
+
t: A scalar or tensor of time values.
|
| 245 |
+
Returns:
|
| 246 |
+
Tensor of values indicating presence in each basis function's range.
|
| 247 |
+
"""
|
| 248 |
+
lower_bounds = self.mu - self.width / 2
|
| 249 |
+
upper_bounds = self.mu + self.width / 2
|
| 250 |
+
return ((t >= lower_bounds) & (t < upper_bounds)).float()
|
| 251 |
+
|
| 252 |
+
def evaluate(self, t):
|
| 253 |
+
"""
|
| 254 |
+
Evaluate the rectangular basis functions at a single point or array of points.
|
| 255 |
+
Args:
|
| 256 |
+
t: A scalar or 1D tensor of time values.
|
| 257 |
+
Returns:
|
| 258 |
+
Tensor of shape (num_basis,) for scalar input, or (num_basis, num_points) for tensor input.
|
| 259 |
+
"""
|
| 260 |
+
if t.ndim == 0: # Scalar input
|
| 261 |
+
return self._Phi(t)
|
| 262 |
+
else: # Tensor input
|
| 263 |
+
# Shape: (1, num_points)
|
| 264 |
+
lower_bounds = (self.mu - self.width / 2) # Shape: (num_basis, 1)
|
| 265 |
+
upper_bounds = (self.mu + self.width / 2) # Shape: (num_basis, 1)
|
| 266 |
+
return ((t >= lower_bounds) & (t < upper_bounds)).float()
|
long_term_attention_gibbs.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding: utf-8
|
| 2 |
+
"""
|
| 3 |
+
Attention modules
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.distributions as dist
|
| 8 |
+
|
| 9 |
+
from .basis_functions import (
|
| 10 |
+
PowerBasisFunctions,
|
| 11 |
+
SineBasisFunctions,
|
| 12 |
+
CosineBasisFunctions,
|
| 13 |
+
GaussianBasisFunctions,
|
| 14 |
+
RetangularBasisFunctions
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class LongTermAttention(nn.Module):
    """Continuous long-term attention over a compressed memory.

    The incoming key sequence is approximated by a continuous function
    expressed in a basis of rectangular bins; attention is computed as an
    expectation of the value function under a density over [0, 1].
    Instance state (`B_past`, `x_past`, and the attributes set in
    `forward`) persists across calls so the memory can be extended
    ("infinite memory") by re-sampling the previous approximation.

    NOTE(review): several constructor arguments (attn_drop, n_layers,
    mask, mask_type) are accepted but never used by the visible code.
    """
    def __init__(self, head_size:int , length: int, target_len:int, attn_func: str, attn_num_basis: int,
                 continuous: bool, attn_drop: float, infinite_memory: bool, n_layers: int,
                 n_heads: int, affines: bool, mask: bool, mask_type: str, kl_regularizer: bool, proj_key, proj_value, sigma_0, mu_0, sticky_memories, sigmas, tau, **kwargs):

        super(LongTermAttention, self).__init__()

        # Hard-coded default device; forward() overwrites this with the
        # device of the incoming keys, so 'cuda' only matters before the
        # first forward call.
        self.device = 'cuda'
        self.length = length #memory length
        self.target_len = target_len #target length / transformer length
        self.head_size = head_size
        self.attn_num_basis = attn_num_basis
        self.continuous = continuous # whether attention over memory vectors is continuous
        self.attn_func = attn_func # normalizing function
        self.n_head = n_heads
        self.sigmas = sigmas
        self.kl_regularizer = kl_regularizer
        self.sigma_0 = sigma_0
        self.mu_0 = mu_0
        # Externally-constructed projection modules applied to the basis
        # coefficients B in forward().
        self.proj_key = proj_key
        self.proj_value = proj_value

        self.affines=affines # whether mu, sigma should be computed using affine transformations


        self.sticky_memories=sticky_memories

        self.mem_threshold=2048
        self.infinite_memory = infinite_memory # whether the memory is infinite

        self.nb_samples=512 # number of samples used for update
        self.tau = tau #compressing factor
        self.count = 0

        self.x_past=None # previous memory vectors
        self.B_past=None # previous coefficient matrix

        self.ridge_penalty=0.5 # ridge penalty
        self.padding = True

        self.spacing='linear'

    def get_basis(self, length, target_len):
        """Build the rectangular basis, the ridge-regression operators Gs
        (one per admissible memory length), and the fixed sample matrix
        used by the infinite-memory update."""
        def compute_G(l, psi, positions, padding=True):
            # Ridge-regression operator mapping l observed vectors to N
            # basis coefficients: G = F^T (F F^T + lambda I)^{-1}, with
            # F[n, p] = psi_n(position_p).

            F = torch.zeros(self.attn_num_basis, positions.size(0))

            basis_functions = psi
            F[:, :] = basis_functions.evaluate(positions.unsqueeze(1)).t()

            I = torch.eye(self.attn_num_basis)
            G = F.t().matmul((F.matmul(F.t()) + self.ridge_penalty * I).inverse())

            if padding:
                # Drop the rows corresponding to the padded positions
                # outside [0, 1], keeping the central l rows.
                if l % 2:
                    G = G[((l-1)//2):(-(l-1)//2), :]
                else:
                    G = G[(l//2):-(l//2), :]

            return G.to(self.device)
        padding = self.padding
        attn_num_basis = self.attn_num_basis
        if self.continuous:

            self.psi=[None]
            self.Gs=[None for _ in range(length+1)]
            lengths=[]
            for i in range(length):
                self.psi.append([])
                # Precompute a G for every multiple of target_len up to length.
                if (i+1)%target_len==0:
                    lengths.append(i+1)
            if length not in lengths:
                lengths.append(length)
            for l in lengths:
                # get positions for memory vectors
                self.add_retangular_basis_functions(self.psi[l], attn_num_basis, device=self.device)

                if self.spacing=='linear':
                    if padding:
                        # Positions extend beyond [0, 1] so the regression is
                        # well-conditioned at the boundaries; the extra rows
                        # are cut off again inside compute_G.
                        if l % 2:
                            shift = 1 / float(l)
                            positions = torch.linspace(-.5+shift, 1.5-shift, 2*l-1).to(self.device)
                        else:
                            shift = 1 / float(2*l)
                            positions = torch.linspace(-.5+shift, 1.5-shift, 2*l).to(self.device)
                    else:
                        shift = 1 / float(2*l)
                        positions = torch.linspace(shift, 1-shift, l).to(self.device)
                elif self.spacing=='log':
                    if padding:
                        if l % 2:
                            shift = 1 / float(l)
                            positions = torch.linspace(-.5+shift, 1.5-shift, 2*l-1).to(self.device)
                        else:
                            shift = 1 / float(2*l)
                            positions = torch.linspace(-.5+shift, 1.5-shift, 2*l).to(self.device)

                    # Log-spaced interior positions; NOTE(review): if padding
                    # is False this branch reads `positions` before assigning
                    # it — latent bug, unreachable with the default padding=True.
                    pos = np.e**(np.log(1+1)*torch.arange(1,length+1)/length)-1
                    positions = torch.cat([positions[:int(l/2)],pos.to(self.device),positions[-int(l/2):]])

                else:
                    positions = np.e**(np.log(1+1)*torch.arange(1,length+1)/length)-1

                # compute basis functions
                self.Gs[l]=compute_G(l, self.psi[l][0], positions, padding=padding) # [L,N]
                self.positions = positions[int(l/2):-int(l/2)]

                # compute samples for memory update
                if self.infinite_memory:
                    tm_tau = torch.arange(1,self.nb_samples+1).float()
                    tm_l = torch.arange(self.nb_samples+1,length+self.nb_samples+1).float()
                    tm_tau = tm_tau*self.tau/self.nb_samples # positions of old vectors
                    tm_l = self.tau + (1-self.tau)*(tm_l-self.nb_samples)/length # positions of new vectors
                    positions_inf = torch.cat([tm_tau, tm_l],0).to(self.device) # positions

                    if padding:
                        if l % 2:
                            shift = 1 / float(length+self.nb_samples)
                            positions_pad = torch.linspace(-.5+shift, 1.5-shift, 2*(length+self.nb_samples)-1).to(self.device)
                        else:
                            shift = 1 / float(2*length+self.nb_samples)
                            positions_pad = torch.linspace(-.5+shift, 1.5-shift, 2*(length+self.nb_samples)).to(self.device)
                        # Keep only the padding positions that fall outside
                        # [0, 1] and wrap them around the real positions.
                        positions_pad_ = torch.FloatTensor([i for i in positions_pad if i<0]).to(self.device)
                        positions_pad__ = torch.FloatTensor([i for i in positions_pad if i>1]).to(self.device)
                        positions_inf = torch.cat([positions_pad_,positions_inf,positions_pad__], dim=0)

                    # Evaluate the basis at the contracted old positions
                    # t/tau; rows are stacked into [nb_samples, N].
                    self.samples=None
                    for t in tm_tau:
                        if self.samples is None:
                            self.samples = self.psi[l][0].evaluate(t/self.tau)
                        else:
                            self.samples = torch.cat([self.samples,self.psi[l][0].evaluate(t/self.tau)], dim=0)

                    # compute G for the infinite case
                    self.G_inf = compute_G(self.nb_samples+length, self.psi[l][0], positions_inf, padding=padding) #[L+nb_samples,N]

                    if self.sticky_memories:
                        # Histogram bins over [0, 1] used to resample past
                        # positions proportionally to their attention mass.
                        self.bins = torch.linspace(0,1,129).to(device=self.device) #self.positions
                        self.nb_bins_cat=1
                        self.bins_cat = dist.Categorical(torch.ones(self.nb_bins_cat))

    def add_gaussian_basis_functions(self, psi, nb_basis, sigmas, device):
        """Append a GaussianBasisFunctions object (grid of mu x sigma) to psi."""
        # NOTE(review): requires nb_basis to be divisible by len(sigmas);
        # the assert below fails otherwise.
        mu, sigma = torch.meshgrid(torch.linspace(0, 1, nb_basis // len(sigmas)), torch.Tensor(sigmas))
        mu = mu.flatten().to(device)
        sigma = sigma.flatten().to(device)
        self.basis_mu=mu
        self.basis_sigma=sigma
        assert mu.size(0) == nb_basis
        psi.append(GaussianBasisFunctions(mu=mu, sigma=sigma))

    def add_retangular_basis_functions(self, psi, nb_basis, device):
        """Append a RetangularBasisFunctions object with nb_basis equal-width
        bins partitioning [0, 1] to psi."""
        width = torch.ones(nb_basis, device=device) / nb_basis

        # Compute the centers (midpoints) of each bin
        edges = torch.linspace(0, 1, nb_basis + 1, device=device)
        mu = (edges[:-1] + edges[1:]) / 2
        psi.append(RetangularBasisFunctions(mu=mu, sigma=width))

    def value_function(self, x, inf=False):
        """Project observed vectors x onto basis coefficients B via the
        precomputed ridge-regression operator (G_inf when extending the
        infinite memory, otherwise the operator matching x's length)."""
        if inf:
            G = self.G_inf # [nb_sample+L,N]
        else:
            G = self.Gs[x.size(-1)] # [L,N]
        B = torch.matmul(x, G) # [B,e,N]
        B = B.permute(0,2,1) # [B,N,e]

        return B

    def update_inf(self, x):
        """Extend the continuous memory: reconstruct sampled past vectors
        from B_past, concatenate with the new vectors x, and re-fit B."""
        if self.B_past is not None:
            if self.sticky_memories:
                # Sample past positions proportionally to attention mass
                # ("sticky memories") instead of uniformly.
                bins = self.bins.clone()
                # Nudge the outer edges so boundary values fall inside a bin.
                bins[0]=-.000001
                bins[-1]=1.000001
                prob_density = self.compute_probability(self.score, t=bins)
                cum_prob = torch.cumulative_trapezoid(prob_density, bins, dim=-1).to(self.device)
                p = (cum_prob[..., 1:] - cum_prob[..., :-1]).sum(dim=(1, 2))
                p = p / p.sum(-1, keepdim=True) # Normalize over the last dimension (bins)
                p = dist.Categorical(p)
                b = p.sample((self.nb_samples,))
                t = self.bins_cat.sample((self.nb_samples, 1)).to(device=self.device)
                # Map each sampled bin index to a concrete position in [0, 1].
                ts = (t*(self.bins[b+1]-self.bins[b])/self.nb_bins_cat +self.bins[b]).transpose(1,0)
                samples = self.psi[self.length][0].batch_evaluate(ts[0]).contiguous()

                xm_tau = self.B_past.transpose(-1,-2).matmul(samples.transpose(-1,-2)) # [B,e,nb_samples]
            else:
                # Reconstruct past vectors at the fixed contracted positions.
                xm_tau = self.B_past.transpose(-1,-2).matmul(self.samples.transpose(-1,-2)) # [B,e,nb_samples]


            x = torch.cat([xm_tau,x], dim=2) # [B,e,nb_samples+L]
            B = self.value_function(x, inf=True) # [B,N,e]
        else:
            B = self.value_function(x)

        # Detach so gradients do not flow through previous segments.
        self.B_past=B.detach()
        self.x_past=x
        return B

    def score(self, t):
        """Unnormalized attention score z(t) at continuous position(s) t.

        Reads self.psis / self.queries / self.keys / self.d_head, which are
        set by expected_value() and forward() before this is called.
        """
        psis = self.psis[0].batch_evaluate(t)
        query = self.queries/ (self.d_head ** 0.5) # divide by sqrt(d_head) [B,h,q,d]
        keys = self.keys.transpose(-1, -2)
        keys = torch.matmul(keys, psis.T) #[B,h,d,1]
        scores = torch.matmul(query, keys) #[B,h,q,1]
        return scores

    def compute_probability(self, score_fn, num_points=1000, t=None):
        """
        Compute probability distribution p(t).

        Args:
            score_fn (callable): Function that computes z(t)
            num_points (int): Number of points for numerical integration
            t: Optional explicit integration points; a uniform grid over
               [0, 1] is used when omitted.

        Returns:
            Tensor of probabilities p(t) = exp(z(t)) / int exp(z(t)) dt,
            normalized by trapezoidal integration over t.
        """
        if t is None:
            # Create integration points
            t = torch.linspace(0, 1, num_points).to(self.device)

        scores = score_fn(t)
        prob = torch.exp(scores) / torch.trapz(torch.exp(scores), t, dim=-1).unsqueeze(-1)
        return prob

    def expected_value(self, score_fn, num_points=1000):
        """
        Compute expected value E_p[V(t)] using nested integration.

        Args:
            score_fn (callable): Function that computes z(t)
            num_points (int): Number of points for numerical integration

        Returns:
            torch.Tensor: Expected value, shape [B, h, q, d]

        Side effects: rebuilds self.psis, which score() reads.
        """
        # Create integration points
        t = torch.linspace(0, 1, num_points).to(self.device)

        # Compute basis functions
        self.psis = []
        self.add_retangular_basis_functions(self.psis, self.attn_num_basis, self.device)
        psi = self.psis[0].batch_evaluate(t)
        # Compute probability distribution
        prob = self.compute_probability(score_fn, num_points)
        # Compute values at integration points
        values = self.values
        # Compute p(t) * psi(t)
        # Reshape psi for broadcasting to match the shape of prob
        psi_broadcasted = psi.unsqueeze(1).unsqueeze(2).unsqueeze(3)

        # Expand psi to match the dimensions of prob (num_points, batch_size, n_head, qlen, 256)
        psi_broadcasted = psi_broadcasted.expand(num_points, self.batch_size, self.n_head, self.qlen, self.attn_num_basis)
        # Outer-product p(t) with psi(t) pointwise, then reorder so the
        # integration axis (num_points) is last for trapz.
        integrand = torch.matmul(prob.permute(3,0,1,2).unsqueeze(-1).unsqueeze(-1), psi_broadcasted.unsqueeze(-2)).permute(1, 2, 3, 4, 5, 0).squeeze(-3)

        integral = torch.trapz(integrand, t, dim=-1)
        # Matrix multiply with values
        expected_value = torch.matmul(integral, values) # [B, h, q, d]

        return expected_value

    def forward(self, k, q, new_doc, layer_n):
        """Run continuous long-term attention.

        Args:
            k: Key/memory features; assumed layout is
               (batch, klen*14*14, 1024) — TODO confirm against the caller
               (looks like a 14x14 ViT patch grid per memory slot).
            q: Queries, shape (batch, qlen, n_head*d_head).
            new_doc: Truthy when a new document starts; clears past memory.
            layer_n: Layer index (unused in the visible code).
        """
        self.device = k.device
        if self.continuous:
            # Number of memory slots; hard-codes the 14x14 patch grid.
            klen = int(k.size(1)/(14*14))
            self.length = klen
            batch_size = k.size(0) #batch size
            qlen = q.size(1) #query length
            self.qlen = qlen
            self.batch_size = batch_size
            self.d_head = self.head_size #head size
            # NOTE(review): rebuilds the basis and G operators on every
            # forward call.
            self.get_basis(klen, klen)
            # clean memory if going through different document
            if new_doc:
                self.B_past=None
                self.x_past=None

            # Average each slot's 14x14 patch features; 1024 is the
            # hard-coded feature size — TODO confirm.
            k = k.reshape(batch_size, klen, 14, 14, 1024).mean(dim=(2, 3))
            k = k.transpose(1,2)
            # perform memory update
            if self.infinite_memory:
                B = self.update_inf(k)
            else: # compute input continuous approximation
                B = self.value_function(k) # [B,N,e]
            keys = self.proj_key(B)
            values = self.proj_value(B)
            query = q
            # Stash per-call tensors that score()/expected_value() read.
            self.queries = query.view(batch_size,qlen,self.n_head,self.d_head).transpose(1,2) # [B,h,q,d]
            self.keys = keys.view(batch_size,self.attn_num_basis,self.n_head,self.d_head).transpose(1,2) # [B,h,N,d]
            self.values = values.view(batch_size,self.attn_num_basis,self.n_head,self.d_head).transpose(1,2) # [B, h, q, N]
            context = self.expected_value(self.score) # Shape [1, 32, 768]

            # NOTE(review): the leading 1 hard-codes batch size — confirm
            # callers always pass batch_size == 1.
            return context.contiguous().transpose(1,2).reshape(1, qlen, -1)