Delete iris/models

Browse files

Files changed (6) hide show

iris/models/__init__.py +0 -0
iris/models/kv_caching.py +0 -106
iris/models/lpips.py +0 -167
iris/models/nets.py +0 -345
iris/models/slicer.py +0 -53
iris/models/transformer.py +0 -101

iris/models/__init__.py DELETED Viewed

File without changes

iris/models/kv_caching.py DELETED Viewed

@@ -1,106 +0,0 @@
-from typing import Tuple
-import numpy as np
-import torch
class Cache:
    """Pre-allocated buffer of shape (B, nh, T, hs) that is filled along the token axis.

    Only the first ``_size`` token slots hold written data; ``get`` and ``shape``
    expose that filled prefix rather than the whole allocation.
    """

    def __init__(self, num_samples: int, num_heads: int, max_tokens: int, embed_dim: int, device: torch.device) -> None:
        assert embed_dim % num_heads == 0

        def allocate(n: int) -> torch.Tensor:
            # Storage is uninitialised; readers only ever see the filled prefix.
            return torch.empty(n, num_heads, max_tokens, embed_dim // num_heads, device=device)  # (B, nh, T, hs)

        self._n = num_samples
        self._cache = None
        self._size = None
        self._reset = allocate
        self.reset()

    @property
    def shape(self) -> Tuple[int, int, int, int]:
        """Shape of the filled part of the buffer: (B, nh, tokens written so far, hs)."""
        batch, heads, _, head_dim = self._cache.shape
        return batch, heads, self._size, head_dim

    def reset(self) -> None:
        """Allocate a fresh buffer for the current number of samples and mark it empty."""
        self._cache = self._reset(self._n)
        self._size = 0

    def prune(self, mask: np.ndarray) -> None:
        """Keep only the samples selected by the 1-D ``mask`` (one entry per sample)."""
        assert mask.ndim == 1 and mask.shape[0] == self.shape[0]
        kept = self._cache[mask]
        self._cache = kept
        self._n = kept.shape[0]

    def get(self) -> torch.Tensor:
        """Return the filled prefix of the buffer along the token axis."""
        return self._cache[:, :, :self._size, :]

    def update(self, x: torch.Tensor) -> None:
        """Append ``x`` (same B, nh and hs as the buffer) after the tokens already stored."""
        buffer = self._cache
        assert (x.ndim == buffer.ndim) and all(x.size(axis) == buffer.size(axis) for axis in (0, 1, 3))
        end = self._size + x.size(2)
        assert end <= buffer.shape[2]
        self._cache = AssignWithoutInplaceCheck.apply(buffer, x, 2, self._size, end)
        self._size = end
class KVCache:
    """Pair of ``Cache`` buffers, one for keys and one for values, built with the same arguments."""

    def __init__(self, n: int, num_heads: int, max_tokens: int, embed_dim: int, device: torch.device) -> None:
        cache_args = (n, num_heads, max_tokens, embed_dim, device)
        self._k_cache = Cache(*cache_args)
        self._v_cache = Cache(*cache_args)

    @property
    def shape(self) -> Tuple[int, int, int, int]:
        """Shape reported by the key cache."""
        return self._k_cache.shape

    def reset(self) -> None:
        """Reset the key cache, then the value cache."""
        for cache in (self._k_cache, self._v_cache):
            cache.reset()

    def prune(self, mask: np.ndarray) -> None:
        """Apply the same sample mask to the key cache and the value cache."""
        for cache in (self._k_cache, self._v_cache):
            cache.prune(mask)

    def get(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(keys, values)`` as currently stored."""
        keys = self._k_cache.get()
        values = self._v_cache.get()
        return keys, values

    def update(self, k: torch.Tensor, v: torch.Tensor):
        """Append ``k`` to the key cache and ``v`` to the value cache."""
        self._k_cache.update(k)
        self._v_cache.update(v)
class KeysValues:
    """Immutable tuple of ``KVCache`` objects, one per layer, indexable by layer number."""

    def __init__(self, n: int, num_heads: int, max_tokens: int, embed_dim: int, num_layers: int, device: torch.device) -> None:
        per_layer = []
        for _ in range(num_layers):
            per_layer.append(KVCache(n, num_heads, max_tokens, embed_dim, device))
        self._keys_values = tuple(per_layer)

    def __getitem__(self, key: int) -> KVCache:
        return self._keys_values[key]

    def __len__(self):
        return len(self._keys_values)

    @property
    def size(self):
        """Number of tokens currently stored, read from the first layer's cache."""
        _, _, num_tokens, _ = self._keys_values[0].shape
        return num_tokens

    def reset(self) -> None:
        """Reset every layer's cache."""
        for layer_cache in self._keys_values:
            layer_cache.reset()

    def prune(self, mask: np.ndarray) -> None:
        """Apply the same sample mask to every layer's cache."""
        for layer_cache in self._keys_values:
            layer_cache.prune(mask)
class AssignWithoutInplaceCheck(torch.autograd.Function):
    """
    Write ``value`` into ``input[..., start:stop]`` along ``dim`` through ``input.data`` and return ``input``.

    Inspired from : https://discuss.pytorch.org/t/disable-in-place-correctness-version-check-any-other-workaround/90738/4
    Warning : do not use it to overwrite a slice twice.
    """

    @staticmethod
    def get_slice(dim: int, start: int, stop: int) -> Tuple[slice]:
        """Build an index that takes everything on the first ``dim`` axes and ``start:stop`` on axis ``dim``."""
        leading = (slice(None),) * dim
        return leading + (slice(start, stop),)

    @staticmethod
    def forward(ctx, input: torch.Tensor, value: torch.Tensor, dim: int, start: int, stop: int) -> torch.Tensor:
        # Remember where the write happened so backward can route the gradient of `value`.
        ctx.dim, ctx.start, ctx.stop = dim, start, stop
        region = AssignWithoutInplaceCheck.get_slice(dim, start, stop)
        input.data[region] = value
        return input

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor]:
        # One gradient per forward argument: input, value, then None for dim/start/stop.
        region = AssignWithoutInplaceCheck.get_slice(ctx.dim, ctx.start, ctx.stop)
        grad_value = grad_out[region]
        return grad_out, grad_value, None, None, None

iris/models/lpips.py DELETED Viewed

@@ -1,167 +0,0 @@
-"""
-Credits to https://github.com/CompVis/taming-transformers
-"""
-from collections import namedtuple
-import hashlib
-import os
-from pathlib import Path
-import requests
-import torch
-import torch.nn as nn
-from torchvision import models
-from tqdm import tqdm
class LPIPS(nn.Module):
    """Learned perceptual metric: per-level distances between VGG16 features of two images.

    All parameters are frozen after the pretrained linear heads have been loaded.
    """

    def __init__(self, use_dropout: bool = True):
        super().__init__()
        self.scaling_layer = ScalingLayer()
        self.chns = [64, 128, 256, 512, 512]  # channel widths of the five VGG16 feature levels
        self.net = vgg16(pretrained=True, requires_grad=False)
        # One 1x1-conv head per feature level, registered as lin0 ... lin4.
        for level, width in enumerate(self.chns):
            setattr(self, f"lin{level}", NetLinLayer(width, use_dropout=use_dropout))
        self.load_from_pretrained()
        for frozen in self.parameters():
            frozen.requires_grad = False

    def load_from_pretrained(self) -> None:
        """Fetch the checkpoint into the user's cache directory if needed and load it non-strictly."""
        cache_root = Path.home() / ".cache/iris/tokenizer_pretrained_vgg"
        ckpt = get_ckpt_path(name="vgg_lpips", root=cache_root)  # Download VGG if necessary
        # NOTE(review): torch.load unpickles the file; consider weights_only=True if the torch version allows it.
        weights = torch.load(ckpt, map_location=torch.device("cpu"))
        self.load_state_dict(weights, strict=False)

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Return the sum over feature levels of the spatially averaged, linearly weighted squared feature differences."""
        feats_input = self.net(self.scaling_layer(input))
        feats_target = self.net(self.scaling_layer(target))
        heads = (self.lin0, self.lin1, self.lin2, self.lin3, self.lin4)
        total = None
        for head, f_in, f_tg in zip(heads, feats_input, feats_target):
            sq_diff = (normalize_tensor(f_in) - normalize_tensor(f_tg)) ** 2
            level_score = spatial_average(head.model(sq_diff), keepdim=True)
            total = level_score if total is None else total + level_score
        return total
class ScalingLayer(nn.Module):
    """Per-channel affine rescaling ``(inp - shift) / scale`` with fixed three-channel constants."""

    def __init__(self) -> None:
        super().__init__()
        # Buffers are shaped (1, 3, 1, 1) so they broadcast over batch and spatial axes.
        shift = torch.Tensor([-.030, -.088, -.188]).view(1, 3, 1, 1)
        scale = torch.Tensor([.458, .448, .450]).view(1, 3, 1, 1)
        self.register_buffer('shift', shift)
        self.register_buffer('scale', scale)

    def forward(self, inp: torch.Tensor) -> torch.Tensor:
        centered = inp - self.shift
        return centered / self.scale
class NetLinLayer(nn.Module):
    """ A single linear layer which does a 1x1 conv, optionally preceded by dropout """

    def __init__(self, chn_in: int, chn_out: int = 1, use_dropout: bool = False) -> None:
        super().__init__()
        stages = []
        if use_dropout:
            stages.append(nn.Dropout())
        # Bias-free 1x1 convolution: a per-pixel linear map over channels.
        stages.append(nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*stages)
class vgg16(torch.nn.Module):
    """torchvision VGG16 feature extractor cut into five consecutive slices whose outputs are all returned."""

    def __init__(self, requires_grad: bool = False, pretrained: bool = True) -> None:
        super().__init__()
        features = models.vgg16(pretrained=pretrained).features
        # (attribute name, first layer index, one-past-last layer index) for each slice.
        layout = (
            ("slice1", 0, 4),
            ("slice2", 4, 9),
            ("slice3", 9, 16),
            ("slice4", 16, 23),
            ("slice5", 23, 30),
        )
        for attr, _, _ in layout:
            setattr(self, attr, torch.nn.Sequential())
        self.N_slices = 5
        for attr, first, last in layout:
            stage = getattr(self, attr)
            for idx in range(first, last):
                # Layers keep their original index as their name inside the slice.
                stage.add_module(str(idx), features[idx])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Run the five slices in sequence and return their outputs as a ``VggOutputs`` named tuple."""
        activations = []
        h = X
        for stage in (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5):
            h = stage(h)
            activations.append(h)
        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
        return vgg_outputs(*activations)
def normalize_tensor(x: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """Divide ``x`` by its L2 norm over dim 1; ``eps`` is added to the norm to avoid division by zero."""
    l2_norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()
    denominator = l2_norm + eps
    return x / denominator
def spatial_average(x: torch.Tensor, keepdim: bool = True) -> torch.Tensor:
    """Average ``x`` over dims 2 and 3, keeping them as size-1 axes when ``keepdim`` is true."""
    spatial_dims = (2, 3)
    return torch.mean(x, dim=spatial_dims, keepdim=keepdim)
-# ********************************************************************
-# *************** Utilities to download pretrained vgg ***************
-# ********************************************************************
# Download URL for each supported checkpoint, keyed by checkpoint name.
URL_MAP = {
    "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
}
# File name under which each checkpoint is stored inside the cache root.
CKPT_MAP = {
    "vgg_lpips": "vgg.pth"
}
# Expected MD5 hex digest of each checkpoint file, compared against the downloaded file.
MD5_MAP = {
    "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
}
def download(url: str, local_path: str, chunk_size: int = 1024) -> None:
    """Stream ``url`` to ``local_path`` while showing a byte-level progress bar.

    Args:
        url: Address to fetch with an HTTP GET.
        local_path: Destination file; its parent directory is created if missing.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status. Nothing
            is written to ``local_path`` in that case.
    """
    parent_dir = os.path.dirname(local_path)
    if parent_dir:  # a bare file name has no directory to create
        os.makedirs(parent_dir, exist_ok=True)
    with requests.get(url, stream=True) as r:
        # Fail before opening the file so an HTTP error page is never saved as the payload.
        r.raise_for_status()
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar, open(local_path, "wb") as f:
            for data in r.iter_content(chunk_size=chunk_size):
                if data:
                    f.write(data)
                    # Advance by what was actually received; the last chunk is usually shorter.
                    pbar.update(len(data))
def md5_hash(path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is hashed in fixed-size chunks, so memory use stays bounded
    regardless of file size.
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter(callable, sentinel) stops at the first empty read, i.e. at end of file.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
def get_ckpt_path(name: str, root: str, check: bool = False) -> str:
    """Return the local path of checkpoint ``name``, downloading it first when needed.

    Args:
        name: Checkpoint key; must be present in ``URL_MAP``.
        root: Directory in which the checkpoint file is stored.
        check: When true, an existing file is re-hashed and downloaded again if
            its MD5 does not match ``MD5_MAP``.

    Returns:
        Path of the checkpoint file inside ``root``.

    Raises:
        AssertionError: If ``name`` is unknown, or if the freshly downloaded
            file does not match the expected MD5 (the message is the digest
            that was actually obtained).
    """
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    expected_md5 = MD5_MAP[name]
    needs_download = not os.path.exists(path) or (check and md5_hash(path) != expected_md5)
    if needs_download:
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        if md5 != expected_md5:
            # Remove the corrupt file: otherwise a later call with check=False
            # would find it on disk and return it without any validation.
            os.remove(path)
            raise AssertionError(md5)
    return path

iris/models/nets.py DELETED Viewed

@@ -1,345 +0,0 @@
-"""
-Credits to https://github.com/CompVis/taming-transformers
-"""
-import torch
-import torch.nn as nn
class Encoder(nn.Module):
    """Convolutional encoder: input conv, residual stages with optional attention and
    downsampling between stages, a middle block, then a normalised output conv to
    ``config["z_channels"]`` channels.
    """

    def __init__(self, config: dict) -> None:
        super().__init__()
        self.config = config
        self.num_resolutions = len(config["ch_mult"])
        no_temb = 0  # residual blocks are built without a timestep-embedding projection

        # input convolution
        self.conv_in = torch.nn.Conv2d(config["in_channels"], config["ch"], kernel_size=3, stride=1, padding=1)

        # downsampling stages
        resolution = config["resolution"]
        input_mults = (1,) + tuple(config["ch_mult"])
        last_level = self.num_resolutions - 1
        self.down = nn.ModuleList()
        for level in range(self.num_resolutions):
            res_blocks = nn.ModuleList()
            attn_blocks = nn.ModuleList()
            width_in = config["ch"] * input_mults[level]
            width_out = config["ch"] * config["ch_mult"][level]
            for _ in range(self.config["num_res_blocks"]):
                res_blocks.append(ResnetBlock(in_channels=width_in,
                                              out_channels=width_out,
                                              temb_channels=no_temb,
                                              dropout=config["dropout"]))
                width_in = width_out
                # Attention follows every residual block at the configured resolutions.
                if resolution in config["attn_resolutions"]:
                    attn_blocks.append(AttnBlock(width_in))
            stage = nn.Module()
            stage.block = res_blocks
            stage.attn = attn_blocks
            if level != last_level:
                stage.downsample = Downsample(width_in, with_conv=True)
                resolution = resolution // 2
            self.down.append(stage)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=width_in, out_channels=width_in,
                                       temb_channels=no_temb, dropout=config["dropout"])
        self.mid.attn_1 = AttnBlock(width_in)
        self.mid.block_2 = ResnetBlock(in_channels=width_in, out_channels=width_in,
                                       temb_channels=no_temb, dropout=config["dropout"])

        # output head
        self.norm_out = Normalize(width_in)
        self.conv_out = torch.nn.Conv2d(width_in, config["z_channels"], kernel_size=3, stride=1, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` by running the stages in construction order."""
        temb = None  # no timestep embedding is used
        last_level = self.num_resolutions - 1

        # downsampling: each stage consumes the output of the previous one
        h = self.conv_in(x)
        for level in range(self.num_resolutions):
            stage = self.down[level]
            for idx in range(self.config["num_res_blocks"]):
                h = stage.block[idx](h, temb)
                if len(stage.attn) > 0:
                    h = stage.attn[idx](h)
            if level != last_level:
                h = stage.downsample(h)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # output head
        h = nonlinearity(self.norm_out(h))
        return self.conv_out(h)
class Decoder(nn.Module):
    """Convolutional decoder: input conv from ``config["z_channels"]``, a middle block,
    residual stages with optional attention and upsampling between stages, then a
    normalised output conv to ``config["out_ch"]`` channels.
    """

    def __init__(self, config: dict) -> None:
        super().__init__()
        self.config = config
        temb_ch = 0  # residual blocks are built without a timestep-embedding projection
        self.num_resolutions = len(config["ch_mult"])
        # compute block_in and curr_res at lowest res
        block_in = config["ch"] * config["ch_mult"][self.num_resolutions - 1]
        curr_res = config["resolution"] // 2 ** (self.num_resolutions - 1)
        # Build the tuple outside the f-string: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        latent_shape = (config["z_channels"], curr_res, curr_res)
        print(f"Tokenizer : shape of latent is {latent_shape}.")
        # z to block_in
        self.conv_in = torch.nn.Conv2d(config["z_channels"],
                                       block_in,
                                       kernel_size=3,
                                       stride=1,
                                       padding=1)
        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=temb_ch,
                                       dropout=config["dropout"])
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=temb_ch,
                                       dropout=config["dropout"])
        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = config["ch"] * config["ch_mult"][i_level]
            # One more residual block per level than config["num_res_blocks"].
            for _ in range(config["num_res_blocks"] + 1):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=temb_ch,
                                         dropout=config["dropout"]))
                block_in = block_out
                if curr_res in config["attn_resolutions"]:
                    attn.append(AttnBlock(block_in))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, with_conv=True)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order
        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        config["out_ch"],
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """Decode ``z`` by running the middle block, then the stages from the last level down to level 0."""
        temb = None  # timestep embedding
        # z to block_in
        h = self.conv_in(z)
        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)
        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.config["num_res_blocks"] + 1):
                h = self.up[i_level].block[i_block](h, temb)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)
        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
def nonlinearity(x: torch.Tensor) -> torch.Tensor:
    """Swish activation: ``x`` multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
def Normalize(in_channels: int) -> nn.Module:
    """Build the affine 32-group GroupNorm (eps 1e-6) used throughout these networks."""
    group_norm = torch.nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
    return group_norm
class Upsample(nn.Module):
    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 convolution."""

    def __init__(self, in_channels: int, with_conv: bool) -> None:
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # Same-size 3x3 convolution applied after the interpolation.
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
class Downsample(nn.Module):
    """2x spatial downsampling: a stride-2 3x3 convolution on a manually padded input, or 2x2 average pooling."""

    def __init__(self, in_channels: int, with_conv: bool) -> None:
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # no asymmetric padding in torch conv, so forward pads by hand and the conv uses padding=0
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        # Zero-pad one pixel on the right and on the bottom only.
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
class ResnetBlock(nn.Module):
    """Pre-activation residual block: two (norm, swish, 3x3 conv) stages plus a skip connection.

    When the channel count changes, the skip path goes through a 3x3 conv
    (``conv_shortcut=True``) or a 1x1 conv (default). A linear projection of a
    timestep embedding is only created when ``temb_channels > 0``.
    """

    def __init__(self, *, in_channels: int, out_channels: int = None, conv_shortcut: bool = False,
                 dropout: float, temb_channels: int = 512) -> None:
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        # first stage
        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)

        # second stage
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

        # skip-path projection, only needed when the width changes
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                     kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                    kernel_size=1, stride=1, padding=0)

    def forward(self, x: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
        """Return ``skip(x) + residual(x)``; ``temb`` may be ``None`` to skip the embedding term."""
        h = self.conv1(nonlinearity(self.norm1(x)))
        if temb is not None:
            # Broadcast the projected embedding over the two spatial axes.
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
        h = nonlinearity(self.norm2(h))
        h = self.conv2(self.dropout(h))
        if self.in_channels == self.out_channels:
            return x + h
        shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
        return shortcut(x) + h
class AttnBlock(nn.Module):
    """Single-head self-attention over the spatial positions of a feature map, with a residual connection."""

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.norm = Normalize(in_channels)
        # Query, key, value and output projections are all 1x1 convolutions.
        for name in ("q", "k", "v", "proj_out"):
            setattr(self, name, torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        # compute attention weights between every pair of spatial positions
        b, c, h, w = q.shape
        positions = h * w
        q = q.reshape(b, c, positions).permute(0, 2, 1)  # b,hw,c
        k = k.reshape(b, c, positions)                   # b,c,hw
        scores = torch.bmm(q, k)                         # b,hw,hw    scores[b,i,j]=sum_c q[b,i,c]k[b,c,j]
        scores = scores * (int(c) ** (-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # attend to values
        v = v.reshape(b, c, positions)
        weights = weights.permute(0, 2, 1)               # b,hw,hw (first hw of k, second of q)
        attended = torch.bmm(v, weights)                 # b,c,hw (hw of q)
        attended = attended.reshape(b, c, h, w)
        return x + self.proj_out(attended)

iris/models/slicer.py DELETED Viewed

@@ -1,53 +0,0 @@
-import math
-from typing import List
-import torch
-import torch.nn as nn
class Slicer(nn.Module):
    """Precompute, for up to ``max_blocks`` repetitions of a block, the positions selected by ``block_mask``."""

    def __init__(self, max_blocks: int, block_mask: torch.Tensor) -> None:
        super().__init__()
        block_len = block_mask.size(0)
        self.block_size = block_len
        self.num_kept_tokens = block_mask.sum().long().item()
        # In-block positions of the kept tokens, tiled once per block ...
        positions_in_block = torch.where(block_mask)[0].repeat(max_blocks)
        # ... and the index of the block each of those entries belongs to.
        block_ids = torch.arange(max_blocks).repeat_interleave(self.num_kept_tokens)
        self.register_buffer('indices', positions_in_block + block_len * block_ids)

    def compute_slice(self, num_steps: int, prev_steps: int = 0) -> torch.Tensor:
        """Return the kept positions inside ``[prev_steps, prev_steps + num_steps)``, relative to ``prev_steps``."""
        total_steps = num_steps + prev_steps
        blocks_needed = math.ceil(total_steps / self.block_size)
        candidates = self.indices[:blocks_needed * self.num_kept_tokens]
        in_window = torch.logical_and(candidates >= prev_steps, candidates < total_steps)
        return candidates[in_window] - prev_steps

    def forward(self, *args, **kwargs):
        raise NotImplementedError
class Head(Slicer):
    """Apply ``head_module`` to the positions of a sequence selected by the slicer."""

    def __init__(self, max_blocks: int, block_mask: torch.Tensor, head_module: nn.Module) -> None:
        super().__init__(max_blocks, block_mask)
        assert isinstance(head_module, nn.Module)
        self.head_module = head_module

    def forward(self, x: torch.Tensor, num_steps: int, prev_steps: int) -> torch.Tensor:
        # x is (B, T, E); keep only the selected positions along T.
        kept_positions = self.compute_slice(num_steps, prev_steps)
        selected = x[:, kept_positions]
        return self.head_module(selected)
class Embedder(nn.Module):
    """Embed a token sequence using a different embedding table for each part of a block.

    ``block_masks`` must partition a block (exactly one mask is set at every
    position) and all tables must share the same embedding dimension.
    """

    def __init__(self, max_blocks: int, block_masks: List[torch.Tensor], embedding_tables: List[nn.Embedding]) -> None:
        super().__init__()
        assert len(block_masks) == len(embedding_tables)
        assert (sum(block_masks) == 1).all()  # block mask are a partition of a block
        self.embedding_dim = embedding_tables[0].embedding_dim
        assert all(table.embedding_dim == self.embedding_dim for table in embedding_tables)
        # Stored exactly as passed in; the slicers are kept in a plain Python list.
        self.embedding_tables = embedding_tables
        slicers = []
        for block_mask in block_masks:
            slicers.append(Slicer(max_blocks, block_mask))
        self.slicers = slicers

    def forward(self, tokens: torch.Tensor, num_steps: int, prev_steps: int) -> torch.Tensor:
        """Return a ``(*tokens.shape, embedding_dim)`` tensor filled table by table."""
        assert tokens.ndim == 2  # x is (B, T)
        output = torch.zeros((*tokens.size(), self.embedding_dim), device=tokens.device)
        for slicer, table in zip(self.slicers, self.embedding_tables):
            positions = slicer.compute_slice(num_steps, prev_steps)
            output[:, positions] = table(tokens[:, positions])
        return output

iris/models/transformer.py DELETED Viewed

@@ -1,101 +0,0 @@
-"""
-Credits to https://github.com/karpathy/minGPT
-"""
-from dataclasses import dataclass
-import math
-from typing import Optional
-from einops import rearrange
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from .kv_caching import KeysValues, KVCache
class Transformer(nn.Module):
    """Stack of ``Block`` layers with input dropout and a final LayerNorm.

    Construction writes ``config["max_tokens"]`` into the config dict that was
    passed in (the same dict object is kept as ``self.config``).
    """

    def __init__(self, config: dict) -> None:
        super().__init__()
        config["max_tokens"] = config["tokens_per_block"] * config["max_blocks"]
        self.config = config
        self.drop = nn.Dropout(config["embed_pdrop"])
        layers = []
        for _ in range(config["num_layers"]):
            layers.append(Block(config))
        self.blocks = nn.ModuleList(layers)
        self.ln_f = nn.LayerNorm(config["embed_dim"])

    def generate_empty_keys_values(self, n: int, max_tokens: int) -> KeysValues:
        """Create empty per-layer key/value caches for ``n`` samples on this module's device."""
        device = self.ln_f.weight.device  # Assumption that all submodules are on the same device
        cfg = self.config
        return KeysValues(n, cfg["num_heads"], max_tokens, cfg["embed_dim"], cfg["num_layers"], device)

    def forward(self, sequences: torch.Tensor, past_keys_values: Optional[KeysValues] = None) -> torch.Tensor:
        """Run every block on ``sequences``, giving block ``i`` the cache ``past_keys_values[i]`` when provided."""
        assert past_keys_values is None or len(past_keys_values) == len(self.blocks)
        use_cache = past_keys_values is not None
        hidden = self.drop(sequences)
        for layer_idx, block in enumerate(self.blocks):
            layer_cache = past_keys_values[layer_idx] if use_cache else None
            hidden = block(hidden, layer_cache)
        return self.ln_f(hidden)
class Block(nn.Module):
    """Pre-LayerNorm transformer block: self-attention then a 4x-wide GELU MLP, each with a residual connection."""

    def __init__(self, config: dict) -> None:
        super().__init__()
        self.ln1 = nn.LayerNorm(config["embed_dim"])
        self.ln2 = nn.LayerNorm(config["embed_dim"])
        self.attn = SelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config["embed_dim"], 4 * config["embed_dim"]),
            nn.GELU(),
            nn.Linear(4 * config["embed_dim"], config["embed_dim"]),
            nn.Dropout(config["resid_pdrop"]),
        )

    def forward(self, x: torch.Tensor, past_keys_values: Optional[KVCache] = None) -> torch.Tensor:
        """Apply attention and the MLP to ``x``.

        Args:
            x: Input activations.
            past_keys_values: Key/value cache of this layer only (a ``KVCache``,
                i.e. one element of a ``KeysValues``), or ``None``. It is passed
                unchanged to the attention module.
        """
        x_attn = self.attn(self.ln1(x), past_keys_values)
        x = x + x_attn
        x = x + self.mlp(self.ln2(x))
        return x
class SelfAttention(nn.Module):
    """Multi-head masked self-attention with an optional key/value cache.

    The mask is either lower-triangular (``'causal'``) or lower-triangular plus
    full attention inside each block of ``tokens_per_block`` tokens
    (``'block_causal'``).
    """

    def __init__(self, config: dict) -> None:
        super().__init__()
        embed_dim = config["embed_dim"]
        assert embed_dim % config["num_heads"] == 0
        assert config["attention"] in ('causal', 'block_causal')
        self.num_heads = config["num_heads"]
        self.key = nn.Linear(embed_dim, embed_dim)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.attn_drop = nn.Dropout(config["attn_pdrop"])
        self.resid_drop = nn.Dropout(config["resid_pdrop"])
        self.proj = nn.Linear(embed_dim, embed_dim)

        max_tokens = config["max_tokens"]
        block_len = config["tokens_per_block"]
        causal_mask = torch.tril(torch.ones(max_tokens, max_tokens))
        # Block-diagonal of ones: every token may also see the rest of its own block.
        full_blocks = [torch.ones(block_len, block_len) for _ in range(config["max_blocks"])]
        block_causal_mask = torch.max(causal_mask, torch.block_diag(*full_blocks))
        self.register_buffer('mask', causal_mask if config["attention"] == 'causal' else block_causal_mask)

    def forward(self, x: torch.Tensor, kv_cache: Optional[KVCache] = None) -> torch.Tensor:
        """Attend over ``x`` (B, T, C) and, when ``kv_cache`` is given, over the tokens it already holds."""
        B, T, C = x.size()
        L = 0  # number of tokens already present in the cache
        if kv_cache is not None:
            b, nh, L, c = kv_cache.shape
            assert nh == self.num_heads and b == B and c * nh == C

        head_dim = C // self.num_heads

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            return t.view(B, T, self.num_heads, head_dim).transpose(1, 2)  # (B, nh, T, hs)

        q = split_heads(self.query(x))
        k = split_heads(self.key(x))
        v = split_heads(self.value(x))

        if kv_cache is not None:
            # Store the new keys/values, then attend over everything cached so far.
            kv_cache.update(k, v)
            k, v = kv_cache.get()

        scale = 1.0 / math.sqrt(k.size(-1))
        att = (q @ k.transpose(-2, -1)) * scale
        # Rows L..L+T are the new queries; columns 0..L+T are all available keys.
        blocked = self.mask[L:L + T, :L + T] == 0
        att = att.masked_fill(blocked, float('-inf'))
        att = self.attn_drop(F.softmax(att, dim=-1))

        y = rearrange(att @ v, 'b h t e -> b t (h e)')
        return self.resid_drop(self.proj(y))