Upload 8 files
- iris/src/models/__init__.py +0 -0
- iris/src/models/kv_caching.py +106 -0
- iris/src/models/lpips.py +167 -0
- iris/src/models/nets.py +345 -0
- iris/src/models/slicer.py +53 -0
- iris/src/models/transformer.py +101 -0
- iris/src/tokenizer.py +81 -0
- iris/src/world_model.py +93 -0
iris/src/models/__init__.py
ADDED
File without changes.
iris/src/models/kv_caching.py
ADDED
```python
from typing import Tuple

import numpy as np
import torch


class Cache:
    def __init__(self, num_samples: int, num_heads: int, max_tokens: int, embed_dim: int, device: torch.device) -> None:
        assert embed_dim % num_heads == 0
        self._n, self._cache, self._size = num_samples, None, None
        self._reset = lambda n: torch.empty(n, num_heads, max_tokens, embed_dim // num_heads, device=device)  # (B, nh, T, hs)
        self.reset()

    @property
    def shape(self) -> Tuple[int, int, int, int]:
        n, num_heads, _, head_dim = self._cache.shape
        return n, num_heads, self._size, head_dim

    def reset(self) -> None:
        self._cache = self._reset(self._n)
        self._size = 0

    def prune(self, mask: np.ndarray) -> None:
        assert mask.ndim == 1 and mask.shape[0] == self.shape[0]
        self._cache = self._cache[mask]
        self._n = self._cache.shape[0]

    def get(self) -> torch.Tensor:
        return self._cache[:, :, :self._size, :]

    def update(self, x: torch.Tensor) -> None:
        assert (x.ndim == self._cache.ndim) and all([x.size(i) == self._cache.size(i) for i in (0, 1, 3)])
        assert self._size + x.size(2) <= self._cache.shape[2]
        self._cache = AssignWithoutInplaceCheck.apply(self._cache, x, 2, self._size, self._size + x.size(2))
        self._size += x.size(2)


class KVCache:
    def __init__(self, n: int, num_heads: int, max_tokens: int, embed_dim: int, device: torch.device) -> None:
        self._k_cache = Cache(n, num_heads, max_tokens, embed_dim, device)
        self._v_cache = Cache(n, num_heads, max_tokens, embed_dim, device)

    @property
    def shape(self) -> Tuple[int, int, int, int]:
        return self._k_cache.shape

    def reset(self) -> None:
        self._k_cache.reset()
        self._v_cache.reset()

    def prune(self, mask: np.ndarray) -> None:
        self._k_cache.prune(mask)
        self._v_cache.prune(mask)

    def get(self) -> Tuple[torch.Tensor, torch.Tensor]:
        return self._k_cache.get(), self._v_cache.get()

    def update(self, k: torch.Tensor, v: torch.Tensor):
        self._k_cache.update(k)
        self._v_cache.update(v)


class KeysValues:
    def __init__(self, n: int, num_heads: int, max_tokens: int, embed_dim: int, num_layers: int, device: torch.device) -> None:
        self._keys_values = tuple([KVCache(n, num_heads, max_tokens, embed_dim, device) for _ in range(num_layers)])

    def __getitem__(self, key: int) -> KVCache:
        return self._keys_values[key]

    def __len__(self):
        return len(self._keys_values)

    @property
    def size(self):
        return self._keys_values[0].shape[2]

    def reset(self) -> None:
        for kv_cache in self._keys_values:
            kv_cache.reset()

    def prune(self, mask: np.ndarray) -> None:
        for kv_cache in self._keys_values:
            kv_cache.prune(mask)


class AssignWithoutInplaceCheck(torch.autograd.Function):
    """
    Inspired by https://discuss.pytorch.org/t/disable-in-place-correctness-version-check-any-other-workaround/90738/4
    Warning: do not use it to overwrite a slice twice.
    """

    @staticmethod
    def get_slice(dim: int, start: int, stop: int) -> Tuple[slice]:
        return tuple([slice(None), ] * dim + [slice(start, stop)])

    @staticmethod
    def forward(ctx, input: torch.Tensor, value: torch.Tensor, dim: int, start: int, stop: int) -> torch.Tensor:
        ctx.dim = dim
        ctx.start = start
        ctx.stop = stop
        input.data[AssignWithoutInplaceCheck.get_slice(dim, start, stop)] = value
        return input

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor]:
        return grad_out, grad_out[AssignWithoutInplaceCheck.get_slice(ctx.dim, ctx.start, ctx.stop)], None, None, None
```
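A minimal usage sketch of the caching classes above may help; the shapes below are illustrative only, and the import assumes `kv_caching.py` is on `sys.path`.

```python
import numpy as np
import torch

from kv_caching import KeysValues  # assuming the module is importable as-is

# One KV cache per layer: batch of 2, 4 heads, room for 20 tokens, embed dim 32.
kvs = KeysValues(n=2, num_heads=4, max_tokens=20, embed_dim=32, num_layers=3, device=torch.device('cpu'))

# Append 5 timesteps of keys/values to layer 0; head size is embed_dim // num_heads = 8.
k = torch.randn(2, 4, 5, 8)
v = torch.randn(2, 4, 5, 8)
kvs[0].update(k, v)
print(kvs.size)  # 5: number of cached timesteps (read from layer 0)

# Drop the second sample from every layer's cache, e.g. when a rollout terminates.
kvs.prune(np.array([True, False]))
print(kvs[0].shape)  # (1, 4, 5, 8)
```

`AssignWithoutInplaceCheck` exists so the slice assignment into the preallocated buffer does not trip autograd's in-place version check while still routing gradients to the newly written slice.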
iris/src/models/lpips.py
ADDED
```python
"""
Credits to https://github.com/CompVis/taming-transformers
"""

from collections import namedtuple
import hashlib
import os
from pathlib import Path
import requests

import torch
import torch.nn as nn
from torchvision import models
from tqdm import tqdm


class LPIPS(nn.Module):
    # Learned perceptual metric
    def __init__(self, use_dropout: bool = True):
        super().__init__()
        self.scaling_layer = ScalingLayer()
        self.chns = [64, 128, 256, 512, 512]  # vgg16 features
        self.net = vgg16(pretrained=True, requires_grad=False)
        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
        self.load_from_pretrained()
        for param in self.parameters():
            param.requires_grad = False

    def load_from_pretrained(self) -> None:
        ckpt = get_ckpt_path(name="vgg_lpips", root=Path.home() / ".cache/iris/tokenizer_pretrained_vgg")  # Download VGG if necessary
        self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
        outs0, outs1 = self.net(in0_input), self.net(in1_input)
        feats0, feats1, diffs = {}, {}, {}
        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
        for kk in range(len(self.chns)):
            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2

        res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
        val = res[0]
        for i in range(1, len(self.chns)):
            val += res[i]
        return val


class ScalingLayer(nn.Module):
    def __init__(self) -> None:
        super(ScalingLayer, self).__init__()
        self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
        self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])

    def forward(self, inp: torch.Tensor) -> torch.Tensor:
        return (inp - self.shift) / self.scale


class NetLinLayer(nn.Module):
    """ A single linear layer which does a 1x1 conv """
    def __init__(self, chn_in: int, chn_out: int = 1, use_dropout: bool = False) -> None:
        super(NetLinLayer, self).__init__()
        layers = [nn.Dropout(), ] if (use_dropout) else []
        layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
        self.model = nn.Sequential(*layers)


class vgg16(torch.nn.Module):
    def __init__(self, requires_grad: bool = False, pretrained: bool = True) -> None:
        super(vgg16, self).__init__()
        vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        self.N_slices = 5
        for x in range(4):
            self.slice1.add_module(str(x), vgg_pretrained_features[x])
        for x in range(4, 9):
            self.slice2.add_module(str(x), vgg_pretrained_features[x])
        for x in range(9, 16):
            self.slice3.add_module(str(x), vgg_pretrained_features[x])
        for x in range(16, 23):
            self.slice4.add_module(str(x), vgg_pretrained_features[x])
        for x in range(23, 30):
            self.slice5.add_module(str(x), vgg_pretrained_features[x])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        h = self.slice1(X)
        h_relu1_2 = h
        h = self.slice2(h)
        h_relu2_2 = h
        h = self.slice3(h)
        h_relu3_3 = h
        h = self.slice4(h)
        h_relu4_3 = h
        h = self.slice5(h)
        h_relu5_3 = h
        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
        return out


def normalize_tensor(x: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    norm_factor = torch.sqrt(torch.sum(x ** 2, dim=1, keepdim=True))
    return x / (norm_factor + eps)


def spatial_average(x: torch.Tensor, keepdim: bool = True) -> torch.Tensor:
    return x.mean([2, 3], keepdim=keepdim)


# ********************************************************************
# *************** Utilities to download pretrained vgg ***************
# ********************************************************************


URL_MAP = {
    "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
}

CKPT_MAP = {
    "vgg_lpips": "vgg.pth"
}

MD5_MAP = {
    "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
}


def download(url: str, local_path: str, chunk_size: int = 1024) -> None:
    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        pbar.update(chunk_size)


def md5_hash(path: str) -> str:
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.md5(content).hexdigest()


def get_ckpt_path(name: str, root: str, check: bool = False) -> str:
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path
```
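A minimal sketch of calling the metric, assuming network access on first use (the VGG-LPIPS weights and torchvision's pretrained VGG-16 are downloaded then):

```python
import torch

from lpips import LPIPS  # assuming the module is importable as-is

lpips = LPIPS().eval()  # first call fetches vgg.pth into ~/.cache/iris/tokenizer_pretrained_vgg
x = torch.rand(1, 3, 64, 64) * 2 - 1  # inputs are expected in [-1, 1]
y = torch.rand(1, 3, 64, 64) * 2 - 1
with torch.no_grad():
    d = lpips(x, y)
print(d.shape)  # (1, 1, 1, 1): per-layer distances, spatially averaged, summed over layers
```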
iris/src/models/nets.py
ADDED
```python
"""
Credits to https://github.com/CompVis/taming-transformers
"""

import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, config: dict) -> None:
        super().__init__()
        self.config = config
        self.num_resolutions = len(config["ch_mult"])
        temb_ch = 0  # timestep embedding #channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(config["in_channels"], config["ch"], kernel_size=3, stride=1, padding=1)

        curr_res = config["resolution"]
        in_ch_mult = (1,) + tuple(config["ch_mult"])
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = config["ch"] * in_ch_mult[i_level]
            block_out = config["ch"] * config["ch_mult"][i_level]
            for i_block in range(self.config["num_res_blocks"]):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=temb_ch, dropout=config["dropout"]))
                block_in = block_out
                if curr_res in config["attn_resolutions"]:
                    attn.append(AttnBlock(block_in))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in, with_conv=True)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=temb_ch, dropout=config["dropout"])
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=temb_ch, dropout=config["dropout"])

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, config["z_channels"], kernel_size=3, stride=1, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        temb = None  # timestep embedding

        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.config["num_res_blocks"]):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(self, config: dict) -> None:
        super().__init__()
        self.config = config
        temb_ch = 0
        self.num_resolutions = len(config["ch_mult"])

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,) + tuple(config["ch_mult"])
        block_in = config["ch"] * config["ch_mult"][self.num_resolutions - 1]
        curr_res = config["resolution"] // 2 ** (self.num_resolutions - 1)
        print(f"Tokenizer : shape of latent is {config['z_channels'], curr_res, curr_res}.")

        # z to block_in
        self.conv_in = torch.nn.Conv2d(config["z_channels"], block_in, kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=temb_ch, dropout=config["dropout"])
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=temb_ch, dropout=config["dropout"])

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = config["ch"] * config["ch_mult"][i_level]
            for i_block in range(config["num_res_blocks"] + 1):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=temb_ch, dropout=config["dropout"]))
                block_in = block_out
                if curr_res in config["attn_resolutions"]:
                    attn.append(AttnBlock(block_in))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, with_conv=True)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, config["out_ch"], kernel_size=3, stride=1, padding=1)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        temb = None  # timestep embedding

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.config["num_res_blocks"] + 1):
                h = self.up[i_level].block[i_block](h, temb)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


def nonlinearity(x: torch.Tensor) -> torch.Tensor:
    # swish
    return x * torch.sigmoid(x)


def Normalize(in_channels: int) -> nn.Module:
    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)


class Upsample(nn.Module):
    def __init__(self, in_channels: int, with_conv: bool) -> None:
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if self.with_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, in_channels: int, with_conv: bool) -> None:
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.with_conv:
            pad = (0, 1, 0, 1)
            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels: int, out_channels: int = None, conv_shortcut: bool = False,
                 dropout: float, temb_channels: int = 512) -> None:
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)

        if temb is not None:
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]

        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)

        return x + h


class AttnBlock(nn.Module):
    def __init__(self, in_channels: int) -> None:
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h * w)
        q = q.permute(0, 2, 1)   # b,hw,c
        k = k.reshape(b, c, h * w)  # b,c,hw
        w_ = torch.bmm(q, k)     # b,hw,hw    w[b,i,j] = sum_c q[b,i,c] k[b,c,j]
        w_ = w_ * (int(c) ** (-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h * w)
        w_ = w_.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
        h_ = torch.bmm(v, w_)     # b,c,hw (hw of q)   h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
        h_ = h_.reshape(b, c, h, w)

        h_ = self.proj_out(h_)

        return x + h_
```
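The config keys read by `Encoder` and `Decoder` are easiest to see with a concrete configuration. The values below are illustrative only, not the ones the model trains with:

```python
import torch

from nets import Encoder, Decoder  # assuming the module is importable as-is

config = {
    "resolution": 64, "in_channels": 3, "out_ch": 3, "ch": 64,
    "ch_mult": [1, 1, 2, 2], "num_res_blocks": 2,
    "attn_resolutions": [8], "z_channels": 512, "dropout": 0.0,
}
enc, dec = Encoder(config), Decoder(config)

x = torch.randn(1, 3, 64, 64)
z = enc(x)      # (1, 512, 8, 8): three downsampling stages halve 64 to 8
x_rec = dec(z)  # (1, 3, 64, 64)
```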
iris/src/models/slicer.py
ADDED
```python
import math
from typing import List

import torch
import torch.nn as nn


class Slicer(nn.Module):
    def __init__(self, max_blocks: int, block_mask: torch.Tensor) -> None:
        super().__init__()
        self.block_size = block_mask.size(0)
        self.num_kept_tokens = block_mask.sum().long().item()
        kept_indices = torch.where(block_mask)[0].repeat(max_blocks)
        offsets = torch.arange(max_blocks).repeat_interleave(self.num_kept_tokens)
        self.register_buffer('indices', kept_indices + block_mask.size(0) * offsets)

    def compute_slice(self, num_steps: int, prev_steps: int = 0) -> torch.Tensor:
        total_steps = num_steps + prev_steps
        num_blocks = math.ceil(total_steps / self.block_size)
        indices = self.indices[:num_blocks * self.num_kept_tokens]
        return indices[torch.logical_and(prev_steps <= indices, indices < total_steps)] - prev_steps

    def forward(self, *args, **kwargs):
        raise NotImplementedError


class Head(Slicer):
    def __init__(self, max_blocks: int, block_mask: torch.Tensor, head_module: nn.Module) -> None:
        super().__init__(max_blocks, block_mask)
        assert isinstance(head_module, nn.Module)
        self.head_module = head_module

    def forward(self, x: torch.Tensor, num_steps: int, prev_steps: int) -> torch.Tensor:
        x_sliced = x[:, self.compute_slice(num_steps, prev_steps)]  # x is (B, T, E)
        return self.head_module(x_sliced)


class Embedder(nn.Module):
    def __init__(self, max_blocks: int, block_masks: List[torch.Tensor], embedding_tables: List[nn.Embedding]) -> None:
        super().__init__()
        assert len(block_masks) == len(embedding_tables)
        assert (sum(block_masks) == 1).all()  # block masks form a partition of a block
        self.embedding_dim = embedding_tables[0].embedding_dim
        assert all([e.embedding_dim == self.embedding_dim for e in embedding_tables])
        self.embedding_tables = embedding_tables
        self.slicers = [Slicer(max_blocks, block_mask) for block_mask in block_masks]

    def forward(self, tokens: torch.Tensor, num_steps: int, prev_steps: int) -> torch.Tensor:
        assert tokens.ndim == 2  # x is (B, T)
        output = torch.zeros(*tokens.size(), self.embedding_dim, device=tokens.device)
        for slicer, emb in zip(self.slicers, self.embedding_tables):
            s = slicer.compute_slice(num_steps, prev_steps)
            output[:, s] = emb(tokens[:, s])
        return output
```
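Since `compute_slice` is the crux here, a small worked example (with a hypothetical mask) may help: positions kept within one block repeat across blocks via the precomputed `indices` buffer, and the returned slice is re-based to the current forward pass.

```python
import torch

from slicer import Slicer  # assuming the module is importable as-is

mask = torch.tensor([1, 0, 1], dtype=torch.bool)  # keep positions 0 and 2 of each 3-token block
s = Slicer(max_blocks=4, block_mask=mask)

print(s.indices)                                   # tensor([ 0,  2,  3,  5,  6,  8,  9, 11])
print(s.compute_slice(num_steps=6))                # tensor([0, 2, 3, 5]): kept positions in steps [0, 6)
print(s.compute_slice(num_steps=4, prev_steps=4))  # tensor([1, 2]): absolute positions 5 and 6, minus prev_steps
```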
iris/src/models/transformer.py
ADDED
```python
"""
Credits to https://github.com/karpathy/minGPT
"""

import math
from typing import Optional

from einops import rearrange
import torch
import torch.nn as nn
from torch.nn import functional as F

from .kv_caching import KeysValues, KVCache


class Transformer(nn.Module):
    def __init__(self, config: dict) -> None:
        super().__init__()
        self.config = config
        self.config["max_tokens"] = config["tokens_per_block"] * config["max_blocks"]
        self.drop = nn.Dropout(config["embed_pdrop"])
        self.blocks = nn.ModuleList([Block(config) for _ in range(config["num_layers"])])
        self.ln_f = nn.LayerNorm(config["embed_dim"])

    def generate_empty_keys_values(self, n: int, max_tokens: int) -> KeysValues:
        device = self.ln_f.weight.device  # Assumption that all submodules are on the same device
        return KeysValues(n, self.config["num_heads"], max_tokens, self.config["embed_dim"], self.config["num_layers"], device)

    def forward(self, sequences: torch.Tensor, past_keys_values: Optional[KeysValues] = None) -> torch.Tensor:
        assert past_keys_values is None or len(past_keys_values) == len(self.blocks)
        x = self.drop(sequences)
        for i, block in enumerate(self.blocks):
            x = block(x, None if past_keys_values is None else past_keys_values[i])

        x = self.ln_f(x)
        return x


class Block(nn.Module):
    def __init__(self, config: dict) -> None:
        super().__init__()
        self.ln1 = nn.LayerNorm(config["embed_dim"])
        self.ln2 = nn.LayerNorm(config["embed_dim"])
        self.attn = SelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config["embed_dim"], 4 * config["embed_dim"]),
            nn.GELU(),
            nn.Linear(4 * config["embed_dim"], config["embed_dim"]),
            nn.Dropout(config["resid_pdrop"]),
        )

    def forward(self, x: torch.Tensor, past_keys_values: Optional[KVCache] = None) -> torch.Tensor:
        x_attn = self.attn(self.ln1(x), past_keys_values)
        x = x + x_attn
        x = x + self.mlp(self.ln2(x))
        return x


class SelfAttention(nn.Module):
    def __init__(self, config: dict) -> None:
        super().__init__()
        assert config["embed_dim"] % config["num_heads"] == 0
        assert config["attention"] in ('causal', 'block_causal')
        self.num_heads = config["num_heads"]
        self.key = nn.Linear(config["embed_dim"], config["embed_dim"])
        self.query = nn.Linear(config["embed_dim"], config["embed_dim"])
        self.value = nn.Linear(config["embed_dim"], config["embed_dim"])
        self.attn_drop = nn.Dropout(config["attn_pdrop"])
        self.resid_drop = nn.Dropout(config["resid_pdrop"])
        self.proj = nn.Linear(config["embed_dim"], config["embed_dim"])

        causal_mask = torch.tril(torch.ones(config["max_tokens"], config["max_tokens"]))
        block_causal_mask = torch.max(causal_mask, torch.block_diag(*[torch.ones(config["tokens_per_block"], config["tokens_per_block"]) for _ in range(config["max_blocks"])]))
        self.register_buffer('mask', causal_mask if config["attention"] == 'causal' else block_causal_mask)

    def forward(self, x: torch.Tensor, kv_cache: Optional[KVCache] = None) -> torch.Tensor:
        B, T, C = x.size()
        if kv_cache is not None:
            b, nh, L, c = kv_cache.shape
            assert nh == self.num_heads and b == B and c * nh == C
        else:
            L = 0

        q = self.query(x).view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)  # (B, nh, T, hs)
        k = self.key(x).view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)    # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)  # (B, nh, T, hs)

        if kv_cache is not None:
            kv_cache.update(k, v)
            k, v = kv_cache.get()

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[L:L + T, :L + T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v
        y = rearrange(y, 'b h t e -> b t (h e)')

        y = self.resid_drop(self.proj(y))

        return y
```
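A sketch of incremental decoding with the cache, using a small hypothetical config. Note that `Transformer.__init__` writes `max_tokens` into the shared config dict, which `SelfAttention` then reads:

```python
import torch

from transformer import Transformer  # assuming the module is importable as-is

config = {
    "tokens_per_block": 4, "max_blocks": 5, "attention": "causal",
    "num_layers": 2, "num_heads": 2, "embed_dim": 8,
    "embed_pdrop": 0.0, "resid_pdrop": 0.0, "attn_pdrop": 0.0,
}
model = Transformer(config).eval()

kvs = model.generate_empty_keys_values(n=1, max_tokens=config["max_tokens"])  # max_tokens == 20
prompt = torch.randn(1, 3, 8)              # 3 already-embedded tokens
out = model(prompt, past_keys_values=kvs)  # caches k/v for all 3 positions
step = model(torch.randn(1, 1, 8), past_keys_values=kvs)  # 1 new token reuses the cache
print(kvs.size)  # 4
```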
iris/src/tokenizer.py
ADDED
```python
"""
Credits to https://github.com/CompVis/taming-transformers
"""

from typing import Tuple

from einops import rearrange
import torch
import torch.nn as nn

from models.lpips import LPIPS
from models.nets import Encoder, Decoder


class Tokenizer(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int, encoder: Encoder, decoder: Decoder, with_lpips: bool = True) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.encoder = encoder
        self.pre_quant_conv = torch.nn.Conv2d(encoder.config["z_channels"], embed_dim, 1)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, decoder.config["z_channels"], 1)
        self.decoder = decoder
        self.embedding.weight.data.uniform_(-1.0 / vocab_size, 1.0 / vocab_size)
        self.lpips = LPIPS().eval() if with_lpips else None

    def __repr__(self) -> str:
        return "tokenizer"

    def forward(self, x: torch.Tensor, should_preprocess: bool = False, should_postprocess: bool = False) -> Tuple[torch.Tensor]:
        outputs = self.encode(x, should_preprocess)
        decoder_input = outputs["z"] + (outputs["z_quantized"] - outputs["z"]).detach()  # straight-through gradient estimator
        reconstructions = self.decode(decoder_input, should_postprocess)
        return outputs["z"], outputs["z_quantized"], reconstructions

    def encode(self, x: torch.Tensor, should_preprocess: bool = False) -> dict:
        if should_preprocess:
            x = self.preprocess_input(x)
        shape = x.shape  # (..., C, H, W)
        x = x.view(-1, *shape[-3:])
        z = self.encoder(x)
        z = self.pre_quant_conv(z)
        b, e, h, w = z.shape
        z_flattened = rearrange(z, 'b e h w -> (b h w) e')
        dist_to_embeddings = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + torch.sum(self.embedding.weight ** 2, dim=1) - 2 * torch.matmul(z_flattened, self.embedding.weight.t())

        tokens = dist_to_embeddings.argmin(dim=-1)
        z_q = rearrange(self.embedding(tokens), '(b h w) e -> b e h w', b=b, e=e, h=h, w=w).contiguous()

        # Reshape to original
        z = z.reshape(*shape[:-3], *z.shape[1:])
        z_q = z_q.reshape(*shape[:-3], *z_q.shape[1:])
        tokens = tokens.reshape(*shape[:-3], -1)

        return {
            "z": z,
            "z_quantized": z_q,
            "tokens": tokens
        }

    def decode(self, z_q: torch.Tensor, should_postprocess: bool = False) -> torch.Tensor:
        shape = z_q.shape  # (..., E, h, w)
        z_q = z_q.view(-1, *shape[-3:])
        z_q = self.post_quant_conv(z_q)
        rec = self.decoder(z_q)
        rec = rec.reshape(*shape[:-3], *rec.shape[1:])
        if should_postprocess:
            rec = self.postprocess_output(rec)
        return rec

    @torch.no_grad()
    def encode_decode(self, x: torch.Tensor, should_preprocess: bool = False, should_postprocess: bool = False) -> torch.Tensor:
        z_q = self.encode(x, should_preprocess)["z_quantized"]  # encode() returns a dict
        return self.decode(z_q, should_postprocess)

    def preprocess_input(self, x: torch.Tensor) -> torch.Tensor:
        """x is supposed to be channels first and in [0, 1]"""
        return x.mul(2).sub(1)

    def postprocess_output(self, y: torch.Tensor) -> torch.Tensor:
        """y is supposed to be channels first and in [-1, 1]"""
        return y.add(1).div(2)
```
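Putting the pieces together, a round-trip sketch under a hypothetical config (the same illustrative one used above for `nets.py`); `with_lpips=False` avoids the VGG download:

```python
import torch

from models.nets import Encoder, Decoder
from tokenizer import Tokenizer  # assuming iris/src is on sys.path

config = {"resolution": 64, "in_channels": 3, "out_ch": 3, "ch": 64,
          "ch_mult": [1, 1, 2, 2], "num_res_blocks": 2,
          "attn_resolutions": [8], "z_channels": 512, "dropout": 0.0}
tok = Tokenizer(vocab_size=512, embed_dim=512, encoder=Encoder(config), decoder=Decoder(config), with_lpips=False)

x = torch.rand(2, 3, 64, 64)  # frames in [0, 1]
tokens = tok.encode(x, should_preprocess=True)["tokens"]  # (2, 64): one code id per cell of the 8x8 latent grid
rec = tok.encode_decode(x, should_preprocess=True, should_postprocess=True)  # (2, 3, 64, 64), back in [0, 1]
```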
iris/src/world_model.py
ADDED
```python
from typing import Optional, Tuple

from einops import rearrange
import torch
import torch.nn as nn

from models.kv_caching import KeysValues
from models.slicer import Embedder, Head
from models.transformer import Transformer


class WorldModel(nn.Module):
    def __init__(self, obs_vocab_size: int, act_vocab_size: int, config: dict) -> None:
        super().__init__()
        self.obs_vocab_size, self.act_vocab_size = obs_vocab_size, act_vocab_size
        self.config = config
        self.transformer = Transformer(config)

        all_but_last_obs_tokens_pattern = torch.ones(config["tokens_per_block"])
        all_but_last_obs_tokens_pattern[-2] = 0  # the last observation token of a block precedes the action token
        act_tokens_pattern = torch.zeros(self.config["tokens_per_block"])
        act_tokens_pattern[-1] = 1
        obs_tokens_pattern = 1 - act_tokens_pattern

        self.pos_emb = nn.Embedding(config["max_tokens"], config["embed_dim"])  # max_tokens was set by Transformer.__init__

        self.embedder = Embedder(
            max_blocks=config["max_blocks"],
            block_masks=[act_tokens_pattern, obs_tokens_pattern],
            embedding_tables=nn.ModuleList([nn.Embedding(act_vocab_size, config["embed_dim"]), nn.Embedding(obs_vocab_size, config["embed_dim"])])
        )

        self.head_observations = Head(
            max_blocks=config["max_blocks"],
            block_mask=all_but_last_obs_tokens_pattern,
            head_module=nn.Sequential(
                nn.Linear(config["embed_dim"], config["embed_dim"]),
                nn.ReLU(),
                nn.Linear(config["embed_dim"], obs_vocab_size)
            )
        )

        self.head_rewards = Head(
            max_blocks=config["max_blocks"],
            block_mask=act_tokens_pattern,
            head_module=nn.Sequential(
                nn.Linear(config["embed_dim"], config["embed_dim"]),
                nn.ReLU(),
                nn.Linear(config["embed_dim"], 3)
            )
        )

        self.head_ends = Head(
            max_blocks=config["max_blocks"],
            block_mask=act_tokens_pattern,
            head_module=nn.Sequential(
                nn.Linear(config["embed_dim"], config["embed_dim"]),
                nn.ReLU(),
                nn.Linear(config["embed_dim"], 2)
            )
        )

    def __repr__(self) -> str:
        return "world_model"

    def forward(self, tokens: torch.LongTensor, past_keys_values: Optional[KeysValues] = None) -> dict:
        num_steps = tokens.size(1)  # (B, T)
        assert num_steps <= self.config["max_tokens"]
        prev_steps = 0 if past_keys_values is None else past_keys_values.size

        sequences = self.embedder(tokens, num_steps, prev_steps) + self.pos_emb(prev_steps + torch.arange(num_steps, device=tokens.device))

        x = self.transformer(sequences, past_keys_values)

        logits_observations = self.head_observations(x, num_steps=num_steps, prev_steps=prev_steps)
        logits_rewards = self.head_rewards(x, num_steps=num_steps, prev_steps=prev_steps)
        logits_ends = self.head_ends(x, num_steps=num_steps, prev_steps=prev_steps)
        return {
            "output_sequence": x,
            "logits_observations": logits_observations,
            "logits_rewards": logits_rewards,
            "logits_ends": logits_ends
        }

    def compute_labels_world_model(self, obs_tokens: torch.Tensor, rewards: torch.Tensor, ends: torch.Tensor, mask_padding: torch.BoolTensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        assert torch.all(ends.sum(dim=1) <= 1)  # at most 1 done
        mask_fill = torch.logical_not(mask_padding)
        labels_observations = rearrange(obs_tokens.masked_fill(mask_fill.unsqueeze(-1).expand_as(obs_tokens), -100), 'b t k -> b (t k)')[:, 1:]
        labels_rewards = (rewards.sign() + 1).masked_fill(mask_fill, -100).long()  # Rewards clipped to {-1, 0, 1}
        labels_ends = ends.masked_fill(mask_fill, -100)
        return labels_observations.reshape(-1), labels_rewards.reshape(-1), labels_ends.reshape(-1)
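```

Finally, a forward-pass sketch with hypothetical sizes: each block here is 16 observation tokens followed by 1 action token, so `tokens_per_block=17`. The token ids below are drawn from the smaller (action) vocabulary purely so one random tensor is valid for both embedding tables.

```python
import torch

from world_model import WorldModel  # assuming iris/src is on sys.path

config = {"tokens_per_block": 17, "max_blocks": 2, "attention": "causal",
          "num_layers": 2, "num_heads": 2, "embed_dim": 32,
          "embed_pdrop": 0.0, "resid_pdrop": 0.0, "attn_pdrop": 0.0}
wm = WorldModel(obs_vocab_size=512, act_vocab_size=4, config=config)

tokens = torch.randint(0, 4, (1, 34))  # 2 full blocks of 17 tokens
out = wm(tokens)
print(out["logits_observations"].shape)  # (1, 32, 512): 16 next-obs-token predictions per block
print(out["logits_rewards"].shape)       # (1, 2, 3): one {-1, 0, 1} reward logit triple per action token
print(out["logits_ends"].shape)          # (1, 2, 2): one done/not-done prediction per action token
```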