Xsmos committed
Commit 84a4faa · verified · 1 Parent(s): 713c506
Files changed (3)
  1. context_unet.py +543 -0
  2. diffusion.ipynb +623 -711
  3. load_h5.py +101 -12
context_unet.py ADDED
@@ -0,0 +1,543 @@
1
+ # from dataclasses import dataclass
2
+ # import h5py
3
+ import torch
4
+ import torch.nn as nn
5
+ # from torch.utils.data import DataLoader, Dataset
6
+ # from datasets import Dataset
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import random
10
+ from abc import ABC, abstractmethod
11
+ import torch.nn.functional as F
12
+ import math
13
+ # from PIL import Image
14
+ import os
15
+ # from torch.utils.tensorboard import SummaryWriter
16
+ import copy
17
+ # from tqdm.auto import tqdm
18
+ # from torchvision import transforms
19
+ # from diffusers import UNet2DModel#, UNet3DConditionModel
20
+ # from diffusers import DDPMScheduler
21
+ # from diffusers.utils import make_image_grid
22
+ import datetime
23
+ # from pathlib import Path
24
+ # from diffusers.optimization import get_cosine_schedule_with_warmup
25
+ # from accelerate import notebook_launcher, Accelerator
26
+ # from huggingface_hub import create_repo, upload_folder
27
+ # from load_h5 import Dataset4h5
28
+
29
+ class GroupNorm32(nn.GroupNorm):
30
+ def __init__(self, num_groups, num_channels, swish, eps=1e-5):
31
+ super().__init__(num_groups=num_groups, num_channels=num_channels, eps=eps)
32
+ self.swish = swish
33
+
34
+ def forward(self, x):
35
+ y = super().forward(x.float()).to(x.dtype)
36
+ if self.swish == 1.0:
37
+ y = F.silu(y)
38
+ elif self.swish:
39
+ y = y * F.sigmoid(y * float(self.swish))
40
+ return y
41
+
42
+ def normalization(channels, swish=0.0):
43
+ """
44
+ Make a standard normalization layer, with an optional swish activation.
45
+
46
+ :param channels: number of input channels.
47
+ :return: an nn.Module for normalization.
48
+ """
49
+ #print (channels)
50
+ return GroupNorm32(num_channels=channels, num_groups=32, swish=swish)
51
+
52
+ Conv = {
53
+ 1: nn.Conv1d,
54
+ 2: nn.Conv2d,
55
+ 3: nn.Conv3d,
56
+ }
57
+
58
+ AvgPool = {
59
+ 1: nn.AvgPool1d,
60
+ 2: nn.AvgPool2d,
61
+ 3: nn.AvgPool3d
62
+ }
63
+
64
+ class Downsample(nn.Module):
65
+ def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):
66
+ super().__init__()
67
+ self.channels = channels
68
+ self.out_channels = out_channels or channels
69
+ # stride = config.stride
70
+ if use_conv:
71
+ # print("conv")
72
+ self.op = Conv[dim](channels, self.out_channels, 3, stride=stride, padding=1)
73
+ else:
74
+ # print("pool")
75
+ assert channels == self.out_channels
76
+ self.op = AvgPool[dim](kernel_size=stride, stride=stride)
77
+
78
+ def forward(self, x):
79
+ assert x.shape[1] == self.channels
80
+ return self.op(x)
81
+
82
+ class Upsample(nn.Module):
83
+ def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):
84
+ super().__init__()
85
+ self.channels = channels
86
+ self.out_channels = out_channels or channels
87
+ self.use_conv = use_conv
88
+ self.stride = stride
89
+ if self.use_conv:
90
+ self.conv = Conv[dim](self.channels, self.out_channels, 3, padding=1)
91
+
92
+ def forward(self, x):
93
+ assert x.shape[1] == self.channels
94
+ # stride = config.stride
95
+ # print(torch.tensor(x.shape[2:]))
96
+ # print(torch.tensor(stride))
97
+ shape = torch.tensor(x.shape[2:]) * torch.tensor(self.stride)
98
+ shape = tuple(shape.detach().numpy())
99
+ # print(shape)
100
+ x = F.interpolate(x, shape, mode='nearest')
101
+ if self.use_conv:
102
+ x = self.conv(x)
103
+ return x
104
+
105
+ def zero_module(module):
106
+ """
107
+ Zero out the parameters of a module and return it (used to initialize output layers).
108
+ """
109
+ for p in module.parameters():
110
+ p.detach().zero_()
111
+ return module
112
+
113
+ class TimestepBlock(ABC, nn.Module):
114
+ @abstractmethod
115
+ def forward(self, x, emb):
116
+ """
117
+ Apply the module to x, conditioned on the timestep embedding emb.
118
+ """
119
+
120
+ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
121
+ def forward(self, x, emb, encoder_out=None):
122
+ for layer in self:
123
+ if isinstance(layer, TimestepBlock):
124
+ x = layer(x, emb)
125
+ elif isinstance(layer, AttentionBlock):
126
+ x = layer(x, encoder_out)
127
+ else:
128
+ x = layer(x)
129
+ return x
130
+
131
+ class ResBlock(TimestepBlock):
132
+ def __init__(
133
+ self, channels, emb_channels, dropout, out_channels=None, use_conv=False, use_checkpoint=False, use_scale_shift_norm=False, up=False, down=False, dim=2, stride=(2,2),
134
+ ):
135
+ super().__init__()
136
+ self.out_channels = out_channels or channels
137
+ self.use_scale_shift_norm = use_scale_shift_norm
138
+ self.stride = stride
139
+
140
+ self.in_layers = nn.Sequential(
141
+ # nn.BatchNorm2d(channels), # normalize to standard gaussian
142
+ normalization(channels, swish=1.0),
143
+ nn.Identity(),
144
+ Conv[dim](channels, self.out_channels, 3, padding=1),
145
+ )
146
+
147
+ self.updown = up or down
148
+ if up:
149
+ self.h_updown = Upsample(channels, False, dim=dim, stride=stride)
150
+ self.x_updown = Upsample(channels, False, dim=dim, stride=stride)
151
+ elif down:
152
+ self.h_updown = Downsample(channels, False, dim=dim, stride=stride)
153
+ self.x_updown = Downsample(channels, False, dim=dim, stride=stride)
154
+ else:
155
+ self.h_updown = self.x_updown = nn.Identity()
156
+
157
+ self.emb_layers = nn.Sequential(
158
+ nn.SiLU(),
159
+ nn.Linear(
160
+ emb_channels,
161
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
162
+ ),
163
+ )
164
+
165
+ self.out_layers = nn.Sequential(
166
+ # nn.BatchNorm2d(self.out_channels),
167
+ normalization(self.out_channels, swish=0.0 if use_scale_shift_norm else 1.0),
168
+ nn.SiLU() if use_scale_shift_norm else nn.Identity(),
169
+ nn.Dropout(p=dropout),
170
+ zero_module(Conv[dim](self.out_channels, self.out_channels, 3, padding=1)),
171
+ )
172
+
173
+ if self.out_channels == channels:
174
+ self.skip_connection = nn.Identity()
175
+ elif use_conv:
176
+ self.skip_connection = Conv[dim](channels, self.out_channels, 3, padding=1)
177
+ else:
178
+ self.skip_connection = Conv[dim](channels, self.out_channels, 1)
179
+
180
+
181
+ def forward(self, x, emb):
182
+ if self.updown:
183
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
184
+ h = in_rest(x)
185
+ h = self.h_updown(h)
186
+ x = self.x_updown(x)
187
+ h = in_conv(h)
188
+ else:
189
+ h = self.in_layers(x)
190
+ emb_out = self.emb_layers(emb).type(h.dtype)
191
+
192
+ while len(emb_out.shape) < len(h.shape):
193
+ emb_out = emb_out[..., None]
194
+
195
+ if self.use_scale_shift_norm:
196
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
197
+ scale, shift = torch.chunk(emb_out, 2, dim=1)
198
+ h = out_norm(h) * (1+scale) + shift
199
+ h = out_rest(h)
200
+ else:
201
+ h += emb_out
202
+ h = self.out_layers(h)
203
+ # print("ResBlock, torch.unique(h).shape =", torch.unique(h).shape)
204
+ return self.skip_connection(x) + h
205
+
206
+ class QKVAttention(nn.Module):
207
+ def __init__(self, n_heads):
208
+ super().__init__()
209
+ self.n_heads = n_heads
210
+ # print("QKVAttention, self.n_heads =", self.n_heads)
211
+
212
+ def forward(self, qkv, encoder_kv=None):
213
+ bs, width, length = qkv.shape
214
+ assert width % (3*self.n_heads) == 0
215
+ ch = width // (3*self.n_heads)
216
+
217
+ # print("QKVAttention", bs, self.n_heads, ch, length)
218
+ q, k, v = qkv.reshape(bs*self.n_heads, ch*3, length).split(ch, dim=1)
219
+ if encoder_kv is not None:
220
+ assert encoder_kv.shape[1] == self.n_heads * ch * 2
221
+ ek, ev = encoder_kv.reshape(bs*self.n_heads, ch*2, -1).split(ch, dim=1)
222
+ k = torch.cat([ek,k], dim=-1)
223
+ v = torch.cat([ev,v], dim=-1)
224
+
225
+ scale = 1 / math.sqrt(math.sqrt(ch))
226
+ weight = torch.einsum("bct,bcs->bts", q*scale, k*scale)
227
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
228
+
229
+ a = torch.einsum("bts,bcs->bct", weight, v)
230
+ return a.reshape(bs, -1, length)
231
+
232
+ class AttentionBlock(nn.Module):
233
+ def __init__(
234
+ self,
235
+ channels,
236
+ num_heads=1,
237
+ num_head_channels=-1,
238
+ use_checkpoint=False,
239
+ encoder_channels=None,
240
+ ):
241
+ super().__init__()
242
+ self.channels = channels
243
+ if num_head_channels == -1:
244
+ self.num_heads = num_heads
245
+ else:
246
+ assert channels % num_head_channels == 0,\
247
+ f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
248
+ self.num_heads = channels // num_head_channels
249
+
250
+ self.use_checkpoint = use_checkpoint
251
+ # self.norm = nn.BatchNorm2d(channels)
252
+ self.norm = normalization(channels, swish=0.0)
253
+ self.qkv = nn.Conv1d(channels, channels * 3, 1)
254
+
255
+ self.attention = QKVAttention(self.num_heads)
256
+
257
+ if encoder_channels is not None:
258
+ self.encoder_kv = nn.Conv1d(encoder_channels, channels * 2, 1)
259
+ self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
260
+
261
+ def forward(self, x, encoder_out=None):
262
+ b, c, *spatial = x.shape
263
+ qkv = self.qkv(self.norm(x).view(b, c, -1))
264
+ if encoder_out is not None:
265
+ encoder_out = self.encoder_kv(encoder_out)
266
+ h = self.attention(qkv, encoder_out)
267
+ else:
268
+ h = self.attention(qkv)
269
+ # print("AttentionBlock, before proj_out, torch.unique(h).shape =", torch.unique(h).shape)
270
+ h = self.proj_out(h)
271
+ # print("AttentionBlock, after proj_out, torch.unique(h).shape =", torch.unique(h).shape)
272
+ return x + h.reshape(b, c, *spatial)
273
+
274
+ def timestep_embedding(timesteps, dim, max_period=10000):
275
+ """
276
+ Create sinusoidal timestep embeddings.
277
+
278
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
279
+ These may be fractional.
280
+ :param dim: the dimension of the output.
281
+ :param max_period: controls the minimum frequency of the embeddings.
282
+ :return: an [N x dim] Tensor of positional embeddings.
283
+ """
284
+ #print (timesteps.shape)
285
+ half = dim // 2
286
+ freqs = torch.exp(
287
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
288
+ ).to(device=timesteps.device)
289
+ #print (timesteps[:, None].float().shape,freqs[None].shape)
290
+ args = timesteps[:, None].float() * freqs[None]
291
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
292
+ if dim % 2:
293
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
294
+ return embedding
295
+
296
+ class ContextUnet(nn.Module):
297
+ def __init__(
298
+ self,
299
+ n_param=2,
300
+ image_size=64,
301
+ in_channels=1,
302
+ model_channels=128,
303
+ out_channels = 1,
304
+ channel_mult = None,
305
+ num_res_blocks = 2,
306
+ dropout = 0,
307
+ use_checkpoint = False,
308
+ use_scale_shift_norm = False,
309
+ attention_resolutions = (16, 8),
310
+ num_heads = 4,
311
+ num_head_channels = -1,
312
+ num_heads_upsample = -1,
313
+ resblock_updown = False,
314
+ conv_resample = True,
315
+ encoder_channels = None,
316
+ dim = 2,
317
+ stride = (2,2)
318
+ ):
319
+ super().__init__()
320
+
321
+ if channel_mult is None:
322
+ if image_size == 512:
323
+ channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
324
+ elif image_size == 256:
325
+ channel_mult = (1, 1, 2, 2, 4, 4)
326
+ elif image_size == 128:
327
+ channel_mult = (1, 1, 2, 3, 4)
328
+ elif image_size == 64:
329
+ channel_mult = (1, 1, 2, 2, 4, 4)#(1, 2, 3, 4)
330
+ elif image_size == 28:
331
+ channel_mult = (1, 2)#(1, 2, 3, 4)
332
+ else:
333
+ raise ValueError(f"unsupported image size: {image_size}")
334
+ # else:
335
+ # channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(","))
336
+
337
+ attention_ds = []
338
+ for res in attention_resolutions:
339
+ attention_ds.append(image_size // int(res))
340
+
341
+ # print("before, ContextUnet, num_heads_upsample =", num_heads_upsample, "num_heads =", num_heads)
342
+ if num_heads_upsample == -1:
343
+ num_heads_upsample = num_heads
344
+ # print("after, ContextUnet, num_heads_upsample =", num_heads_upsample, "num_heads =", num_heads)
345
+
346
+ # self.n_param = n_param
347
+ self.model_channels = model_channels
348
+ self.dtype = torch.float32
349
+
350
+ self.token_embedding = nn.Linear(n_param, model_channels * 4)
351
+
352
+ time_embed_dim = model_channels * 4
353
+ self.time_embed = nn.Sequential(
354
+ nn.Linear(model_channels, time_embed_dim),
355
+ nn.SiLU(),
356
+ nn.Linear(time_embed_dim, time_embed_dim),
357
+ )
358
+
359
+ ch = input_ch = int(channel_mult[0] * model_channels)
360
+
361
+ ###################### input_blocks ######################
362
+ self.input_blocks = nn.ModuleList(
363
+ [TimestepEmbedSequential(Conv[dim](in_channels, ch, 3, padding=1))]
364
+ )
365
+ self._feature_size = ch
366
+ input_block_chans = [ch]
367
+ ds = 1
368
+
369
+ for level, mult in enumerate(channel_mult):
370
+ for _ in range(num_res_blocks):
371
+ layers = [
372
+ ResBlock(
373
+ ch,
374
+ time_embed_dim,
375
+ dropout,
376
+ out_channels = int(mult * model_channels),
377
+ use_checkpoint = use_checkpoint,
378
+ use_scale_shift_norm = use_scale_shift_norm,
379
+ dim = dim,
380
+ stride = stride,
381
+ )
382
+ ]
383
+ ch = int(mult * model_channels)
384
+ if ds in attention_ds:
385
+ layers.append(
386
+ AttentionBlock(
387
+ ch,
388
+ use_checkpoint=use_checkpoint,
389
+ num_heads = num_heads,
390
+ num_head_channels = num_head_channels,
391
+ encoder_channels = encoder_channels,
392
+ )
393
+ )
394
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
395
+ self._feature_size += ch
396
+ input_block_chans.append(ch)
397
+
398
+ if level != len(channel_mult) - 1:
399
+ out_ch = ch
400
+ self.input_blocks.append(
401
+ TimestepEmbedSequential(
402
+ ResBlock(
403
+ ch,
404
+ time_embed_dim,
405
+ dropout,
406
+ out_channels=out_ch,
407
+ # dims=dims,
408
+ use_checkpoint=use_checkpoint,
409
+ use_scale_shift_norm=use_scale_shift_norm,
410
+ down=True,
411
+ dim = dim,
412
+ stride = stride,
413
+ )
414
+ if resblock_updown
415
+ else Downsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)
416
+ )
417
+ )
418
+ ch = out_ch
419
+ input_block_chans.append(ch)
420
+ ds *= 2
421
+ self._feature_size += ch
422
+
423
+
424
+ ###################### middle_blocks ######################
425
+ self.middle_block = TimestepEmbedSequential(
426
+ ResBlock(
427
+ ch,
428
+ time_embed_dim,
429
+ dropout,
430
+ use_checkpoint=use_checkpoint,
431
+ use_scale_shift_norm=use_scale_shift_norm,
432
+ dim = dim,
433
+ stride = stride,
434
+ ),
435
+ AttentionBlock(
436
+ ch,
437
+ use_checkpoint=use_checkpoint,
438
+ num_heads=num_heads,
439
+ num_head_channels=num_head_channels,
440
+ encoder_channels=encoder_channels,
441
+ ),
442
+ ResBlock(
443
+ ch,
444
+ time_embed_dim,
445
+ dropout,
446
+ use_checkpoint=use_checkpoint,
447
+ use_scale_shift_norm=use_scale_shift_norm,
448
+ dim = dim,
449
+ stride = stride,
450
+ ),
451
+ )
452
+ self._feature_size += ch
453
+
454
+
455
+ ###################### output_blocks ######################
456
+ self.output_blocks = nn.ModuleList([])
457
+ for level, mult in list(enumerate(channel_mult))[::-1]:
458
+ for i in range(num_res_blocks + 1):
459
+ ich = input_block_chans.pop()
460
+ layers = [
461
+ ResBlock(
462
+ ch + ich,
463
+ time_embed_dim,
464
+ dropout,
465
+ out_channels=int(model_channels * mult),
466
+ # dims=dims,
467
+ use_checkpoint=use_checkpoint,
468
+ use_scale_shift_norm=use_scale_shift_norm,
469
+ dim = dim,
470
+ stride = stride,
471
+ )
472
+ ]
473
+ ch = int(model_channels * mult)
474
+ if ds in attention_ds:
475
+ # print("ds in attention_resolutions, num_heads=", num_heads_upsample)
476
+ layers.append(
477
+ AttentionBlock(
478
+ ch,
479
+ use_checkpoint=use_checkpoint,
480
+ num_heads=num_heads_upsample,
481
+ num_head_channels=num_head_channels,
482
+ encoder_channels=encoder_channels,
483
+ )
484
+ )
485
+ if level and i == num_res_blocks:
486
+ out_ch = ch
487
+ layers.append(
488
+ ResBlock(
489
+ ch,
490
+ time_embed_dim,
491
+ dropout,
492
+ out_channels=out_ch,
493
+ # dims=dims,
494
+ use_checkpoint=use_checkpoint,
495
+ use_scale_shift_norm=use_scale_shift_norm,
496
+ up=True,
497
+ dim = dim,
498
+ stride = stride,
499
+ )
500
+ if resblock_updown
501
+ else Upsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)
502
+ )
503
+ ds //= 2
504
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
505
+ self._feature_size += ch
506
+
507
+ self.out = nn.Sequential(
508
+ # nn.BatchNorm2d(ch),
509
+ normalization(ch, swish=1.0),
510
+ nn.Identity(),
511
+ zero_module(Conv[dim](input_ch, out_channels, 3, padding=1)),
512
+ )
513
+ # self.use_fp16 = use_fp16
514
+
515
+ def forward(self, x, timesteps, y=None):
516
+ hs = []
517
+ emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
518
+ if y is not None:
519
+ text_outputs = self.token_embedding(y.float())
520
+ emb = emb + text_outputs.to(emb)
521
+
522
+ h = x.type(self.dtype)
523
+ # print("0,h.shape =", h.shape)
524
+ for module in self.input_blocks:
525
+ h = module(h, emb)
526
+ hs.append(h)
527
+ # print("module encoder, h.shape =", h.shape)
528
+ # print("2,h.shape =", h.shape)
529
+ h = self.middle_block(h, emb)
530
+ # print("middle block, h.shape =", h.shape)
531
+ # print("2,h.shape =", h.shape)
532
+ for module in self.output_blocks:
533
+ # print("for module in self.output_blocks, h.shape =", h.shape)
534
+ # print("len(hs) =", len(hs), ", hs[-1].shape =", hs[-1].shape)
535
+ h = torch.cat([h, hs.pop()], dim=1)
536
+ h = module(h, emb)
537
+ # print("module decoder, h.shape =", h.shape)
538
+
539
+ h = h.type(x.dtype)
540
+ h = self.out(h)
541
+ # print("self.out(h)", "h.shape =", h.shape)
542
+
543
+ return h
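For reference, a minimal smoke test of the ContextUnet added above could look like the sketch below. It is not part of the commit: the batch size, image_size=28 and model_channels=32 are illustrative assumptions chosen so the model builds quickly (model_channels must stay divisible by 32 because GroupNorm32 uses 32 groups), and y stands in for the two rescaled astrophysical parameters (ION_Tvir_MIN, HII_EFF_FACTOR) used in diffusion.ipynb.

# hypothetical smoke test for context_unet.py (not part of this commit)
import torch
from context_unet import ContextUnet

model = ContextUnet(
    n_param=2,          # two conditioning parameters (ION_Tvir_MIN, HII_EFF_FACTOR)
    image_size=28,      # selects channel_mult = (1, 2) internally
    in_channels=1,
    model_channels=32,  # kept divisible by 32 for GroupNorm32
    out_channels=1,
    dim=2,
    stride=(2, 2),
)

x = torch.randn(4, 1, 28, 28)     # batch of noisy 2D lightcone slices
t = torch.randint(0, 1000, (4,))  # one diffusion timestep per sample
y = torch.rand(4, 2)              # parameters already rescaled to [0, 1]

eps = model(x, t, y)              # predicted noise, same shape as x
assert eps.shape == x.shape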
diffusion.ipynb CHANGED
@@ -33,7 +33,7 @@
33
  {
34
  "data": {
35
  "application/vnd.jupyter.widget-view+json": {
36
- "model_id": "4f2bbf6f5e904828bc65afc7ad97df36",
37
  "version_major": 2,
38
  "version_minor": 0
39
  },
@@ -81,7 +81,10 @@
81
  "from pathlib import Path\n",
82
  "from diffusers.optimization import get_cosine_schedule_with_warmup\n",
83
  "from accelerate import notebook_launcher, Accelerator\n",
84
- "from huggingface_hub import create_repo, upload_folder"
 
 
 
85
  ]
86
  },
87
  {
@@ -99,95 +102,95 @@
99
  "metadata": {},
100
  "outputs": [],
101
  "source": [
102
- "class Dataset4h5(Dataset):\n",
103
- " def __init__(self, dir_name, num_image=10, field='brightness_temp', shuffle=True, idx=None, num_redshift=32, HII_DIM=32, rescale=True, drop_prob = 0, dim=2):\n",
104
- " super().__init__()\n",
105
  " \n",
106
- " self.dir_name = dir_name\n",
107
- " self.num_image = num_image\n",
108
- " self.field = field\n",
109
- " self.shuffle = shuffle\n",
110
- " self.idx = idx\n",
111
- " self.num_redshift = num_redshift\n",
112
- " self.HII_DIM = HII_DIM\n",
113
- " self.drop_prob = drop_prob\n",
114
- " self.dim = dim\n",
115
- "\n",
116
- " self.load_h5()\n",
117
- " if rescale:\n",
118
- " self.images = self.rescale(self.images, to=[-1,1])\n",
119
- " self.params = self.rescale(self.params, to=[0,1])\n",
120
- "\n",
121
- " self.len = len(self.params)\n",
122
- " self.images = torch.from_numpy(self.images)\n",
123
- " print(f\"images rescaled to [{self.images.min()}, {self.images.max()}]\")\n",
124
- "\n",
125
- " cond_filter = torch.bernoulli(torch.ones(len(self.params),1)-self.drop_prob).repeat(1,self.params.shape[1]).numpy()\n",
126
- " self.params = torch.from_numpy(self.params*cond_filter)\n",
127
- " print(f\"params rescaled to [{self.params.min()}, {self.params.max()}]\")\n",
128
- "\n",
129
- " def load_h5(self):\n",
130
- " with h5py.File(self.dir_name, 'r') as f:\n",
131
- " print(f\"dataset content: {f.keys()}\")\n",
132
- " max_num_image = len(f['brightness_temp'])#.shape[0]\n",
133
- " print(f\"{max_num_image} images can be loaded\")\n",
134
- " field_shape = f['brightness_temp'].shape[1:]\n",
135
- " print(f\"field.shape = {field_shape}\")\n",
136
- " self.params_keys = list(f['params']['keys'])\n",
137
- " print(f\"params keys = {self.params_keys}\")\n",
138
- "\n",
139
- " if self.idx is None:\n",
140
- " if self.shuffle:\n",
141
- " self.idx = np.sort(random.sample(range(max_num_image), self.num_image))\n",
142
- " print(f\"loading {self.num_image} images randomly\")\n",
143
- " # print(self.idx)\n",
144
- " else:\n",
145
- " self.idx = range(self.num_image)\n",
146
- " print(f\"loading {len(self.idx)} images with idx = {self.idx}\")\n",
147
- " else:\n",
148
- " print(f\"loading {len(self.idx)} images with idx = {self.idx}\")\n",
149
- "\n",
150
- " if self.dim == 2:\n",
151
- " self.images = f[self.field][self.idx,0,:self.HII_DIM,-self.num_redshift:][:,None]\n",
152
- " elif self.dim == 3:\n",
153
- " self.images = f[self.field][self.idx,:self.HII_DIM,:self.HII_DIM,-self.num_redshift:][:,None]\n",
154
- " print(f\"images loaded:\", self.images.shape)\n",
155
- "\n",
156
- " self.params = f['params']['values'][self.idx]\n",
157
- " print(\"params loaded:\", self.params.shape)\n",
158
  " \n",
159
- " # plt.imshow(self.images[0,0,0])\n",
160
- " # plt.show()\n",
161
- "\n",
162
- " def rescale(self, value, to: list):\n",
163
- " # print(np.ndim(value))\n",
164
- " if np.ndim(value)==2:\n",
165
- " # print(f\"rescale params of shape {value.shape}\")\n",
166
- " ranges = \\\n",
167
- " {\n",
168
- " 0: [4, 6], # ION_Tvir_MIN\n",
169
- " 1: [10, 250], # HII_EFF_FACTOR\n",
170
- " }\n",
171
- " # elif np.ndim(value)==5: \n",
172
- " else: \n",
173
- " # value = np.array(value)\n",
174
- " # print(f\"rescale images of shape {np.shape(value)}\")\n",
175
- " ranges = \\\n",
176
- " {\n",
177
- " 0: [0, 80], # brightness_temp\n",
178
- " }\n",
179
- " # print(f\"value.min = {value.min()}, value.max = {value.max()}\")\n",
180
- " for i in range(np.shape(value)[1]):\n",
181
- " value[:,i] = (value[:,i] - ranges[i][0]) / (ranges[i][1]-ranges[i][0])\n",
182
- " # print(f\"value.min = {value.min()}, value.max = {value.max()}\")\n",
183
- " value = value * (to[1]-to[0]) + to[0]\n",
184
- " return value \n",
185
- "\n",
186
- " def __getitem__(self, index):\n",
187
- " return self.images[index], self.params[index]\n",
188
- "\n",
189
- " def __len__(self):\n",
190
- " return self.len"
191
  ]
192
  },
193
  {
@@ -346,596 +349,526 @@
346
  "metadata": {},
347
  "outputs": [],
348
  "source": [
349
- "class GroupNorm32(nn.GroupNorm):\n",
350
- " def __init__(self, num_groups, num_channels, swish, eps=1e-5):\n",
351
- " super().__init__(num_groups=num_groups, num_channels=num_channels, eps=eps)\n",
352
- " self.swish = swish\n",
353
- "\n",
354
- " def forward(self, x):\n",
355
- " y = super().forward(x.float()).to(x.dtype)\n",
356
- " if self.swish == 1.0:\n",
357
- " y = F.silu(y)\n",
358
- " elif self.swish:\n",
359
- " y = y * F.sigmoid(y * float(self.swish))\n",
360
- " return y\n",
361
- "\n",
362
- "def normalization(channels, swish=0.0):\n",
363
- " \"\"\"\n",
364
- " Make a standard normalization layer, with an optional swish activation.\n",
365
- "\n",
366
- " :param channels: number of input channels.\n",
367
- " :return: an nn.Module for normalization.\n",
368
- " \"\"\"\n",
369
- " #print (channels)\n",
370
- " return GroupNorm32(num_channels=channels, num_groups=32, swish=swish)\n",
371
- "\n",
372
- "Conv = {\n",
373
- " 1: nn.Conv1d,\n",
374
- " 2: nn.Conv2d,\n",
375
- " 3: nn.Conv3d,\n",
376
- "}\n",
377
- "\n",
378
- "AvgPool = {\n",
379
- " 1: nn.AvgPool1d,\n",
380
- " 2: nn.AvgPool2d,\n",
381
- " 3: nn.AvgPool3d\n",
382
- "}"
383
- ]
384
- },
385
- {
386
- "cell_type": "code",
387
- "execution_count": 9,
388
- "metadata": {},
389
- "outputs": [],
390
- "source": [
391
- "class Downsample(nn.Module):\n",
392
- " def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):\n",
393
- " super().__init__()\n",
394
- " self.channels = channels\n",
395
- " self.out_channels = out_channels or channels\n",
396
- " # stride = config.stride\n",
397
- " if use_conv:\n",
398
- " # print(\"conv\")\n",
399
- " self.op = Conv[dim](channels, self.out_channels, 3, stride=stride, padding=1)\n",
400
- " else:\n",
401
- " # print(\"pool\")\n",
402
- " assert channels == self.out_channels\n",
403
- " self.op = AvgPool[dim](kernel_size=stride, stride=stride)\n",
404
- "\n",
405
- " def forward(self, x):\n",
406
- " assert x.shape[1] == self.channels\n",
407
- " return self.op(x)"
408
- ]
409
- },
410
- {
411
- "cell_type": "code",
412
- "execution_count": 10,
413
- "metadata": {},
414
- "outputs": [],
415
- "source": [
416
- "class Upsample(nn.Module):\n",
417
- " def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):\n",
418
- " super().__init__()\n",
419
- " self.channels = channels\n",
420
- " self.out_channels = out_channels\n",
421
- " self.use_conv = use_conv\n",
422
- " self.stride = stride\n",
423
- " if self.use_conv:\n",
424
- " self.conv = Conv[dim](self.channels, self.out_channels, 3, padding=1)\n",
425
- "\n",
426
- " def forward(self, x):\n",
427
- " assert x.shape[1] == self.channels\n",
428
- " # stride = config.stride\n",
429
- " # print(torch.tensor(x.shape[2:]))\n",
430
- " # print(torch.tensor(stride))\n",
431
- " shape = torch.tensor(x.shape[2:]) * torch.tensor(self.stride)\n",
432
- " shape = tuple(shape.detach().numpy())\n",
433
- " # print(shape)\n",
434
- " x = F.interpolate(x, shape, mode='nearest')\n",
435
- " if self.use_conv:\n",
436
- " x = self.conv(x)\n",
437
- " return x"
438
- ]
439
- },
440
- {
441
- "cell_type": "code",
442
- "execution_count": 11,
443
- "metadata": {},
444
- "outputs": [],
445
- "source": [
446
- "def zero_module(module):\n",
447
- " \"\"\"\n",
448
- " clean gradient of parameters of the module\n",
449
- " \"\"\"\n",
450
- " for p in module.parameters():\n",
451
- " p.detach().zero_()\n",
452
- " return module"
453
- ]
454
- },
455
- {
456
- "cell_type": "code",
457
- "execution_count": 12,
458
- "metadata": {},
459
- "outputs": [],
460
- "source": [
461
- "class TimestepBlock(ABC, nn.Module):\n",
462
- " @abstractmethod\n",
463
- " def forward(self, x, emb):\n",
464
- " \"\"\"\n",
465
- " test\n",
466
- " \"\"\""
467
- ]
468
- },
469
- {
470
- "cell_type": "code",
471
- "execution_count": 13,
472
- "metadata": {},
473
- "outputs": [],
474
- "source": [
475
- "class TimestepEmbedSequential(nn.Sequential, TimestepBlock):\n",
476
- " def forward(self, x, emb, encoder_out=None):\n",
477
- " for layer in self:\n",
478
- " if isinstance(layer, TimestepBlock):\n",
479
- " x = layer(x, emb)\n",
480
- " elif isinstance(layer, AttentionBlock):\n",
481
- " x = layer(x, encoder_out)\n",
482
- " else:\n",
483
- " x = layer(x)\n",
484
- " return x"
485
- ]
486
- },
487
- {
488
- "cell_type": "code",
489
- "execution_count": 14,
490
- "metadata": {},
491
- "outputs": [],
492
- "source": [
493
- "class ResBlock(TimestepBlock):\n",
494
- " def __init__(\n",
495
- " self, channels, emb_channels, dropout, out_channels=None, use_conv=False, use_checkpoint=False, use_scale_shift_norm=False, up=False, down=False, dim=2, stride=(2,2),\n",
496
- " ):\n",
497
- " super().__init__()\n",
498
- " self.out_channels = out_channels or channels\n",
499
- " self.use_scale_shift_norm = use_scale_shift_norm\n",
500
- " self.stride = stride\n",
501
- "\n",
502
- " self.in_layers = nn.Sequential(\n",
503
- " # nn.BatchNorm2d(channels), # normalize to standard gaussian\n",
504
- " normalization(channels, swish=1.0),\n",
505
- " nn.Identity(),\n",
506
- " Conv[dim](channels, self.out_channels, 3, padding=1),\n",
507
- " )\n",
508
- "\n",
509
- " self.updown = up or down\n",
510
- " if up:\n",
511
- " self.h_updown = Upsample(channels, False, dim=dim, stride=stride)\n",
512
- " self.x_updown = Upsample(channels, False, dim=dim, stride=stride)\n",
513
- " elif down:\n",
514
- " self.h_updown = Downsample(channels, False, dim=dim, stride=stride)\n",
515
- " self.x_updown = Downsample(channels, False, dim=dim, stride=stride)\n",
516
- " else:\n",
517
- " self.h_updown = self.x_updown = nn.Identity()\n",
518
- "\n",
519
- " self.emb_layers = nn.Sequential(\n",
520
- " nn.SiLU(),\n",
521
- " nn.Linear(\n",
522
- " emb_channels,\n",
523
- " 2 * self.out_channels if use_scale_shift_norm else self.out_channels,\n",
524
- " ),\n",
525
- " )\n",
526
- "\n",
527
- " self.out_layers = nn.Sequential(\n",
528
- " # nn.BatchNorm2d(self.out_channels),\n",
529
- " normalization(self.out_channels, swish=0.0 if use_scale_shift_norm else 1.0),\n",
530
- " nn.SiLU() if use_scale_shift_norm else nn.Identity(),\n",
531
- " nn.Dropout(p=dropout),\n",
532
- " zero_module(Conv[dim](self.out_channels, self.out_channels, 3, padding=1)),\n",
533
- " )\n",
534
  "\n",
535
- " if self.out_channels == channels:\n",
536
- " self.skip_connection = nn.Identity()\n",
537
- " elif use_conv:\n",
538
- " self.skip_connection = Conv[dim](channels, self.out_channels, 3, padding=1)\n",
539
- " else:\n",
540
- " self.skip_connection = Conv[dim](channels, self.out_channels, 1)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  " \n",
542
  "\n",
543
- " def forward(self, x, emb):\n",
544
- " if self.updown:\n",
545
- " in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]\n",
546
- " h = in_rest(x)\n",
547
- " h = self.h_updown(h)\n",
548
- " x = self.x_updown(x)\n",
549
- " h = in_conv(h)\n",
550
- " else:\n",
551
- " h = self.in_layers(x)\n",
552
- " emb_out = self.emb_layers(emb).type(h.dtype)\n",
553
  "\n",
554
- " while len(emb_out.shape) < len(h.shape):\n",
555
- " emb_out = emb_out[..., None]\n",
556
  "\n",
557
- " if self.use_scale_shift_norm:\n",
558
- " out_norm, out_rest = self.out_layers[0], self.out_layers[1:]\n",
559
- " scale, shift = torch.chunk(emb_out, 2, dim=1)\n",
560
- " h = out_norm(h) * (1+scale) + shift\n",
561
- " h = out_rest(h)\n",
562
- " else:\n",
563
- " h += emb_out\n",
564
- " h = self.out_layers(h)\n",
565
- " # print(\"ResBlock, torch.unique(h).shape =\", torch.unique(h).shape)\n",
566
- " return self.skip_connection(x) + h"
567
- ]
568
- },
569
- {
570
- "cell_type": "code",
571
- "execution_count": 15,
572
- "metadata": {},
573
- "outputs": [],
574
- "source": [
575
- "class QKVAttention(nn.Module):\n",
576
- " def __init__(self, n_heads):\n",
577
- " super().__init__()\n",
578
- " self.n_heads = n_heads\n",
579
- " # print(\"QKVAttention, self.n_heads =\", self.n_heads)\n",
580
  " \n",
581
- " def forward(self, qkv, encoder_kv=None):\n",
582
- " bs, width, length = qkv.shape\n",
583
- " assert width % (3*self.n_heads) == 0\n",
584
- " ch = width // (3*self.n_heads)\n",
585
- "\n",
586
- " # print(\"QKVAttention\", bs, self.n_heads, ch, length)\n",
587
- " q, k, v = qkv.reshape(bs*self.n_heads, ch*3, length).split(ch, dim=1)\n",
588
- " if encoder_kv is not None:\n",
589
- " assert encoder_kv.shape[1] == self.n_heads * ch * 2\n",
590
- " ek, ev = encoder_kv.reshape(bs*self.n_heads, ch*2, -1).split(ch, dim=1)\n",
591
- " k = torch.cat([ek,k], dim=-1)\n",
592
- " v = torch.cat([ev,v], dim=-1)\n",
593
- "\n",
594
- " scale = 1 / math.sqrt(math.sqrt(ch))\n",
595
- " weight = torch.einsum(\"bct,bcs->bts\", q*scale, k*scale)\n",
596
- " weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)\n",
597
- "\n",
598
- " a = torch.einsum(\"bts,bcs->bct\", weight, v)\n",
599
- " return a.reshape(bs, -1, length)"
600
- ]
601
- },
602
- {
603
- "cell_type": "code",
604
- "execution_count": 16,
605
- "metadata": {},
606
- "outputs": [],
607
- "source": [
608
- "class AttentionBlock(nn.Module):\n",
609
- " def __init__(\n",
610
- " self,\n",
611
- " channels,\n",
612
- " num_heads=1,\n",
613
- " num_head_channels=-1,\n",
614
- " use_checkpoint=False,\n",
615
- " encoder_channels=None,\n",
616
- " ):\n",
617
- " super().__init__()\n",
618
- " self.channels = channels\n",
619
- " if num_head_channels == -1:\n",
620
- " self.num_heads = num_heads\n",
621
- " else:\n",
622
- " assert channels % num_head_channels == 0,\\\n",
623
- " f\"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}\"\n",
624
- " self.num_heads = channels // num_head_channels\n",
625
- "\n",
626
- " self.use_checkpoint = use_checkpoint\n",
627
- " # self.norm = nn.BatchNorm2d(channels)\n",
628
- " self.norm = normalization(channels, swish=0.0)\n",
629
- " self.qkv = nn.Conv1d(channels, channels * 3, 1)\n",
630
  " \n",
631
- " self.attention = QKVAttention(self.num_heads)\n",
632
- "\n",
633
- " if encoder_channels is not None:\n",
634
- " self.encoder_kv = nn.Conv1d(encoder_channels, channels * 2, 1)\n",
635
- " self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))\n",
636
- "\n",
637
- " def forward(self, x, encoder_out=None):\n",
638
- " b, c, *spatial = x.shape\n",
639
- " qkv = self.qkv(self.norm(x).view(b, c, -1))\n",
640
- " if encoder_out is not None:\n",
641
- " encoder_out = self.encoder_kv(encoder_out)\n",
642
- " h = self.attention(qkv, encoder_out)\n",
643
- " else:\n",
644
- " h = self.attention(qkv)\n",
645
- " # print(\"AttentionBlock, before proj_out, torch.unique(h).shape =\", torch.unique(h).shape)\n",
646
- " h = self.proj_out(h)\n",
647
- " # print(\"AttentionBlock, after proj_out, torch.unique(h).shape =\", torch.unique(h).shape)\n",
648
- " return x + h.reshape(b, c, *spatial)"
649
- ]
650
- },
651
- {
652
- "cell_type": "code",
653
- "execution_count": 17,
654
- "metadata": {},
655
- "outputs": [],
656
- "source": [
657
- "def timestep_embedding(timesteps, dim, max_period=10000):\n",
658
- " \"\"\"\n",
659
- " Create sinusoidal timestep embeddings.\n",
660
- "\n",
661
- " :param timesteps: a 1-D Tensor of N indices, one per batch element.\n",
662
- " These may be fractional.\n",
663
- " :param dim: the dimension of the output.\n",
664
- " :param max_period: controls the minimum frequency of the embeddings.\n",
665
- " :return: an [N x dim] Tensor of positional embeddings.\n",
666
- " \"\"\"\n",
667
- " #print (timesteps.shape)\n",
668
- " half = dim // 2\n",
669
- " freqs = torch.exp(\n",
670
- " -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half\n",
671
- " ).to(device=timesteps.device)\n",
672
- " #print (timesteps[:, None].float().shape,freqs[None].shape)\n",
673
- " args = timesteps[:, None].float() * freqs[None]\n",
674
- " embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)\n",
675
- " if dim % 2:\n",
676
- " embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)\n",
677
- " return embedding"
678
- ]
679
- },
680
- {
681
- "cell_type": "code",
682
- "execution_count": 18,
683
- "metadata": {},
684
- "outputs": [],
685
- "source": [
686
- "class ContextUnet(nn.Module):\n",
687
- " def __init__(\n",
688
- " self,\n",
689
- " n_param=2,\n",
690
- " image_size=64,\n",
691
- " in_channels=1,\n",
692
- " model_channels=128,\n",
693
- " out_channels = 1,\n",
694
- " channel_mult = None,\n",
695
- " num_res_blocks = 2,\n",
696
- " dropout = 0,\n",
697
- " use_checkpoint = False,\n",
698
- " use_scale_shift_norm = False,\n",
699
- " attention_resolutions = (16, 8),\n",
700
- " num_heads = 4,\n",
701
- " num_head_channels = -1,\n",
702
- " num_heads_upsample = -1,\n",
703
- " resblock_updown = False,\n",
704
- " conv_resample = True,\n",
705
- " encoder_channels = None,\n",
706
- " dim = 2,\n",
707
- " stride = (2,2)\n",
708
- " ):\n",
709
- " super().__init__()\n",
710
- "\n",
711
- " if channel_mult == None:\n",
712
- " if image_size == 512:\n",
713
- " channel_mult = (0.5, 1, 1, 2, 2, 4, 4)\n",
714
- " elif image_size == 256:\n",
715
- " channel_mult = (1, 1, 2, 2, 4, 4)\n",
716
- " elif image_size == 128:\n",
717
- " channel_mult = (1, 1, 2, 3, 4)\n",
718
- " elif image_size == 64:\n",
719
- " channel_mult = (1, 1, 2, 2, 4, 4)#(1, 2, 3, 4)\n",
720
- " elif image_size == 28:\n",
721
- " channel_mult = (1, 2)#(1, 2, 3, 4)\n",
722
- " else:\n",
723
- " raise ValueError(f\"unsupported image size: {image_size}\")\n",
724
- " # else:\n",
725
- " # channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(\",\"))\n",
726
  " \n",
727
- " attention_ds = []\n",
728
- " for res in attention_resolutions:\n",
729
- " attention_ds.append(image_size // int(res))\n",
730
- "\n",
731
- " # print(\"before, ContextUnet, num_heads_upsample =\", num_heads_upsample, \"num_heads =\", num_heads)\n",
732
- " if num_heads_upsample == -1:\n",
733
- " num_heads_upsample = num_heads\n",
734
- " # print(\"after, ContextUnet, num_heads_upsample =\", num_heads_upsample, \"num_heads =\", num_heads)\n",
735
- "\n",
736
- " # self.n_param = n_param\n",
737
- " self.model_channels = model_channels\n",
738
- " self.dtype = torch.float32\n",
739
- "\n",
740
- " self.token_embedding = nn.Linear(n_param, model_channels * 4)\n",
741
- "\n",
742
- " time_embed_dim = model_channels * 4\n",
743
- " self.time_embed = nn.Sequential(\n",
744
- " nn.Linear(model_channels, time_embed_dim),\n",
745
- " nn.SiLU(),\n",
746
- " nn.Linear(time_embed_dim, time_embed_dim),\n",
747
- " )\n",
748
- "\n",
749
- " ch = input_ch = int(channel_mult[0] * model_channels)\n",
750
- "\n",
751
- " ###################### input_blocks ######################\n",
752
- " self.input_blocks = nn.ModuleList(\n",
753
- " [TimestepEmbedSequential(Conv[dim](in_channels, ch, 3, padding=1))]\n",
754
- " )\n",
755
- " self._feature_size = ch\n",
756
- " input_block_chans = [ch]\n",
757
- " ds = 1\n",
758
- "\n",
759
- " for level, mult in enumerate(channel_mult):\n",
760
- " for _ in range(num_res_blocks):\n",
761
- " layers = [\n",
762
- " ResBlock(\n",
763
- " ch,\n",
764
- " time_embed_dim,\n",
765
- " dropout,\n",
766
- " out_channels = int(mult * model_channels),\n",
767
- " use_checkpoint = use_checkpoint,\n",
768
- " use_scale_shift_norm = use_scale_shift_norm,\n",
769
- " dim = dim,\n",
770
- " stride = stride,\n",
771
- " )\n",
772
- " ]\n",
773
- " ch = int(mult * model_channels)\n",
774
- " if ds in attention_ds:\n",
775
- " layers.append(\n",
776
- " AttentionBlock(\n",
777
- " ch,\n",
778
- " use_checkpoint=use_checkpoint,\n",
779
- " num_heads = num_heads,\n",
780
- " num_head_channels = num_head_channels,\n",
781
- " encoder_channels = encoder_channels,\n",
782
- " )\n",
783
- " )\n",
784
- " self.input_blocks.append(TimestepEmbedSequential(*layers))\n",
785
- " self._feature_size += ch\n",
786
- " input_block_chans.append(ch)\n",
787
- "\n",
788
- " if level != len(channel_mult) - 1:\n",
789
- " out_ch = ch\n",
790
- " self.input_blocks.append(\n",
791
- " TimestepEmbedSequential(\n",
792
- " ResBlock(\n",
793
- " ch,\n",
794
- " time_embed_dim,\n",
795
- " dropout,\n",
796
- " out_channels=out_ch,\n",
797
- " # dims=dims,\n",
798
- " use_checkpoint=use_checkpoint,\n",
799
- " use_scale_shift_norm=use_scale_shift_norm,\n",
800
- " down=True,\n",
801
- " dim = dim,\n",
802
- " stride = stride,\n",
803
- " )\n",
804
- " if resblock_updown\n",
805
- " else Downsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)\n",
806
- " )\n",
807
- " )\n",
808
- " ch = out_ch\n",
809
- " input_block_chans.append(ch)\n",
810
- " ds *= 2\n",
811
- " self._feature_size += ch\n",
812
- "\n",
813
- "\n",
814
- " ###################### middle_blocks ######################\n",
815
- " self.middle_block = TimestepEmbedSequential(\n",
816
- " ResBlock(\n",
817
- " ch,\n",
818
- " time_embed_dim,\n",
819
- " dropout,\n",
820
- " use_checkpoint=use_checkpoint,\n",
821
- " use_scale_shift_norm=use_scale_shift_norm,\n",
822
- " dim = dim,\n",
823
- " stride = stride,\n",
824
- " ),\n",
825
- " AttentionBlock(\n",
826
- " ch,\n",
827
- " use_checkpoint=use_checkpoint,\n",
828
- " num_heads=num_heads,\n",
829
- " num_head_channels=num_head_channels,\n",
830
- " encoder_channels=encoder_channels,\n",
831
- " ),\n",
832
- " ResBlock(\n",
833
- " ch,\n",
834
- " time_embed_dim,\n",
835
- " dropout,\n",
836
- " use_checkpoint=use_checkpoint,\n",
837
- " use_scale_shift_norm=use_scale_shift_norm,\n",
838
- " dim = dim,\n",
839
- " stride = stride,\n",
840
- " ),\n",
841
- " )\n",
842
- " self._feature_size += ch\n",
843
- "\n",
844
- "\n",
845
- " ###################### output_blocks ######################\n",
846
- " self.output_blocks = nn.ModuleList([])\n",
847
- " for level, mult in list(enumerate(channel_mult))[::-1]:\n",
848
- " for i in range(num_res_blocks + 1):\n",
849
- " ich = input_block_chans.pop()\n",
850
- " layers = [\n",
851
- " ResBlock(\n",
852
- " ch + ich,\n",
853
- " time_embed_dim,\n",
854
- " dropout,\n",
855
- " out_channels=int(model_channels * mult),\n",
856
- " # dims=dims,\n",
857
- " use_checkpoint=use_checkpoint,\n",
858
- " use_scale_shift_norm=use_scale_shift_norm,\n",
859
- " dim = dim,\n",
860
- " stride = stride,\n",
861
- " )\n",
862
- " ]\n",
863
- " ch = int(model_channels * mult)\n",
864
- " if ds in attention_ds:\n",
865
- " # print(\"ds in attention_resolutions, num_heads=\", num_heads_upsample)\n",
866
- " layers.append(\n",
867
- " AttentionBlock(\n",
868
- " ch,\n",
869
- " use_checkpoint=use_checkpoint,\n",
870
- " num_heads=num_heads_upsample,\n",
871
- " num_head_channels=num_head_channels,\n",
872
- " encoder_channels=encoder_channels,\n",
873
- " )\n",
874
- " )\n",
875
- " if level and i == num_res_blocks:\n",
876
- " out_ch = ch\n",
877
- " layers.append(\n",
878
- " ResBlock(\n",
879
- " ch,\n",
880
- " time_embed_dim,\n",
881
- " dropout,\n",
882
- " out_channels=out_ch,\n",
883
- " # dims=dims,\n",
884
- " use_checkpoint=use_checkpoint,\n",
885
- " use_scale_shift_norm=use_scale_shift_norm,\n",
886
- " up=True,\n",
887
- " dim = dim,\n",
888
- " stride = stride,\n",
889
- " )\n",
890
- " if resblock_updown\n",
891
- " else Upsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)\n",
892
- " )\n",
893
- " ds //= 2\n",
894
- " self.output_blocks.append(TimestepEmbedSequential(*layers))\n",
895
- " self._feature_size += ch\n",
896
- "\n",
897
- " self.out = nn.Sequential(\n",
898
- " # nn.BatchNorm2d(ch),\n",
899
- " normalization(ch, swish=1.0),\n",
900
- " nn.Identity(),\n",
901
- " zero_module(Conv[dim](input_ch, out_channels, 3, padding=1)),\n",
902
- " )\n",
903
- " # self.use_fp16 = use_fp16\n",
904
- "\n",
905
- " def forward(self, x, timesteps, y=None):\n",
906
- " hs = []\n",
907
- " emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))\n",
908
- " if y != None:\n",
909
- " text_outputs = self.token_embedding(y.float())\n",
910
- " emb = emb + text_outputs.to(emb)\n",
911
- "\n",
912
- " h = x.type(self.dtype)\n",
913
- " # print(\"0,h.shape =\", h.shape)\n",
914
- " for module in self.input_blocks:\n",
915
- " h = module(h, emb)\n",
916
- " hs.append(h)\n",
917
- " # print(\"module encoder, h.shape =\", h.shape)\n",
918
- " # print(\"2,h.shape =\", h.shape)\n",
919
- " h = self.middle_block(h, emb)\n",
920
- " # print(\"middle block, h.shape =\", h.shape)\n",
921
- " # print(\"2,h.shape =\", h.shape)\n",
922
- " for module in self.output_blocks:\n",
923
- " # print(\"for module in self.output_blocks, h.shape =\", h.shape)\n",
924
- " # print(\"len(hs) =\", len(hs), \", hs[-1].shape =\", hs[-1].shape)\n",
925
- " h = torch.cat([h, hs.pop()], dim=1)\n",
926
- " h = module(h, emb)\n",
927
- " # print(\"module decoder, h.shape =\", h.shape)\n",
928
- "\n",
929
- " h = h.type(x.dtype)\n",
930
- " h = self.out(h)\n",
931
- " # print(\"self.out(h)\", \"h.shape =\", h.shape)\n",
932
- "\n",
933
- " return h "
934
  ]
935
  },
936
  {
937
  "cell_type": "code",
938
- "execution_count": 19,
939
  "metadata": {},
940
  "outputs": [],
941
  "source": [
@@ -960,12 +893,13 @@
960
  " self.step += 1\n",
961
  "\n",
962
  " def reset_parameters(self, ema_model, model):\n",
963
- " ema_model.load_state_dict(model.state_dict())"
 
964
  ]
965
  },
966
  {
967
  "cell_type": "code",
968
- "execution_count": 20,
969
  "metadata": {},
970
  "outputs": [],
971
  "source": [
@@ -1031,7 +965,7 @@
1031
  },
1032
  {
1033
  "cell_type": "code",
1034
- "execution_count": 21,
1035
  "metadata": {},
1036
  "outputs": [],
1037
  "source": [
@@ -1041,7 +975,7 @@
1041
  },
1042
  {
1043
  "cell_type": "code",
1044
- "execution_count": 22,
1045
  "metadata": {},
1046
  "outputs": [],
1047
  "source": [
@@ -1050,7 +984,7 @@
1050
  },
1051
  {
1052
  "cell_type": "code",
1053
- "execution_count": 23,
1054
  "metadata": {},
1055
  "outputs": [],
1056
  "source": [
@@ -1074,7 +1008,7 @@
1074
  },
1075
  {
1076
  "cell_type": "code",
1077
- "execution_count": 24,
1078
  "metadata": {},
1079
  "outputs": [],
1080
  "source": [
@@ -1272,7 +1206,7 @@
1272
  },
1273
  {
1274
  "cell_type": "code",
1275
- "execution_count": 25,
1276
  "metadata": {},
1277
  "outputs": [
1278
  {
@@ -1482,7 +1416,7 @@
1482
  },
1483
  {
1484
  "cell_type": "code",
1485
- "execution_count": 26,
1486
  "metadata": {},
1487
  "outputs": [
1488
  {
@@ -1509,14 +1443,14 @@
1509
  "output_type": "stream",
1510
  "text": [
1511
  "params loaded: (200, 2)\n",
1512
- "images rescaled to [-1.0, 1.064338207244873]\n",
1513
- "params rescaled to [0.0, 0.9988593502151616]\n"
1514
  ]
1515
  },
1516
  {
1517
  "data": {
1518
  "application/vnd.jupyter.widget-view+json": {
1519
- "model_id": "2e0b629831714bc2b32e25d44a72f4b3",
1520
  "version_major": 2,
1521
  "version_minor": 0
1522
  },
@@ -1530,7 +1464,7 @@
1530
  {
1531
  "data": {
1532
  "application/vnd.jupyter.widget-view+json": {
1533
- "model_id": "c634a180ede04f3cb09ab74daf0401c6",
1534
  "version_major": 2,
1535
  "version_minor": 0
1536
  },
@@ -1544,7 +1478,7 @@
1544
  {
1545
  "data": {
1546
  "application/vnd.jupyter.widget-view+json": {
1547
- "model_id": "6f3a0791c42b4d7e958f2a9d57f64de8",
1548
  "version_major": 2,
1549
  "version_minor": 0
1550
  },
@@ -1558,7 +1492,7 @@
1558
  {
1559
  "data": {
1560
  "application/vnd.jupyter.widget-view+json": {
1561
- "model_id": "9dce2de3e8a14aee83e2b182dc06608f",
1562
  "version_major": 2,
1563
  "version_minor": 0
1564
  },
@@ -1572,7 +1506,7 @@
1572
  {
1573
  "data": {
1574
  "application/vnd.jupyter.widget-view+json": {
1575
- "model_id": "d4596bdc71cc4d4cb780442b97849883",
1576
  "version_major": 2,
1577
  "version_minor": 0
1578
  },
@@ -1586,7 +1520,7 @@
1586
  {
1587
  "data": {
1588
  "application/vnd.jupyter.widget-view+json": {
1589
- "model_id": "6e68847216504241b81ebcb71c48f687",
1590
  "version_major": 2,
1591
  "version_minor": 0
1592
  },
@@ -1600,7 +1534,7 @@
1600
  {
1601
  "data": {
1602
  "application/vnd.jupyter.widget-view+json": {
1603
- "model_id": "830c25eb902a47e7997dcdb40099c5a4",
1604
  "version_major": 2,
1605
  "version_minor": 0
1606
  },
@@ -1614,7 +1548,7 @@
1614
  {
1615
  "data": {
1616
  "application/vnd.jupyter.widget-view+json": {
1617
- "model_id": "87fdac7b595c4d0ea7258ee8bb35de17",
1618
  "version_major": 2,
1619
  "version_minor": 0
1620
  },
@@ -1628,7 +1562,7 @@
1628
  {
1629
  "data": {
1630
  "application/vnd.jupyter.widget-view+json": {
1631
- "model_id": "b9f6be95f4bd403d85f6df34756e7b8d",
1632
  "version_major": 2,
1633
  "version_minor": 0
1634
  },
@@ -1642,7 +1576,7 @@
1642
  {
1643
  "data": {
1644
  "application/vnd.jupyter.widget-view+json": {
1645
- "model_id": "28ec5d881b37440ba5f4c863fc552c17",
1646
  "version_major": 2,
1647
  "version_minor": 0
1648
  },
@@ -1660,7 +1594,7 @@
1660
  },
1661
  {
1662
  "cell_type": "code",
1663
- "execution_count": null,
1664
  "metadata": {},
1665
  "outputs": [
1666
  {
@@ -1678,7 +1612,7 @@
1678
  {
1679
  "data": {
1680
  "application/vnd.jupyter.widget-view+json": {
1681
- "model_id": "58944c3b1e4f42bb8771f776c35a90a7",
1682
  "version_major": 2,
1683
  "version_minor": 0
1684
  },
@@ -1688,28 +1622,6 @@
1688
  },
1689
  "metadata": {},
1690
  "output_type": "display_data"
1691
- },
1692
- {
1693
- "ename": "RuntimeError",
1694
- "evalue": "CUDA out of memory. Tried to allocate 640.00 MiB (GPU 0; 23.64 GiB total capacity; 21.65 GiB already allocated; 432.50 MiB free; 22.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
1695
- "output_type": "error",
1696
- "traceback": [
1697
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1698
- "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
1699
- "Cell \u001b[0;32mIn[26], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ddpm21cm\u001b[39m.\u001b[39;49msample(\u001b[39m\"\u001b[39;49m\u001b[39m./outputs/model_state_09.pth\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
1700
- "Cell \u001b[0;32mIn[25], line 177\u001b[0m, in \u001b[0;36mDDPM21CM.sample\u001b[0;34m(self, file, params, ema, entire)\u001b[0m\n\u001b[1;32m 171\u001b[0m nn_model\u001b[39m.\u001b[39meval()\n\u001b[1;32m 173\u001b[0m \u001b[39m# self.ema_model = ContextUnet(n_param=config.n_param, image_size=config.HII_DIM, dim=config.dim, stride=config.stride).to(config.device)\u001b[39;00m\n\u001b[1;32m 174\u001b[0m \u001b[39m# self.ema_model.load_state_dict(torch.load(os.path.join(config.output_dir, f\"{config.resume}\"))['ema_unet_state_dict'])\u001b[39;00m\n\u001b[1;32m 175\u001b[0m \u001b[39m# print(f\"resumed ema_model from {config.resume}\")\u001b[39;00m\n\u001b[0;32m--> 177\u001b[0m x_last, x_entire \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mddpm\u001b[39m.\u001b[39;49msample(\n\u001b[1;32m 178\u001b[0m nn_model\u001b[39m=\u001b[39;49mnn_model, \n\u001b[1;32m 179\u001b[0m params\u001b[39m=\u001b[39;49mparams\u001b[39m.\u001b[39;49mto(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mdevice), \n\u001b[1;32m 180\u001b[0m device\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mdevice, \n\u001b[1;32m 181\u001b[0m guide_w\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mguide_w\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 184\u001b[0m np\u001b[39m.\u001b[39msave(os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mjoin(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39moutput_dir, \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mrun_name\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39mema\u001b[39m\u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39mif\u001b[39;00m\u001b[39m \u001b[39mema\u001b[39m \u001b[39m\u001b[39melse\u001b[39;00m\u001b[39m \u001b[39m\u001b[39mNone\u001b[39;00m\u001b[39m}\u001b[39;00m\u001b[39m.npy\u001b[39m\u001b[39m\"\u001b[39m), x_last)\n\u001b[1;32m 186\u001b[0m \u001b[39mif\u001b[39;00m entire:\n",
1701
- "Cell \u001b[0;32mIn[7], line 75\u001b[0m, in \u001b[0;36mDDPMScheduler.sample\u001b[0;34m(self, nn_model, params, device, guide_w)\u001b[0m\n\u001b[1;32m 71\u001b[0m t_is \u001b[39m=\u001b[39m t_is\u001b[39m.\u001b[39mrepeat(\u001b[39m2\u001b[39m)\n\u001b[1;32m 73\u001b[0m \u001b[39m# split predictions and compute weighting\u001b[39;00m\n\u001b[1;32m 74\u001b[0m \u001b[39m# print(\"nn_model input shape\", x_i.shape, t_is.shape, c_i.shape)\u001b[39;00m\n\u001b[0;32m---> 75\u001b[0m eps \u001b[39m=\u001b[39m nn_model(x_i, t_is, c_i)\n\u001b[1;32m 76\u001b[0m eps1 \u001b[39m=\u001b[39m eps[:n_sample]\n\u001b[1;32m 77\u001b[0m eps2 \u001b[39m=\u001b[39m eps[n_sample:]\n",
1702
- "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49m\u001b[39minput\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1131\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
1703
- "Cell \u001b[0;32mIn[18], line 241\u001b[0m, in \u001b[0;36mContextUnet.forward\u001b[0;34m(self, x, timesteps, y)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[39mfor\u001b[39;00m module \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moutput_blocks:\n\u001b[1;32m 238\u001b[0m \u001b[39m# print(\"for module in self.output_blocks, h.shape =\", h.shape)\u001b[39;00m\n\u001b[1;32m 239\u001b[0m \u001b[39m# print(\"len(hs) =\", len(hs), \", hs[-1].shape =\", hs[-1].shape)\u001b[39;00m\n\u001b[1;32m 240\u001b[0m h \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mcat([h, hs\u001b[39m.\u001b[39mpop()], dim\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[0;32m--> 241\u001b[0m h \u001b[39m=\u001b[39m module(h, emb)\n\u001b[1;32m 242\u001b[0m \u001b[39m# print(\"module decoder, h.shape =\", h.shape)\u001b[39;00m\n\u001b[1;32m 244\u001b[0m h \u001b[39m=\u001b[39m h\u001b[39m.\u001b[39mtype(x\u001b[39m.\u001b[39mdtype)\n",
1704
- "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49m\u001b[39minput\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1131\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
1705
- "Cell \u001b[0;32mIn[13], line 7\u001b[0m, in \u001b[0;36mTimestepEmbedSequential.forward\u001b[0;34m(self, x, emb, encoder_out)\u001b[0m\n\u001b[1;32m 5\u001b[0m x \u001b[39m=\u001b[39m layer(x, emb)\n\u001b[1;32m 6\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(layer, AttentionBlock):\n\u001b[0;32m----> 7\u001b[0m x \u001b[39m=\u001b[39m layer(x, encoder_out)\n\u001b[1;32m 8\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 9\u001b[0m x \u001b[39m=\u001b[39m layer(x)\n",
1706
- "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49m\u001b[39minput\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1131\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
1707
- "Cell \u001b[0;32mIn[16], line 37\u001b[0m, in \u001b[0;36mAttentionBlock.forward\u001b[0;34m(self, x, encoder_out)\u001b[0m\n\u001b[1;32m 35\u001b[0m h \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mattention(qkv, encoder_out)\n\u001b[1;32m 36\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m---> 37\u001b[0m h \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mattention(qkv)\n\u001b[1;32m 38\u001b[0m \u001b[39m# print(\"AttentionBlock, before proj_out, torch.unique(h).shape =\", torch.unique(h).shape)\u001b[39;00m\n\u001b[1;32m 39\u001b[0m h \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproj_out(h)\n",
1708
- "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49m\u001b[39minput\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1131\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
1709
- "Cell \u001b[0;32mIn[15], line 21\u001b[0m, in \u001b[0;36mQKVAttention.forward\u001b[0;34m(self, qkv, encoder_kv)\u001b[0m\n\u001b[1;32m 18\u001b[0m v \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mcat([ev,v], dim\u001b[39m=\u001b[39m\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[1;32m 20\u001b[0m scale \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m \u001b[39m/\u001b[39m math\u001b[39m.\u001b[39msqrt(math\u001b[39m.\u001b[39msqrt(ch))\n\u001b[0;32m---> 21\u001b[0m weight \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49meinsum(\u001b[39m\"\u001b[39;49m\u001b[39mbct,bcs->bts\u001b[39;49m\u001b[39m\"\u001b[39;49m, q\u001b[39m*\u001b[39;49mscale, k\u001b[39m*\u001b[39;49mscale)\n\u001b[1;32m 22\u001b[0m weight \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39msoftmax(weight\u001b[39m.\u001b[39mfloat(), dim\u001b[39m=\u001b[39m\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m)\u001b[39m.\u001b[39mtype(weight\u001b[39m.\u001b[39mdtype)\n\u001b[1;32m 24\u001b[0m a \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39meinsum(\u001b[39m\"\u001b[39m\u001b[39mbts,bcs->bct\u001b[39m\u001b[39m\"\u001b[39m, weight, v)\n",
1710
- "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/functional.py:360\u001b[0m, in \u001b[0;36meinsum\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[39m# recurse incase operands contains value that has torch function\u001b[39;00m\n\u001b[1;32m 357\u001b[0m \u001b[39m# in the original implementation this line is omitted\u001b[39;00m\n\u001b[1;32m 358\u001b[0m \u001b[39mreturn\u001b[39;00m einsum(equation, \u001b[39m*\u001b[39m_operands)\n\u001b[0;32m--> 360\u001b[0m \u001b[39mreturn\u001b[39;00m _VF\u001b[39m.\u001b[39;49meinsum(equation, operands)\n",
1711
- "\u001b[0;31mRuntimeError\u001b[0m: CUDA out of memory. Tried to allocate 640.00 MiB (GPU 0; 23.64 GiB total capacity; 21.65 GiB already allocated; 432.50 MiB free; 22.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
1712
- ]
1713
  }
1714
  ],
1715
  "source": [
 
33
  {
34
  "data": {
35
  "application/vnd.jupyter.widget-view+json": {
36
+ "model_id": "0e2d634b9f734693a5e1eace447bd2e1",
37
  "version_major": 2,
38
  "version_minor": 0
39
  },
 
81
  "from pathlib import Path\n",
82
  "from diffusers.optimization import get_cosine_schedule_with_warmup\n",
83
  "from accelerate import notebook_launcher, Accelerator\n",
84
+ "from huggingface_hub import create_repo, upload_folder\n",
85
+ "\n",
86
+ "from load_h5 import Dataset4h5\n",
87
+ "from context_unet import ContextUnet"
88
  ]
89
  },
90
  {
 
102
  "metadata": {},
103
  "outputs": [],
104
  "source": [
105
+ "# class Dataset4h5(Dataset):\n",
106
+ "# def __init__(self, dir_name, num_image=10, field='brightness_temp', shuffle=True, idx=None, num_redshift=32, HII_DIM=32, rescale=True, drop_prob = 0, dim=2):\n",
107
+ "# super().__init__()\n",
108
  " \n",
109
+ "# self.dir_name = dir_name\n",
110
+ "# self.num_image = num_image\n",
111
+ "# self.field = field\n",
112
+ "# self.shuffle = shuffle\n",
113
+ "# self.idx = idx\n",
114
+ "# self.num_redshift = num_redshift\n",
115
+ "# self.HII_DIM = HII_DIM\n",
116
+ "# self.drop_prob = drop_prob\n",
117
+ "# self.dim = dim\n",
118
+ "\n",
119
+ "# self.load_h5()\n",
120
+ "# if rescale:\n",
121
+ "# self.images = self.rescale(self.images, to=[-1,1])\n",
122
+ "# self.params = self.rescale(self.params, to=[0,1])\n",
123
+ "\n",
124
+ "# self.len = len(self.params)\n",
125
+ "# self.images = torch.from_numpy(self.images)\n",
126
+ "# print(f\"images rescaled to [{self.images.min()}, {self.images.max()}]\")\n",
127
+ "\n",
128
+ "# cond_filter = torch.bernoulli(torch.ones(len(self.params),1)-self.drop_prob).repeat(1,self.params.shape[1]).numpy()\n",
129
+ "# self.params = torch.from_numpy(self.params*cond_filter)\n",
130
+ "# print(f\"params rescaled to [{self.params.min()}, {self.params.max()}]\")\n",
131
+ "\n",
132
+ "# def load_h5(self):\n",
133
+ "# with h5py.File(self.dir_name, 'r') as f:\n",
134
+ "# print(f\"dataset content: {f.keys()}\")\n",
135
+ "# max_num_image = len(f['brightness_temp'])#.shape[0]\n",
136
+ "# print(f\"{max_num_image} images can be loaded\")\n",
137
+ "# field_shape = f['brightness_temp'].shape[1:]\n",
138
+ "# print(f\"field.shape = {field_shape}\")\n",
139
+ "# self.params_keys = list(f['params']['keys'])\n",
140
+ "# print(f\"params keys = {self.params_keys}\")\n",
141
+ "\n",
142
+ "# if self.idx is None:\n",
143
+ "# if self.shuffle:\n",
144
+ "# self.idx = np.sort(random.sample(range(max_num_image), self.num_image))\n",
145
+ "# print(f\"loading {self.num_image} images randomly\")\n",
146
+ "# # print(self.idx)\n",
147
+ "# else:\n",
148
+ "# self.idx = range(self.num_image)\n",
149
+ "# print(f\"loading {len(self.idx)} images with idx = {self.idx}\")\n",
150
+ "# else:\n",
151
+ "# print(f\"loading {len(self.idx)} images with idx = {self.idx}\")\n",
152
+ "\n",
153
+ "# if self.dim == 2:\n",
154
+ "# self.images = f[self.field][self.idx,0,:self.HII_DIM,-self.num_redshift:][:,None]\n",
155
+ "# elif self.dim == 3:\n",
156
+ "# self.images = f[self.field][self.idx,:self.HII_DIM,:self.HII_DIM,-self.num_redshift:][:,None]\n",
157
+ "# print(f\"images loaded:\", self.images.shape)\n",
158
+ "\n",
159
+ "# self.params = f['params']['values'][self.idx]\n",
160
+ "# print(\"params loaded:\", self.params.shape)\n",
161
  " \n",
162
+ "# # plt.imshow(self.images[0,0,0])\n",
163
+ "# # plt.show()\n",
164
+ "\n",
165
+ "# def rescale(self, value, to: list):\n",
166
+ "# # print(np.ndim(value))\n",
167
+ "# if np.ndim(value)==2:\n",
168
+ "# # print(f\"rescale params of shape {value.shape}\")\n",
169
+ "# ranges = \\\n",
170
+ "# {\n",
171
+ "# 0: [4, 6], # ION_Tvir_MIN\n",
172
+ "# 1: [10, 250], # HII_EFF_FACTOR\n",
173
+ "# }\n",
174
+ "# # elif np.ndim(value)==5: \n",
175
+ "# else: \n",
176
+ "# # value = np.array(value)\n",
177
+ "# # print(f\"rescale images of shape {np.shape(value)}\")\n",
178
+ "# ranges = \\\n",
179
+ "# {\n",
180
+ "# 0: [0, 80], # brightness_temp\n",
181
+ "# }\n",
182
+ "# # print(f\"value.min = {value.min()}, value.max = {value.max()}\")\n",
183
+ "# for i in range(np.shape(value)[1]):\n",
184
+ "# value[:,i] = (value[:,i] - ranges[i][0]) / (ranges[i][1]-ranges[i][0])\n",
185
+ "# # print(f\"value.min = {value.min()}, value.max = {value.max()}\")\n",
186
+ "# value = value * (to[1]-to[0]) + to[0]\n",
187
+ "# return value \n",
188
+ "\n",
189
+ "# def __getitem__(self, index):\n",
190
+ "# return self.images[index], self.params[index]\n",
191
+ "\n",
192
+ "# def __len__(self):\n",
193
+ "# return self.len"
194
  ]
195
  },
196
  {
 
349
  "metadata": {},
350
  "outputs": [],
351
  "source": [
352
+ "# class GroupNorm32(nn.GroupNorm):\n",
353
+ "# def __init__(self, num_groups, num_channels, swish, eps=1e-5):\n",
354
+ "# super().__init__(num_groups=num_groups, num_channels=num_channels, eps=eps)\n",
355
+ "# self.swish = swish\n",
356
+ "\n",
357
+ "# def forward(self, x):\n",
358
+ "# y = super().forward(x.float()).to(x.dtype)\n",
359
+ "# if self.swish == 1.0:\n",
360
+ "# y = F.silu(y)\n",
361
+ "# elif self.swish:\n",
362
+ "# y = y * F.sigmoid(y * float(self.swish))\n",
363
+ "# return y\n",
364
+ "\n",
365
+ "# def normalization(channels, swish=0.0):\n",
366
+ "# \"\"\"\n",
367
+ "# Make a standard normalization layer, with an optional swish activation.\n",
368
+ "\n",
369
+ "# :param channels: number of input channels.\n",
370
+ "# :return: an nn.Module for normalization.\n",
371
+ "# \"\"\"\n",
372
+ "# #print (channels)\n",
373
+ "# return GroupNorm32(num_channels=channels, num_groups=32, swish=swish)\n",
374
+ "\n",
375
+ "# Conv = {\n",
376
+ "# 1: nn.Conv1d,\n",
377
+ "# 2: nn.Conv2d,\n",
378
+ "# 3: nn.Conv3d,\n",
379
+ "# }\n",
380
+ "\n",
381
+ "# AvgPool = {\n",
382
+ "# 1: nn.AvgPool1d,\n",
383
+ "# 2: nn.AvgPool2d,\n",
384
+ "# 3: nn.AvgPool3d\n",
385
+ "# }\n",
386
+ "\n",
387
+ "# class Downsample(nn.Module):\n",
388
+ "# def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):\n",
389
+ "# super().__init__()\n",
390
+ "# self.channels = channels\n",
391
+ "# self.out_channels = out_channels or channels\n",
392
+ "# # stride = config.stride\n",
393
+ "# if use_conv:\n",
394
+ "# # print(\"conv\")\n",
395
+ "# self.op = Conv[dim](channels, self.out_channels, 3, stride=stride, padding=1)\n",
396
+ "# else:\n",
397
+ "# # print(\"pool\")\n",
398
+ "# assert channels == self.out_channels\n",
399
+ "# self.op = AvgPool[dim](kernel_size=stride, stride=stride)\n",
400
+ "\n",
401
+ "# def forward(self, x):\n",
402
+ "# assert x.shape[1] == self.channels\n",
403
+ "# return self.op(x)\n",
404
+ "\n",
405
+ "# class Upsample(nn.Module):\n",
406
+ "# def __init__(self, channels, use_conv, out_channels=None, dim=2, stride=(2,2)):\n",
407
+ "# super().__init__()\n",
408
+ "# self.channels = channels\n",
409
+ "# self.out_channels = out_channels\n",
410
+ "# self.use_conv = use_conv\n",
411
+ "# self.stride = stride\n",
412
+ "# if self.use_conv:\n",
413
+ "# self.conv = Conv[dim](self.channels, self.out_channels, 3, padding=1)\n",
414
+ "\n",
415
+ "# def forward(self, x):\n",
416
+ "# assert x.shape[1] == self.channels\n",
417
+ "# # stride = config.stride\n",
418
+ "# # print(torch.tensor(x.shape[2:]))\n",
419
+ "# # print(torch.tensor(stride))\n",
420
+ "# shape = torch.tensor(x.shape[2:]) * torch.tensor(self.stride)\n",
421
+ "# shape = tuple(shape.detach().numpy())\n",
422
+ "# # print(shape)\n",
423
+ "# x = F.interpolate(x, shape, mode='nearest')\n",
424
+ "# if self.use_conv:\n",
425
+ "# x = self.conv(x)\n",
426
+ "# return x\n",
427
+ "\n",
428
+ "# def zero_module(module):\n",
429
+ "# \"\"\"\n",
430
+ "# clean gradient of parameters of the module\n",
431
+ "# \"\"\"\n",
432
+ "# for p in module.parameters():\n",
433
+ "# p.detach().zero_()\n",
434
+ "# return module\n",
435
+ "\n",
436
+ "# class TimestepBlock(ABC, nn.Module):\n",
437
+ "# @abstractmethod\n",
438
+ "# def forward(self, x, emb):\n",
439
+ "# \"\"\"\n",
440
+ "# test\n",
441
+ "# \"\"\"\n",
442
+ "\n",
443
+ "# class TimestepEmbedSequential(nn.Sequential, TimestepBlock):\n",
444
+ "# def forward(self, x, emb, encoder_out=None):\n",
445
+ "# for layer in self:\n",
446
+ "# if isinstance(layer, TimestepBlock):\n",
447
+ "# x = layer(x, emb)\n",
448
+ "# elif isinstance(layer, AttentionBlock):\n",
449
+ "# x = layer(x, encoder_out)\n",
450
+ "# else:\n",
451
+ "# x = layer(x)\n",
452
+ "# return x\n",
453
+ "\n",
454
+ "# class ResBlock(TimestepBlock):\n",
455
+ "# def __init__(\n",
456
+ "# self, channels, emb_channels, dropout, out_channels=None, use_conv=False, use_checkpoint=False, use_scale_shift_norm=False, up=False, down=False, dim=2, stride=(2,2),\n",
457
+ "# ):\n",
458
+ "# super().__init__()\n",
459
+ "# self.out_channels = out_channels or channels\n",
460
+ "# self.use_scale_shift_norm = use_scale_shift_norm\n",
461
+ "# self.stride = stride\n",
462
+ "\n",
463
+ "# self.in_layers = nn.Sequential(\n",
464
+ "# # nn.BatchNorm2d(channels), # normalize to standard gaussian\n",
465
+ "# normalization(channels, swish=1.0),\n",
466
+ "# nn.Identity(),\n",
467
+ "# Conv[dim](channels, self.out_channels, 3, padding=1),\n",
468
+ "# )\n",
469
  "\n",
470
+ "# self.updown = up or down\n",
471
+ "# if up:\n",
472
+ "# self.h_updown = Upsample(channels, False, dim=dim, stride=stride)\n",
473
+ "# self.x_updown = Upsample(channels, False, dim=dim, stride=stride)\n",
474
+ "# elif down:\n",
475
+ "# self.h_updown = Downsample(channels, False, dim=dim, stride=stride)\n",
476
+ "# self.x_updown = Downsample(channels, False, dim=dim, stride=stride)\n",
477
+ "# else:\n",
478
+ "# self.h_updown = self.x_updown = nn.Identity()\n",
479
+ "\n",
480
+ "# self.emb_layers = nn.Sequential(\n",
481
+ "# nn.SiLU(),\n",
482
+ "# nn.Linear(\n",
483
+ "# emb_channels,\n",
484
+ "# 2 * self.out_channels if use_scale_shift_norm else self.out_channels,\n",
485
+ "# ),\n",
486
+ "# )\n",
487
+ "\n",
488
+ "# self.out_layers = nn.Sequential(\n",
489
+ "# # nn.BatchNorm2d(self.out_channels),\n",
490
+ "# normalization(self.out_channels, swish=0.0 if use_scale_shift_norm else 1.0),\n",
491
+ "# nn.SiLU() if use_scale_shift_norm else nn.Identity(),\n",
492
+ "# nn.Dropout(p=dropout),\n",
493
+ "# zero_module(Conv[dim](self.out_channels, self.out_channels, 3, padding=1)),\n",
494
+ "# )\n",
495
+ "\n",
496
+ "# if self.out_channels == channels:\n",
497
+ "# self.skip_connection = nn.Identity()\n",
498
+ "# elif use_conv:\n",
499
+ "# self.skip_connection = Conv[dim](channels, self.out_channels, 3, padding=1)\n",
500
+ "# else:\n",
501
+ "# self.skip_connection = Conv[dim](channels, self.out_channels, 1)\n",
502
  " \n",
503
  "\n",
504
+ "# def forward(self, x, emb):\n",
505
+ "# if self.updown:\n",
506
+ "# in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]\n",
507
+ "# h = in_rest(x)\n",
508
+ "# h = self.h_updown(h)\n",
509
+ "# x = self.x_updown(x)\n",
510
+ "# h = in_conv(h)\n",
511
+ "# else:\n",
512
+ "# h = self.in_layers(x)\n",
513
+ "# emb_out = self.emb_layers(emb).type(h.dtype)\n",
514
  "\n",
515
+ "# while len(emb_out.shape) < len(h.shape):\n",
516
+ "# emb_out = emb_out[..., None]\n",
517
  "\n",
518
+ "# if self.use_scale_shift_norm:\n",
519
+ "# out_norm, out_rest = self.out_layers[0], self.out_layers[1:]\n",
520
+ "# scale, shift = torch.chunk(emb_out, 2, dim=1)\n",
521
+ "# h = out_norm(h) * (1+scale) + shift\n",
522
+ "# h = out_rest(h)\n",
523
+ "# else:\n",
524
+ "# h += emb_out\n",
525
+ "# h = self.out_layers(h)\n",
526
+ "# # print(\"ResBlock, torch.unique(h).shape =\", torch.unique(h).shape)\n",
527
+ "# return self.skip_connection(x) + h\n",
528
+ "\n",
529
+ "# class QKVAttention(nn.Module):\n",
530
+ "# def __init__(self, n_heads):\n",
531
+ "# super().__init__()\n",
532
+ "# self.n_heads = n_heads\n",
533
+ "# # print(\"QKVAttention, self.n_heads =\", self.n_heads)\n",
534
  " \n",
535
+ "# def forward(self, qkv, encoder_kv=None):\n",
536
+ "# bs, width, length = qkv.shape\n",
537
+ "# assert width % (3*self.n_heads) == 0\n",
538
+ "# ch = width // (3*self.n_heads)\n",
539
+ "\n",
540
+ "# # print(\"QKVAttention\", bs, self.n_heads, ch, length)\n",
541
+ "# q, k, v = qkv.reshape(bs*self.n_heads, ch*3, length).split(ch, dim=1)\n",
542
+ "# if encoder_kv is not None:\n",
543
+ "# assert encoder_kv.shape[1] == self.n_heads * ch * 2\n",
544
+ "# ek, ev = encoder_kv.reshape(bs*self.n_heads, ch*2, -1).split(ch, dim=1)\n",
545
+ "# k = torch.cat([ek,k], dim=-1)\n",
546
+ "# v = torch.cat([ev,v], dim=-1)\n",
547
+ "\n",
548
+ "# scale = 1 / math.sqrt(math.sqrt(ch))\n",
549
+ "# weight = torch.einsum(\"bct,bcs->bts\", q*scale, k*scale)\n",
550
+ "# weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)\n",
551
+ "\n",
552
+ "# a = torch.einsum(\"bts,bcs->bct\", weight, v)\n",
553
+ "# return a.reshape(bs, -1, length)\n",
554
+ "\n",
555
+ "# class AttentionBlock(nn.Module):\n",
556
+ "# def __init__(\n",
557
+ "# self,\n",
558
+ "# channels,\n",
559
+ "# num_heads=1,\n",
560
+ "# num_head_channels=-1,\n",
561
+ "# use_checkpoint=False,\n",
562
+ "# encoder_channels=None,\n",
563
+ "# ):\n",
564
+ "# super().__init__()\n",
565
+ "# self.channels = channels\n",
566
+ "# if num_head_channels == -1:\n",
567
+ "# self.num_heads = num_heads\n",
568
+ "# else:\n",
569
+ "# assert channels % num_head_channels == 0,\\\n",
570
+ "# f\"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}\"\n",
571
+ "# self.num_heads = channels // num_head_channels\n",
572
+ "\n",
573
+ "# self.use_checkpoint = use_checkpoint\n",
574
+ "# # self.norm = nn.BatchNorm2d(channels)\n",
575
+ "# self.norm = normalization(channels, swish=0.0)\n",
576
+ "# self.qkv = nn.Conv1d(channels, channels * 3, 1)\n",
 
577
  " \n",
578
+ "# self.attention = QKVAttention(self.num_heads)\n",
579
+ "\n",
580
+ "# if encoder_channels is not None:\n",
581
+ "# self.encoder_kv = nn.Conv1d(encoder_channels, channels * 2, 1)\n",
582
+ "# self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))\n",
583
+ "\n",
584
+ "# def forward(self, x, encoder_out=None):\n",
585
+ "# b, c, *spatial = x.shape\n",
586
+ "# qkv = self.qkv(self.norm(x).view(b, c, -1))\n",
587
+ "# if encoder_out is not None:\n",
588
+ "# encoder_out = self.encoder_kv(encoder_out)\n",
589
+ "# h = self.attention(qkv, encoder_out)\n",
590
+ "# else:\n",
591
+ "# h = self.attention(qkv)\n",
592
+ "# # print(\"AttentionBlock, before proj_out, torch.unique(h).shape =\", torch.unique(h).shape)\n",
593
+ "# h = self.proj_out(h)\n",
594
+ "# # print(\"AttentionBlock, after proj_out, torch.unique(h).shape =\", torch.unique(h).shape)\n",
595
+ "# return x + h.reshape(b, c, *spatial)\n",
596
+ "\n",
597
+ "# def timestep_embedding(timesteps, dim, max_period=10000):\n",
598
+ "# \"\"\"\n",
599
+ "# Create sinusoidal timestep embeddings.\n",
600
+ "\n",
601
+ "# :param timesteps: a 1-D Tensor of N indices, one per batch element.\n",
602
+ "# These may be fractional.\n",
603
+ "# :param dim: the dimension of the output.\n",
604
+ "# :param max_period: controls the minimum frequency of the embeddings.\n",
605
+ "# :return: an [N x dim] Tensor of positional embeddings.\n",
606
+ "# \"\"\"\n",
607
+ "# #print (timesteps.shape)\n",
608
+ "# half = dim // 2\n",
609
+ "# freqs = torch.exp(\n",
610
+ "# -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half\n",
611
+ "# ).to(device=timesteps.device)\n",
612
+ "# #print (timesteps[:, None].float().shape,freqs[None].shape)\n",
613
+ "# args = timesteps[:, None].float() * freqs[None]\n",
614
+ "# embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)\n",
615
+ "# if dim % 2:\n",
616
+ "# embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)\n",
617
+ "# return embedding\n",
618
+ "\n",
619
+ "# class ContextUnet(nn.Module):\n",
620
+ "# def __init__(\n",
621
+ "# self,\n",
622
+ "# n_param=2,\n",
623
+ "# image_size=64,\n",
624
+ "# in_channels=1,\n",
625
+ "# model_channels=128,\n",
626
+ "# out_channels = 1,\n",
627
+ "# channel_mult = None,\n",
628
+ "# num_res_blocks = 2,\n",
629
+ "# dropout = 0,\n",
630
+ "# use_checkpoint = False,\n",
631
+ "# use_scale_shift_norm = False,\n",
632
+ "# attention_resolutions = (16, 8),\n",
633
+ "# num_heads = 4,\n",
634
+ "# num_head_channels = -1,\n",
635
+ "# num_heads_upsample = -1,\n",
636
+ "# resblock_updown = False,\n",
637
+ "# conv_resample = True,\n",
638
+ "# encoder_channels = None,\n",
639
+ "# dim = 2,\n",
640
+ "# stride = (2,2)\n",
641
+ "# ):\n",
642
+ "# super().__init__()\n",
643
+ "\n",
644
+ "# if channel_mult == None:\n",
645
+ "# if image_size == 512:\n",
646
+ "# channel_mult = (0.5, 1, 1, 2, 2, 4, 4)\n",
647
+ "# elif image_size == 256:\n",
648
+ "# channel_mult = (1, 1, 2, 2, 4, 4)\n",
649
+ "# elif image_size == 128:\n",
650
+ "# channel_mult = (1, 1, 2, 3, 4)\n",
651
+ "# elif image_size == 64:\n",
652
+ "# channel_mult = (1, 1, 2, 2, 4, 4)#(1, 2, 3, 4)\n",
653
+ "# elif image_size == 28:\n",
654
+ "# channel_mult = (1, 2)#(1, 2, 3, 4)\n",
655
+ "# else:\n",
656
+ "# raise ValueError(f\"unsupported image size: {image_size}\")\n",
657
+ "# # else:\n",
658
+ "# # channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(\",\"))\n",
659
  " \n",
660
+ "# attention_ds = []\n",
661
+ "# for res in attention_resolutions:\n",
662
+ "# attention_ds.append(image_size // int(res))\n",
663
+ "\n",
664
+ "# # print(\"before, ContextUnet, num_heads_upsample =\", num_heads_upsample, \"num_heads =\", num_heads)\n",
665
+ "# if num_heads_upsample == -1:\n",
666
+ "# num_heads_upsample = num_heads\n",
667
+ "# # print(\"after, ContextUnet, num_heads_upsample =\", num_heads_upsample, \"num_heads =\", num_heads)\n",
668
+ "\n",
669
+ "# # self.n_param = n_param\n",
670
+ "# self.model_channels = model_channels\n",
671
+ "# self.dtype = torch.float32\n",
672
+ "\n",
673
+ "# self.token_embedding = nn.Linear(n_param, model_channels * 4)\n",
674
+ "\n",
675
+ "# time_embed_dim = model_channels * 4\n",
676
+ "# self.time_embed = nn.Sequential(\n",
677
+ "# nn.Linear(model_channels, time_embed_dim),\n",
678
+ "# nn.SiLU(),\n",
679
+ "# nn.Linear(time_embed_dim, time_embed_dim),\n",
680
+ "# )\n",
681
+ "\n",
682
+ "# ch = input_ch = int(channel_mult[0] * model_channels)\n",
683
+ "\n",
684
+ "# ###################### input_blocks ######################\n",
685
+ "# self.input_blocks = nn.ModuleList(\n",
686
+ "# [TimestepEmbedSequential(Conv[dim](in_channels, ch, 3, padding=1))]\n",
687
+ "# )\n",
688
+ "# self._feature_size = ch\n",
689
+ "# input_block_chans = [ch]\n",
690
+ "# ds = 1\n",
691
+ "\n",
692
+ "# for level, mult in enumerate(channel_mult):\n",
693
+ "# for _ in range(num_res_blocks):\n",
694
+ "# layers = [\n",
695
+ "# ResBlock(\n",
696
+ "# ch,\n",
697
+ "# time_embed_dim,\n",
698
+ "# dropout,\n",
699
+ "# out_channels = int(mult * model_channels),\n",
700
+ "# use_checkpoint = use_checkpoint,\n",
701
+ "# use_scale_shift_norm = use_scale_shift_norm,\n",
702
+ "# dim = dim,\n",
703
+ "# stride = stride,\n",
704
+ "# )\n",
705
+ "# ]\n",
706
+ "# ch = int(mult * model_channels)\n",
707
+ "# if ds in attention_ds:\n",
708
+ "# layers.append(\n",
709
+ "# AttentionBlock(\n",
710
+ "# ch,\n",
711
+ "# use_checkpoint=use_checkpoint,\n",
712
+ "# num_heads = num_heads,\n",
713
+ "# num_head_channels = num_head_channels,\n",
714
+ "# encoder_channels = encoder_channels,\n",
715
+ "# )\n",
716
+ "# )\n",
717
+ "# self.input_blocks.append(TimestepEmbedSequential(*layers))\n",
718
+ "# self._feature_size += ch\n",
719
+ "# input_block_chans.append(ch)\n",
720
+ "\n",
721
+ "# if level != len(channel_mult) - 1:\n",
722
+ "# out_ch = ch\n",
723
+ "# self.input_blocks.append(\n",
724
+ "# TimestepEmbedSequential(\n",
725
+ "# ResBlock(\n",
726
+ "# ch,\n",
727
+ "# time_embed_dim,\n",
728
+ "# dropout,\n",
729
+ "# out_channels=out_ch,\n",
730
+ "# # dims=dims,\n",
731
+ "# use_checkpoint=use_checkpoint,\n",
732
+ "# use_scale_shift_norm=use_scale_shift_norm,\n",
733
+ "# down=True,\n",
734
+ "# dim = dim,\n",
735
+ "# stride = stride,\n",
736
+ "# )\n",
737
+ "# if resblock_updown\n",
738
+ "# else Downsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)\n",
739
+ "# )\n",
740
+ "# )\n",
741
+ "# ch = out_ch\n",
742
+ "# input_block_chans.append(ch)\n",
743
+ "# ds *= 2\n",
744
+ "# self._feature_size += ch\n",
745
+ "\n",
746
+ "\n",
747
+ "# ###################### middle_blocks ######################\n",
748
+ "# self.middle_block = TimestepEmbedSequential(\n",
749
+ "# ResBlock(\n",
750
+ "# ch,\n",
751
+ "# time_embed_dim,\n",
752
+ "# dropout,\n",
753
+ "# use_checkpoint=use_checkpoint,\n",
754
+ "# use_scale_shift_norm=use_scale_shift_norm,\n",
755
+ "# dim = dim,\n",
756
+ "# stride = stride,\n",
757
+ "# ),\n",
758
+ "# AttentionBlock(\n",
759
+ "# ch,\n",
760
+ "# use_checkpoint=use_checkpoint,\n",
761
+ "# num_heads=num_heads,\n",
762
+ "# num_head_channels=num_head_channels,\n",
763
+ "# encoder_channels=encoder_channels,\n",
764
+ "# ),\n",
765
+ "# ResBlock(\n",
766
+ "# ch,\n",
767
+ "# time_embed_dim,\n",
768
+ "# dropout,\n",
769
+ "# use_checkpoint=use_checkpoint,\n",
770
+ "# use_scale_shift_norm=use_scale_shift_norm,\n",
771
+ "# dim = dim,\n",
772
+ "# stride = stride,\n",
773
+ "# ),\n",
774
+ "# )\n",
775
+ "# self._feature_size += ch\n",
776
+ "\n",
777
+ "\n",
778
+ "# ###################### output_blocks ######################\n",
779
+ "# self.output_blocks = nn.ModuleList([])\n",
780
+ "# for level, mult in list(enumerate(channel_mult))[::-1]:\n",
781
+ "# for i in range(num_res_blocks + 1):\n",
782
+ "# ich = input_block_chans.pop()\n",
783
+ "# layers = [\n",
784
+ "# ResBlock(\n",
785
+ "# ch + ich,\n",
786
+ "# time_embed_dim,\n",
787
+ "# dropout,\n",
788
+ "# out_channels=int(model_channels * mult),\n",
789
+ "# # dims=dims,\n",
790
+ "# use_checkpoint=use_checkpoint,\n",
791
+ "# use_scale_shift_norm=use_scale_shift_norm,\n",
792
+ "# dim = dim,\n",
793
+ "# stride = stride,\n",
794
+ "# )\n",
795
+ "# ]\n",
796
+ "# ch = int(model_channels * mult)\n",
797
+ "# if ds in attention_ds:\n",
798
+ "# # print(\"ds in attention_resolutions, num_heads=\", num_heads_upsample)\n",
799
+ "# layers.append(\n",
800
+ "# AttentionBlock(\n",
801
+ "# ch,\n",
802
+ "# use_checkpoint=use_checkpoint,\n",
803
+ "# num_heads=num_heads_upsample,\n",
804
+ "# num_head_channels=num_head_channels,\n",
805
+ "# encoder_channels=encoder_channels,\n",
806
+ "# )\n",
807
+ "# )\n",
808
+ "# if level and i == num_res_blocks:\n",
809
+ "# out_ch = ch\n",
810
+ "# layers.append(\n",
811
+ "# ResBlock(\n",
812
+ "# ch,\n",
813
+ "# time_embed_dim,\n",
814
+ "# dropout,\n",
815
+ "# out_channels=out_ch,\n",
816
+ "# # dims=dims,\n",
817
+ "# use_checkpoint=use_checkpoint,\n",
818
+ "# use_scale_shift_norm=use_scale_shift_norm,\n",
819
+ "# up=True,\n",
820
+ "# dim = dim,\n",
821
+ "# stride = stride,\n",
822
+ "# )\n",
823
+ "# if resblock_updown\n",
824
+ "# else Upsample(ch, conv_resample, out_channels=out_ch, dim=dim, stride=stride)\n",
825
+ "# )\n",
826
+ "# ds //= 2\n",
827
+ "# self.output_blocks.append(TimestepEmbedSequential(*layers))\n",
828
+ "# self._feature_size += ch\n",
829
+ "\n",
830
+ "# self.out = nn.Sequential(\n",
831
+ "# # nn.BatchNorm2d(ch),\n",
832
+ "# normalization(ch, swish=1.0),\n",
833
+ "# nn.Identity(),\n",
834
+ "# zero_module(Conv[dim](input_ch, out_channels, 3, padding=1)),\n",
835
+ "# )\n",
836
+ "# # self.use_fp16 = use_fp16\n",
837
+ "\n",
838
+ "# def forward(self, x, timesteps, y=None):\n",
839
+ "# hs = []\n",
840
+ "# emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))\n",
841
+ "# if y != None:\n",
842
+ "# text_outputs = self.token_embedding(y.float())\n",
843
+ "# emb = emb + text_outputs.to(emb)\n",
844
+ "\n",
845
+ "# h = x.type(self.dtype)\n",
846
+ "# # print(\"0,h.shape =\", h.shape)\n",
847
+ "# for module in self.input_blocks:\n",
848
+ "# h = module(h, emb)\n",
849
+ "# hs.append(h)\n",
850
+ "# # print(\"module encoder, h.shape =\", h.shape)\n",
851
+ "# # print(\"2,h.shape =\", h.shape)\n",
852
+ "# h = self.middle_block(h, emb)\n",
853
+ "# # print(\"middle block, h.shape =\", h.shape)\n",
854
+ "# # print(\"2,h.shape =\", h.shape)\n",
855
+ "# for module in self.output_blocks:\n",
856
+ "# # print(\"for module in self.output_blocks, h.shape =\", h.shape)\n",
857
+ "# # print(\"len(hs) =\", len(hs), \", hs[-1].shape =\", hs[-1].shape)\n",
858
+ "# h = torch.cat([h, hs.pop()], dim=1)\n",
859
+ "# h = module(h, emb)\n",
860
+ "# # print(\"module decoder, h.shape =\", h.shape)\n",
861
+ "\n",
862
+ "# h = h.type(x.dtype)\n",
863
+ "# h = self.out(h)\n",
864
+ "# # print(\"self.out(h)\", \"h.shape =\", h.shape)\n",
865
+ "\n",
866
+ "# return h "
867
  ]
868
  },
869
  {
870
  "cell_type": "code",
871
+ "execution_count": 9,
872
  "metadata": {},
873
  "outputs": [],
874
  "source": [
 
893
  " self.step += 1\n",
894
  "\n",
895
  " def reset_parameters(self, ema_model, model):\n",
896
+ " ema_model.load_state_dict(model.state_dict())\n",
897
+ " "
898
  ]
899
  },
900
  {
901
  "cell_type": "code",
902
+ "execution_count": 10,
903
  "metadata": {},
904
  "outputs": [],
905
  "source": [
 
965
  },
966
  {
967
  "cell_type": "code",
968
+ "execution_count": 11,
969
  "metadata": {},
970
  "outputs": [],
971
  "source": [
 
975
  },
976
  {
977
  "cell_type": "code",
978
+ "execution_count": 12,
979
  "metadata": {},
980
  "outputs": [],
981
  "source": [
 
984
  },
985
  {
986
  "cell_type": "code",
987
+ "execution_count": 13,
988
  "metadata": {},
989
  "outputs": [],
990
  "source": [
 
1008
  },
1009
  {
1010
  "cell_type": "code",
1011
+ "execution_count": 14,
1012
  "metadata": {},
1013
  "outputs": [],
1014
  "source": [
 
1206
  },
1207
  {
1208
  "cell_type": "code",
1209
+ "execution_count": 15,
1210
  "metadata": {},
1211
  "outputs": [
1212
  {
 
1416
  },
1417
  {
1418
  "cell_type": "code",
1419
+ "execution_count": 16,
1420
  "metadata": {},
1421
  "outputs": [
1422
  {
 
1443
  "output_type": "stream",
1444
  "text": [
1445
  "params loaded: (200, 2)\n",
1446
+ "images rescaled to [-1.0, 1.056351900100708]\n",
1447
+ "params rescaled to [0.0, 0.999164249684298]\n"
1448
  ]
1449
  },
1450
  {
1451
  "data": {
1452
  "application/vnd.jupyter.widget-view+json": {
1453
+ "model_id": "fec693362692472581efafa594095278",
1454
  "version_major": 2,
1455
  "version_minor": 0
1456
  },
 
1464
  {
1465
  "data": {
1466
  "application/vnd.jupyter.widget-view+json": {
1467
+ "model_id": "929a642531414269ae5516eb9d9a9ba2",
1468
  "version_major": 2,
1469
  "version_minor": 0
1470
  },
 
1478
  {
1479
  "data": {
1480
  "application/vnd.jupyter.widget-view+json": {
1481
+ "model_id": "2fb5460387ad4a3798499bbae31d301e",
1482
  "version_major": 2,
1483
  "version_minor": 0
1484
  },
 
1492
  {
1493
  "data": {
1494
  "application/vnd.jupyter.widget-view+json": {
1495
+ "model_id": "f7213d3285cd46ad9f2604f88b45725b",
1496
  "version_major": 2,
1497
  "version_minor": 0
1498
  },
 
1506
  {
1507
  "data": {
1508
  "application/vnd.jupyter.widget-view+json": {
1509
+ "model_id": "5ec52d75f5b54fe7a8d912baf75686c6",
1510
  "version_major": 2,
1511
  "version_minor": 0
1512
  },
 
1520
  {
1521
  "data": {
1522
  "application/vnd.jupyter.widget-view+json": {
1523
+ "model_id": "c68ccbc52fdb4c0fbeec1932bd8f74d5",
1524
  "version_major": 2,
1525
  "version_minor": 0
1526
  },
 
1534
  {
1535
  "data": {
1536
  "application/vnd.jupyter.widget-view+json": {
1537
+ "model_id": "5dc2869a9e694a0388336d2ec71818f5",
1538
  "version_major": 2,
1539
  "version_minor": 0
1540
  },
 
1548
  {
1549
  "data": {
1550
  "application/vnd.jupyter.widget-view+json": {
1551
+ "model_id": "9ed1309b7afb46d59b568e212ee2ac0a",
1552
  "version_major": 2,
1553
  "version_minor": 0
1554
  },
 
1562
  {
1563
  "data": {
1564
  "application/vnd.jupyter.widget-view+json": {
1565
+ "model_id": "f3ee8347673c47759bc4b419e363f39a",
1566
  "version_major": 2,
1567
  "version_minor": 0
1568
  },
 
1576
  {
1577
  "data": {
1578
  "application/vnd.jupyter.widget-view+json": {
1579
+ "model_id": "cb4824a035494e97a647d0c185645318",
1580
  "version_major": 2,
1581
  "version_minor": 0
1582
  },
 
1594
  },
1595
  {
1596
  "cell_type": "code",
1597
+ "execution_count": 27,
1598
  "metadata": {},
1599
  "outputs": [
1600
  {
 
1612
  {
1613
  "data": {
1614
  "application/vnd.jupyter.widget-view+json": {
1615
+ "model_id": "402d3818dd8a45cdaf774a7a1c19a4f4",
1616
  "version_major": 2,
1617
  "version_minor": 0
1618
  },
 
1622
  },
1623
  "metadata": {},
1624
  "output_type": "display_data"
1625
  }
1626
  ],
1627
  "source": [
load_h5.py CHANGED
@@ -1,27 +1,116 @@
1
  from dataclasses import dataclass
2
  import h5py
3
  import torch
4
- import torch.nn as nn
5
  from torch.utils.data import DataLoader, Dataset
6
  # from datasets import Dataset
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import random
10
- from abc import ABC, abstractmethod
11
- import torch.nn.functional as F
12
  import math
13
- from PIL import Image
14
  import os
15
- from torch.utils.tensorboard import SummaryWriter
16
- import copy
17
- from tqdm.auto import tqdm
18
  # from torchvision import transforms
19
  # from diffusers import UNet2DModel#, UNet3DConditionModel
20
  # from diffusers import DDPMScheduler
21
- from diffusers.utils import make_image_grid
22
  import datetime
23
- from pathlib import Path
24
- from diffusers.optimization import get_cosine_schedule_with_warmup
25
- from accelerate import notebook_launcher, Accelerator
26
- from huggingface_hub import create_repo, upload_folder
27
 
1
  from dataclasses import dataclass
2
  import h5py
3
  import torch
4
+ # import torch.nn as nn
5
  from torch.utils.data import DataLoader, Dataset
6
  # from datasets import Dataset
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
  import random
10
+ # from abc import ABC, abstractmethod
11
+ # import torch.nn.functional as F
12
  import math
13
+ # from PIL import Image
14
  import os
15
+ # from torch.utils.tensorboard import SummaryWriter
16
+ # import copy
17
+ # from tqdm.auto import tqdm
18
  # from torchvision import transforms
19
  # from diffusers import UNet2DModel#, UNet3DConditionModel
20
  # from diffusers import DDPMScheduler
21
+ # from diffusers.utils import make_image_grid
22
  import datetime
23
+ # from pathlib import Path
24
+ # from diffusers.optimization import get_cosine_schedule_with_warmup
25
+ # from accelerate import notebook_launcher, Accelerator
26
+ # from huggingface_hub import create_repo, upload_folder
27
 
28
+ class Dataset4h5(Dataset):
29
+ def __init__(self, dir_name, num_image=10, field='brightness_temp', shuffle=True, idx=None, num_redshift=32, HII_DIM=32, rescale=True, drop_prob = 0, dim=2):
30
+ super().__init__()
31
+
32
+ self.dir_name = dir_name
33
+ self.num_image = num_image
34
+ self.field = field
35
+ self.shuffle = shuffle
36
+ self.idx = idx
37
+ self.num_redshift = num_redshift
38
+ self.HII_DIM = HII_DIM
39
+ self.drop_prob = drop_prob
40
+ self.dim = dim
41
+
42
+ self.load_h5()
43
+ if rescale:
44
+ self.images = self.rescale(self.images, to=[-1,1])
45
+ self.params = self.rescale(self.params, to=[0,1])
46
+
47
+ self.len = len(self.params)
48
+ self.images = torch.from_numpy(self.images)
49
+ print(f"images rescaled to [{self.images.min()}, {self.images.max()}]")
50
+
51
+ cond_filter = torch.bernoulli(torch.ones(len(self.params),1)-self.drop_prob).repeat(1,self.params.shape[1]).numpy()
52
+ self.params = torch.from_numpy(self.params*cond_filter)
53
+ print(f"params rescaled to [{self.params.min()}, {self.params.max()}]")
54
+
55
+ def load_h5(self):
56
+ with h5py.File(self.dir_name, 'r') as f:
57
+ print(f"dataset content: {f.keys()}")
58
+ max_num_image = len(f['brightness_temp'])#.shape[0]
59
+ print(f"{max_num_image} images can be loaded")
60
+ field_shape = f['brightness_temp'].shape[1:]
61
+ print(f"field.shape = {field_shape}")
62
+ self.params_keys = list(f['params']['keys'])
63
+ print(f"params keys = {self.params_keys}")
64
+
65
+ if self.idx is None:
66
+ if self.shuffle:
67
+ self.idx = np.sort(random.sample(range(max_num_image), self.num_image))
68
+ print(f"loading {self.num_image} images randomly")
69
+ # print(self.idx)
70
+ else:
71
+ self.idx = range(self.num_image)
72
+ print(f"loading {len(self.idx)} images with idx = {self.idx}")
73
+ else:
74
+ print(f"loading {len(self.idx)} images with idx = {self.idx}")
75
+
76
+ if self.dim == 2:
77
+ self.images = f[self.field][self.idx,0,:self.HII_DIM,-self.num_redshift:][:,None]
78
+ elif self.dim == 3:
79
+ self.images = f[self.field][self.idx,:self.HII_DIM,:self.HII_DIM,-self.num_redshift:][:,None]
80
+ print(f"images loaded:", self.images.shape)
81
+
82
+ self.params = f['params']['values'][self.idx]
83
+ print("params loaded:", self.params.shape)
84
+
85
+ # plt.imshow(self.images[0,0,0])
86
+ # plt.show()
87
+
88
+ def rescale(self, value, to: list):
89
+ # print(np.ndim(value))
90
+ if np.ndim(value)==2:
91
+ # print(f"rescale params of shape {value.shape}")
92
+ ranges = \
93
+ {
94
+ 0: [4, 6], # ION_Tvir_MIN
95
+ 1: [10, 250], # HII_EFF_FACTOR
96
+ }
97
+ # elif np.ndim(value)==5:
98
+ else:
99
+ # value = np.array(value)
100
+ # print(f"rescale images of shape {np.shape(value)}")
101
+ ranges = \
102
+ {
103
+ 0: [0, 80], # brightness_temp
104
+ }
105
+ # print(f"value.min = {value.min()}, value.max = {value.max()}")
106
+ for i in range(np.shape(value)[1]):
107
+ value[:,i] = (value[:,i] - ranges[i][0]) / (ranges[i][1]-ranges[i][0])
108
+ # print(f"value.min = {value.min()}, value.max = {value.max()}")
109
+ value = value * (to[1]-to[0]) + to[0]
110
+ return value
111
+
112
+ def __getitem__(self, index):
113
+ return self.images[index], self.params[index]
114
+
115
+ def __len__(self):
116
+ return self.len
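A minimal usage sketch for Dataset4h5 as defined above; the HDF5 path and batch size are illustrative assumptions:

from torch.utils.data import DataLoader

dataset = Dataset4h5("lightcones.h5", num_image=200, HII_DIM=32, num_redshift=32, dim=2)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

for images, params in loader:
    # images: (16, 1, 32, 32), rescaled to roughly [-1, 1]
    # params: (16, 2), rescaled to [0, 1]
    break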