JBlitzar commited on
Commit
9f5a022
·
1 Parent(s): fc9acd0

ahahahaha it works

Browse files
__pycache__/bert_vectorize.cpython-311.pyc ADDED
Binary file (2.04 kB). View file
 
__pycache__/factories.cpython-311.pyc ADDED
Binary file (20.1 kB). View file
 
__pycache__/logger.cpython-311.pyc ADDED
Binary file (2.14 kB). View file
 
__pycache__/wrapper.cpython-311.pyc ADDED
Binary file (11.1 kB). View file
 
bert_vectorize.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
import torch

# Load DistilBERT once at import time so repeated vectorize calls reuse the
# same tokenizer/model instead of re-downloading or re-initialising them.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# output_hidden_states=True makes the forward pass return activations for
# every layer, not just the final output.
model = DistilBertModel.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
model.eval()  # inference only; disables dropout etc.

# Prefer the Apple-silicon GPU (MPS) when available, otherwise fall back to CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

model = model.to(device)
10
def vectorize_text_with_bert(text):  # from hf docs
    """Encode *text* into a single fixed-size vector.

    Tokenizes with the module-level DistilBERT tokenizer, runs the model
    without gradients, and mean-pools the last hidden layer over the
    sequence dimension. Returns a 1-D tensor (hidden_size,) on `device`.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        output = model(**encoded)
    # hidden_states[-1] is the final transformer layer: (batch, seq, hidden)
    last_layer = output.hidden_states[-1]
    # Average over tokens, then drop the batch axis.
    return last_layer.mean(dim=1).squeeze(0)
20
+
21
if __name__ == "__main__":
    # Quick smoke test: vectorize one caption and show the result.
    text = "A man walking down the street with a dog holding a balloon in one hand."
    text_representation = vectorize_text_with_bert(text)
    print("Vectorized representation:", text_representation)
    print(text_representation.shape)
factories.py CHANGED
@@ -78,7 +78,6 @@ class CrossAttention(nn.Module):
78
 
79
  # Reshape and permute x for multi-head attention
80
  batch_size, channels, height, width = x.size()
81
-
82
  x = x.view(-1, self.channels, self.size * self.size).swapaxes(1,2)
83
  x_ln = self.ln(x)
84
 
@@ -124,7 +123,7 @@ class DoubleConv(nn.Module):
124
 
125
 
126
  class Down(nn.Module):
127
- def __init__(self, in_channels, out_channels, emb_dim=1024):
128
  super().__init__()
129
  self.maxpool_conv = nn.Sequential(
130
  nn.MaxPool2d(2),
@@ -147,7 +146,7 @@ class Down(nn.Module):
147
 
148
 
149
  class Up(nn.Module):
150
- def __init__(self, in_channels, out_channels, emb_dim=1024):
151
  super().__init__()
152
 
153
  self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
@@ -172,63 +171,30 @@ class Up(nn.Module):
172
  return x + emb
173
 
174
 
175
-
176
- class UNet_conditional_large(nn.Module):
177
- def __init__(self, c_in=3, c_out=3, time_dim=1024, num_classes=1024, context_dim=None, device="mps"):
178
  super().__init__()
179
-
180
- if context_dim is None:
181
- context_dim = num_classes
182
  self.device = device
183
  self.time_dim = time_dim
184
-
185
-
186
- start_depth = 128
187
-
188
-
189
- xa_amt_depth = 64 # dont change
190
-
191
- self.inc = DoubleConv(c_in, start_depth)
192
- self.down1 = Down(start_depth, start_depth * 2)
193
-
194
- self.xa1 = CrossAttention(start_depth * 2, xa_amt_depth // 2, context_dim)
195
-
196
- self.down2 = Down(start_depth * 2, start_depth * 4)
197
- self.xa2 = CrossAttention(start_depth * 4, xa_amt_depth // 4, context_dim)
198
-
199
- self.down3 = Down(start_depth * 4, start_depth * 8)
200
- self.xa3 = CrossAttention(start_depth * 8, xa_amt_depth // 8, context_dim)
201
-
202
- self.down4 = Down(start_depth * 8, start_depth * 8)
203
- self.xa4 = CrossAttention(start_depth * 8, xa_amt_depth // 16, context_dim)
204
-
205
- self.bot1 = DoubleConv(start_depth * 8, start_depth * 16)
206
- self.bot2 = DoubleConv(start_depth * 16, start_depth * 16)
207
- self.bot3 = DoubleConv(start_depth * 16, start_depth * 8)
208
-
209
- self.up1 = Up(start_depth * 16, start_depth * 4)
210
- self.xa5 = CrossAttention(start_depth * 4, xa_amt_depth // 8, context_dim)
211
-
212
- self.up2 = Up(start_depth * 8, start_depth * 2)
213
- self.xa6 = CrossAttention(start_depth * 2, xa_amt_depth // 4, context_dim)
214
-
215
- self.up3 = Up(start_depth * 4, start_depth)
216
- self.xa7 = CrossAttention(start_depth, xa_amt_depth // 2, context_dim)
217
-
218
- self.up4 = Up(start_depth * 2, start_depth)
219
- self.xa8 = CrossAttention(start_depth, xa_amt_depth, context_dim)
220
-
221
- self.outc = nn.Conv2d(start_depth, c_out, kernel_size=1)
222
-
223
- if num_classes is not None:
224
- self.label_emb = nn.Linear(num_classes, time_dim)#Embedding(num_classes, time_dim)
225
- self.num_classes = num_classes
226
- if context_dim is None:
227
- context_dim = num_classes
228
-
229
- self.context_dim = context_dim
230
-
231
- self.label_crossattn_emb = nn.Linear(num_classes, context_dim)
232
 
233
  def pos_encoding(self, t, channels):
234
  inv_freq = 1.0 / (
@@ -240,201 +206,34 @@ class UNet_conditional_large(nn.Module):
240
  pos_enc = torch.cat([pos_enc_a, pos_enc_b], dim=-1)
241
  return pos_enc
242
 
243
- def forward(self, x, t, y):
244
  t = t.unsqueeze(-1).type(torch.float)
245
  t = self.pos_encoding(t, self.time_dim)
246
 
247
- if y is not None:
248
-
249
- attn_y = y[:,:self.num_classes]
250
- attn_y = self.label_crossattn_emb(attn_y)
251
-
252
- # y = y[:,:self.num_classes]
253
-
254
- # y = self.label_emb(y)
255
-
256
-
257
- # t += y
258
-
259
  x1 = self.inc(x)
260
-
261
  x2 = self.down1(x1, t)
262
- x2 = self.xa1(x2, attn_y)
263
-
264
-
265
  x3 = self.down2(x2, t)
266
- x3 = self.xa2(x3, attn_y)
267
-
268
-
269
  x4 = self.down3(x3, t)
 
270
 
271
- x4 = self.xa3(x4, attn_y)
272
-
273
-
274
- x5 = self.down4(x4, t)
275
-
276
- x5 = self.xa4(x5, attn_y)
277
-
278
-
279
-
280
- x5 = self.bot1(x5)
281
- x5 = self.bot2(x5)
282
- x5 = self.bot3(x5)
283
-
284
-
285
-
286
- x = self.up1(x5, x4, t)
287
- x = self.xa5(x,attn_y)
288
-
289
-
290
- x = self.up2(x, x3, t)
291
- x = self.xa6(x,attn_y)
292
-
293
- x = self.up3(x, x2, t)
294
- x = self.xa7(x, attn_y)
295
-
296
-
297
- x = self.up4(x, x1, t)
298
- x = self.xa8(x, attn_y)
299
 
 
 
 
 
 
 
300
  output = self.outc(x)
301
  return output
302
 
303
- class UNet_conditional_efficient(nn.Module):
304
- def __init__(self, c_in=3, c_out=3, time_dim=1024, num_classes=1024, context_dim=None, device="mps"):
305
- super().__init__()
306
-
307
- if context_dim is None:
308
- context_dim = num_classes
309
- self.device = device
310
- self.time_dim = time_dim
311
-
312
-
313
- start_depth = 128
314
-
315
-
316
- xa_amt_depth = 64 # dont change
317
-
318
- self.inc = DoubleConv(c_in, start_depth * 2)
319
-
320
- self.downsample = nn.MaxPool2d(2)
321
-
322
-
323
- self.down2 = Down(start_depth * 2, start_depth * 4)
324
- self.xa2 = CrossAttention(start_depth * 4, xa_amt_depth // 4, context_dim)
325
-
326
- self.down3 = Down(start_depth * 4, start_depth * 8)
327
- self.xa3 = CrossAttention(start_depth * 8, xa_amt_depth // 8, context_dim)
328
-
329
- self.down4 = Down(start_depth * 8, start_depth * 8)
330
- self.xa4 = CrossAttention(start_depth * 8, xa_amt_depth // 16, context_dim)
331
-
332
- self.bot1 = DoubleConv(start_depth * 8, start_depth * 16)
333
- self.bot2 = DoubleConv(start_depth * 16, start_depth * 16)
334
- self.bot3 = DoubleConv(start_depth * 16, start_depth * 8)
335
-
336
- self.up1 = Up(start_depth * 16, start_depth * 4)
337
- self.xa5 = CrossAttention(start_depth * 4, xa_amt_depth // 8, context_dim)
338
-
339
- self.up2 = Up(start_depth * 8, start_depth * 2)
340
- self.xa6 = CrossAttention(start_depth * 2, xa_amt_depth // 4, context_dim)
341
 
342
- self.up3 = Up(start_depth * 4, start_depth)
343
- self.xa7 = CrossAttention(start_depth, xa_amt_depth // 2, context_dim)
344
-
345
- self.up4 = Up(start_depth * 2, start_depth)
346
- self.xa8 = CrossAttention(start_depth, xa_amt_depth, context_dim)
347
-
348
- self.upsample = nn.Upsample(scale_factor=2, mode="bilinear")
349
-
350
- self.outc = nn.Conv2d(start_depth, c_out, kernel_size=1)
351
-
352
- if num_classes is not None:
353
- self.label_emb = nn.Linear(num_classes, time_dim)#Embedding(num_classes, time_dim)
354
- self.num_classes = num_classes
355
- if context_dim is None:
356
- context_dim = num_classes
357
-
358
- self.context_dim = context_dim
359
-
360
- self.label_crossattn_emb = nn.Linear(num_classes, context_dim)
361
-
362
- def pos_encoding(self, t, channels):
363
- inv_freq = 1.0 / (
364
- 10000
365
- ** (torch.arange(0, channels, 2, device=self.device).float() / channels)
366
- )
367
- pos_enc_a = torch.sin(t.repeat(1, channels // 2) * inv_freq)
368
- pos_enc_b = torch.cos(t.repeat(1, channels // 2) * inv_freq)
369
- pos_enc = torch.cat([pos_enc_a, pos_enc_b], dim=-1)
370
- return pos_enc
371
-
372
- def forward(self, x, t, y):
373
- t = t.unsqueeze(-1).type(torch.float)
374
- t = self.pos_encoding(t, self.time_dim)
375
-
376
- if y is not None:
377
-
378
- attn_y = y[:,:self.num_classes]
379
- attn_y = self.label_crossattn_emb(attn_y)
380
-
381
- # y = y[:,:self.num_classes]
382
-
383
- # y = self.label_emb(y)
384
-
385
-
386
- # t += y
387
-
388
- x1 = self.inc(x)
389
-
390
- x2 = self.downsample(x1)
391
-
392
-
393
-
394
-
395
-
396
-
397
- x3 = self.down2(x2, t)
398
- x3 = self.xa2(x3, attn_y)
399
-
400
-
401
- x4 = self.down3(x3, t)
402
-
403
- x4 = self.xa3(x4, attn_y)
404
-
405
-
406
- x5 = self.down4(x4, t)
407
-
408
- x5 = self.xa4(x5, attn_y)
409
-
410
-
411
-
412
- x5 = self.bot1(x5)
413
- x5 = self.bot2(x5)
414
- x5 = self.bot3(x5)
415
-
416
-
417
-
418
- x = self.up1(x5, x4, t)
419
- x = self.xa5(x,attn_y)
420
-
421
-
422
- x = self.up2(x, x3, t)
423
- x = self.xa6(x,attn_y)
424
-
425
- x = self.up3(x, x2, t)
426
- x = self.xa7(x, attn_y)
427
-
428
-
429
-
430
-
431
-
432
- x = self.upsample(x)
433
- output = self.outc(x)
434
- return output
435
-
436
- class UNet_conditional_start_depth(nn.Module):
437
- def __init__(self, c_in=3, c_out=3, time_dim=1024, num_classes=None, context_dim=None, device="mps"):
438
  super().__init__()
439
 
440
  if context_dim is None:
@@ -443,36 +242,31 @@ class UNet_conditional_start_depth(nn.Module):
443
  self.time_dim = time_dim
444
 
445
 
446
- start_depth = 128
447
- xa_amt_depth = 64
448
-
449
- self.inc = DoubleConv(c_in, start_depth)
450
-
451
- self.down1 = Down(start_depth, start_depth * 2)
452
- self.xa1 = CrossAttention(start_depth * 2, xa_amt_depth // 2, context_dim)
453
-
454
- self.down2 = Down(start_depth * 2, start_depth * 4)
455
- self.xa2 = CrossAttention(start_depth * 4, xa_amt_depth // 4, context_dim)
456
-
457
- self.down3 = Down(start_depth * 4, start_depth * 4)
458
- self.xa3 = CrossAttention(start_depth * 4, xa_amt_depth // 8, context_dim)
459
-
460
-
461
- self.bot1 = DoubleConv(start_depth * 4, start_depth * 8)
462
- self.bot2 = DoubleConv(start_depth * 8, start_depth * 8)
463
- self.bot3 = DoubleConv(start_depth * 8, start_depth * 4)
464
-
465
- self.up1 = Up(start_depth * 8, start_depth * 2)
466
- self.xa4 = CrossAttention(start_depth * 2, xa_amt_depth // 4, context_dim)
467
-
468
- self.up2 = Up(start_depth * 4, start_depth)
469
- self.xa5 = CrossAttention(start_depth, xa_amt_depth // 2, context_dim)
470
-
471
- self.up3 = Up(start_depth * 2, start_depth)
472
- self.xa6 = CrossAttention(start_depth, xa_amt_depth, context_dim)
473
-
474
- self.outc = nn.Conv2d(start_depth, c_out, kernel_size=1)
475
-
476
 
477
  if num_classes is not None:
478
  self.label_emb = nn.Linear(num_classes, time_dim)#Embedding(num_classes, time_dim)
@@ -547,19 +341,3 @@ class UNet_conditional_start_depth(nn.Module):
547
 
548
  #output = F.sigmoid(x)
549
  return output
550
-
551
-
552
- if __name__ == "__main__":
553
- net = UNet_conditional_start_depth(num_classes=1024).to("mps")
554
-
555
- def count_parameters(model):
556
- return torch.tensor([p.numel() for p in model.parameters() if p.requires_grad]).sum().item()
557
- print(f"Parameters: {count_parameters(net)}")
558
-
559
- minibatch = torch.randn((1,3,64,64)).to("mps")
560
-
561
- o = net(minibatch, torch.randint(low=1, high=1000, size=(1,)).to("mps"), torch.randn((1,1024)).to("mps"))
562
-
563
- print(o.size())
564
-
565
-
 
78
 
79
  # Reshape and permute x for multi-head attention
80
  batch_size, channels, height, width = x.size()
 
81
  x = x.view(-1, self.channels, self.size * self.size).swapaxes(1,2)
82
  x_ln = self.ln(x)
83
 
 
123
 
124
 
125
  class Down(nn.Module):
126
+ def __init__(self, in_channels, out_channels, emb_dim=256):
127
  super().__init__()
128
  self.maxpool_conv = nn.Sequential(
129
  nn.MaxPool2d(2),
 
146
 
147
 
148
  class Up(nn.Module):
149
+ def __init__(self, in_channels, out_channels, emb_dim=256):
150
  super().__init__()
151
 
152
  self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
 
171
  return x + emb
172
 
173
 
174
+ class Dome_UNet(nn.Module):
175
+ def __init__(self, c_in=3, c_out=3, time_dim=256, device="mps"):
 
176
  super().__init__()
 
 
 
177
  self.device = device
178
  self.time_dim = time_dim
179
+ self.inc = DoubleConv(c_in, 64)
180
+ self.down1 = Down(64, 128)
181
+ self.sa1 = SelfAttention(128, 32)
182
+ self.down2 = Down(128, 256)
183
+ self.sa2 = SelfAttention(256, 16)
184
+ self.down3 = Down(256, 256)
185
+ self.sa3 = SelfAttention(256, 8)
186
+
187
+ self.bot1 = DoubleConv(256, 512)
188
+ self.bot2 = DoubleConv(512, 512)
189
+ self.bot3 = DoubleConv(512, 256)
190
+
191
+ self.up1 = Up(512, 128)
192
+ self.sa4 = SelfAttention(128, 16)
193
+ self.up2 = Up(256, 64)
194
+ self.sa5 = SelfAttention(64, 32)
195
+ self.up3 = Up(128, 64)
196
+ self.sa6 = SelfAttention(64, 64)
197
+ self.outc = nn.Conv2d(64, c_out, kernel_size=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  def pos_encoding(self, t, channels):
200
  inv_freq = 1.0 / (
 
206
  pos_enc = torch.cat([pos_enc_a, pos_enc_b], dim=-1)
207
  return pos_enc
208
 
209
+ def forward(self, x, t):
210
  t = t.unsqueeze(-1).type(torch.float)
211
  t = self.pos_encoding(t, self.time_dim)
212
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  x1 = self.inc(x)
 
214
  x2 = self.down1(x1, t)
215
+ x2 = self.sa1(x2)
 
 
216
  x3 = self.down2(x2, t)
217
+ x3 = self.sa2(x3)
 
 
218
  x4 = self.down3(x3, t)
219
+ x4 = self.sa3(x4)
220
 
221
+ x4 = self.bot1(x4)
222
+ x4 = self.bot2(x4)
223
+ x4 = self.bot3(x4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ x = self.up1(x4, x3, t)
226
+ x = self.sa4(x)
227
+ x = self.up2(x, x2, t)
228
+ x = self.sa5(x)
229
+ x = self.up3(x, x1, t)
230
+ x = self.sa6(x)
231
  output = self.outc(x)
232
  return output
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
+ class UNet_conditional(nn.Module):
236
+ def __init__(self, c_in=3, c_out=3, time_dim=256, num_classes=None, context_dim=None, device="mps"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  super().__init__()
238
 
239
  if context_dim is None:
 
242
  self.time_dim = time_dim
243
 
244
 
245
+ self.inc = DoubleConv(c_in, 64)
246
+ self.down1 = Down(64, 128)
247
+ self.sa1 = SelfAttention(128, 32)
248
+ self.xa1 = CrossAttention(128, 32, context_dim)
249
+ self.down2 = Down(128, 256)
250
+ self.xa2 = CrossAttention(256, 16, context_dim)
251
+ self.sa2 = SelfAttention(256, 16)
252
+ self.down3 = Down(256, 256)
253
+ self.xa3 = CrossAttention(256, 8, context_dim)
254
+ self.sa3 = SelfAttention(256, 8)
255
+
256
+ self.bot1 = DoubleConv(256, 512)
257
+ self.bot2 = DoubleConv(512, 512)
258
+ self.bot3 = DoubleConv(512, 256)
259
+
260
+ self.up1 = Up(512, 128)
261
+ self.xa4 = CrossAttention(128, 16, context_dim)
262
+ self.sa4 = SelfAttention(128, 16)
263
+ self.up2 = Up(256, 64)
264
+ self.xa5 = CrossAttention(64, 32, context_dim)
265
+ self.sa5 = SelfAttention(64, 32)
266
+ self.up3 = Up(128, 64)
267
+ self.xa6 = CrossAttention(64, 64, context_dim)
268
+ self.sa6 = SelfAttention(64, 64)
269
+ self.outc = nn.Conv2d(64, c_out, kernel_size=1)
 
 
 
 
 
270
 
271
  if num_classes is not None:
272
  self.label_emb = nn.Linear(num_classes, time_dim)#Embedding(num_classes, time_dim)
 
341
 
342
  #output = F.sigmoid(x)
343
  return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logger.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

# Module-level TensorBoard writer shared by the helpers below.
# Stays None until init_logger() is called.
writer = None
6
def log_data(data, i):
    """Log every key/value pair in *data* as a TensorBoard scalar at step *i*.

    Requires init_logger() to have been called first.
    """
    for tag, value in data.items():
        writer.add_scalar(tag, value, i)
11
+
12
def log_img(img, name):
    """Log a single image tensor to TensorBoard under *name*.

    Requires init_logger() to have been called first.
    """
    writer.add_image(name, img)
14
+
15
+
16
def save_grid_with_label(img_grid, label, out_file):
    """Render a CHW image-grid tensor with *label* as its title and save to *out_file*.

    The input is permuted to HWC for matplotlib. All figures are closed
    afterwards so repeated calls do not leak memory.
    """
    grid_hwc = img_grid.permute(1, 2, 0).numpy()

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(grid_hwc)
    ax.set_title(label, fontsize=20)
    ax.axis('off')

    # Leave headroom so the title is not clipped.
    plt.subplots_adjust(top=0.85)

    plt.savefig(out_file, bbox_inches='tight', pad_inches=0.1)

    # Close this figure and any strays left by earlier calls.
    plt.close(fig)
    plt.close("all")
32
+
33
+
34
+
35
+
36
def init_logger(dir="runs"):
    """Create the module-level SummaryWriter once; subsequent calls are no-ops."""
    global writer
    if writer is None:
        writer = SummaryWriter(dir)
runs/run_3_jxa/ckpt/latest.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd39e8429ea0ace24bb40d4bd404baebb8aae471385987b898a966eb79dcc5f
3
+ size 103503678
wrapper.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from enum import Enum
4
+ from tqdm import trange
5
+
6
+
7
+
8
+
9
+
10
# Noise-schedule variants supported by DiffusionManager.
Schedule = Enum('Schedule', ['LINEAR', 'COSINE'])

class DiffusionManager(nn.Module):
    """DDPM-style diffusion wrapper around a noise-prediction model.

    Owns the beta noise schedule and drives the forward (noising) process,
    the reverse (sampling) process, and single training steps. The wrapped
    model is called as ``model(x, t, condition)`` and must return a tensor
    shaped like ``x`` (the predicted noise).
    """

    def __init__(self, model: nn.Module, noise_steps=1000, start=0.0001, end=0.02, device="cpu", **kwargs) -> None:
        """
        Args:
            model: noise-prediction network, called as model(x, t, condition).
            noise_steps: number of diffusion timesteps T.
            start, end: endpoints of the linear beta schedule. `start` is
                also reused as the offset `s` in the cosine schedule.
            device: device on which the schedule and all tensors live.
        """
        super().__init__(**kwargs)

        self.model = model
        self.noise_steps = noise_steps
        self.start = start
        self.end = end
        self.device = device

        self.schedule = None
        self.set_schedule()

    def _get_schedule(self, schedule_type: Schedule = Schedule.LINEAR):
        """Return the per-step beta tensor for the requested schedule type."""
        if schedule_type == Schedule.LINEAR:
            return torch.linspace(self.start, self.end, self.noise_steps)
        elif schedule_type == Schedule.COSINE:
            # https://arxiv.org/pdf/2102.09672 page 4
            # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
            # line 18
            def get_alphahat_at(t):
                def f(t):
                    s = self.start
                    return torch.cos((t / self.noise_steps + s) / (1 + s) * torch.pi / 2) ** 2
                return f(t) / f(torch.zeros_like(t))

            t = torch.Tensor(range(self.noise_steps))
            t = 1 - (get_alphahat_at(t + 1) / get_alphahat_at(t))
            # "In practice, we clip β_t to be no larger than 0.999 to prevent
            # singularities at the end of the diffusion process"
            t = torch.minimum(t, torch.ones_like(t) * 0.999)
            return t

    def set_schedule(self, schedule: Schedule = Schedule.LINEAR):
        """(Re)build beta and cache alpha / cumulative alpha on the target device.

        Caching here removes the per-query torch.cumprod that get_schedule_at
        previously recomputed on every call.
        """
        self.schedule = self._get_schedule(schedule).to(self.device)
        self._alpha = 1 - self.schedule
        self._alpha_hat = torch.cumprod(self._alpha, dim=0)

    def get_schedule_at(self, step):
        """Return (beta, alpha, alpha_hat) at *step* (int or index tensor).

        Each value is reshaped to (-1, 1, 1, 1) so it broadcasts over an
        image batch.
        """
        beta = self.schedule
        return (
            self._unsqueezify(beta.data[step]),
            self._unsqueezify(self._alpha.data[step]),
            self._unsqueezify(self._alpha_hat.data[step]),
        )

    @staticmethod
    def _unsqueezify(value):
        # Per-step scalars -> (N, 1, 1, 1) for broadcasting over NCHW batches.
        return value.view(-1, 1, 1, 1)

    def noise_image(self, image, step):
        """Closed-form forward process q(x_t | x_0).

        Returns (noised image, the epsilon that was injected) — the pair the
        training loss compares against the model's prediction.
        """
        image = image.to(self.device)
        beta, alpha, alpha_hat = self.get_schedule_at(step)
        epsilon = torch.randn_like(image)
        noised_img = torch.sqrt(alpha_hat) * image + torch.sqrt(1 - alpha_hat) * epsilon
        return noised_img, epsilon

    def random_timesteps(self, amt=1):
        """Sample *amt* uniform timesteps in [1, noise_steps)."""
        return torch.randint(low=1, high=self.noise_steps, size=(amt,))

    def sample(self, img_size, condition, amt=5, use_tqdm=True):
        """Generate *amt* images by running the reverse process from pure noise.

        A single condition row is broadcast to the whole batch. The model is
        put in eval mode for sampling and restored to train mode afterwards.
        """
        if tuple(condition.shape)[0] < amt:
            condition = condition.repeat(amt, 1)

        self.model.eval()
        condition = condition.to(self.device)

        my_trange = lambda x, y, z: trange(x, y, z, leave=False, dynamic_ncols=True)
        fn = my_trange if use_tqdm else range
        with torch.no_grad():
            cur_img = torch.randn((amt, 3, img_size, img_size)).to(self.device)
            for i in fn(self.noise_steps - 1, 0, -1):
                timestep = torch.ones(amt) * (i)
                timestep = timestep.to(self.device)

                predicted_noise = self.model(cur_img, timestep, condition)
                beta, alpha, alpha_hat = self.get_schedule_at(i)

                cur_img = (1 / torch.sqrt(alpha)) * (cur_img - (beta / torch.sqrt(1 - alpha_hat)) * predicted_noise)
                if i > 1:
                    # Re-inject noise at every step except the last.
                    cur_img = cur_img + torch.sqrt(beta) * torch.randn_like(cur_img)

        self.model.train()
        return cur_img

    def sample_multicond(self, img_size, condition, use_tqdm=True):
        """Reverse-process sampling producing exactly one image per condition row."""
        num_conditions = condition.shape[0]
        amt = num_conditions

        self.model.eval()
        condition = condition.to(self.device)

        my_trange = lambda x, y, z: trange(x, y, z, leave=False, dynamic_ncols=True)
        fn = my_trange if use_tqdm else range

        with torch.no_grad():
            cur_img = torch.randn((amt, 3, img_size, img_size)).to(self.device)
            for i in fn(self.noise_steps - 1, 0, -1):
                timestep = torch.ones(amt) * i
                timestep = timestep.to(self.device)

                predicted_noise = self.model(cur_img, timestep, condition)
                beta, alpha, alpha_hat = self.get_schedule_at(i)

                cur_img = (1 / torch.sqrt(alpha)) * (cur_img - (beta / torch.sqrt(1 - alpha_hat)) * predicted_noise)
                if i > 1:
                    cur_img = cur_img + torch.sqrt(beta) * torch.randn_like(cur_img)

        self.model.train()

        # Return images sampled for each condition
        return cur_img

    def training_loop_iteration(self, optimizer, batch, label, criterion):
        """One training step: noise a batch, predict the noise, step the optimizer.

        Returns the scalar loss value. NaN checks print loudly instead of
        raising so a long run is not killed by one bad batch.
        """
        def print_(string):
            # Repeat so the warning is hard to miss in a busy training log.
            for i in range(10):
                print(string)

        batch = batch.to(self.device)

        # label = label.long()  # uncomment for nn.Embedding
        label = label.to(self.device)

        timesteps = self.random_timesteps(batch.shape[0]).to(self.device)

        noisy_batch, real_noise = self.noise_image(batch, timesteps)

        if torch.isnan(noisy_batch).any() or torch.isnan(real_noise).any():
            print_("NaNs detected in the noisy batch or real noise")

        pred_noise = self.model(noisy_batch, timesteps, label)

        if torch.isnan(pred_noise).any():
            print_("NaNs detected in the predicted noise")

        loss = criterion(real_noise, pred_noise)

        if torch.isnan(loss).any():
            print_("NaNs detected in the loss")

        # BUG FIX: gradients were never cleared, so they accumulated across
        # iterations and corrupted every update after the first.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item()