Update modeling_auristream.py
modeling_auristream.py  +40 -11  CHANGED
@@ -88,6 +88,9 @@ class AuriStream(PreTrainedModel):
             x = self.transformer.drop(tok_emb + pos_emb)
         else:
             x = self.transformer.drop(tok_emb)
+
+        if self.dwa is not None:
+            x = self.dwa.init_accumulators(x)
 
         all_hidden_states = []
         for block_idx, block in enumerate(self.transformer.h):

@@ -97,6 +100,9 @@ class AuriStream(PreTrainedModel):
                 break
             x = block(x)
 
+            if self.dwa is not None:
+                x = self.dwa(x)
+
             # append the last hidden state if we did not exit early
             if up_until_layer is None or block_idx == len(self.transformer.h) - 1:
                 all_hidden_states.append(x)
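Both hunks above guard on self.dwa, which is not created anywhere in this commit. A minimal sketch of how the attribute might be wired into the constructor, assuming a hypothetical config flag; every name below except DWA (the class added at the end of this file) is an assumption, not something this diff shows.

import torch.nn as nn

class AuriStreamInitSketch(nn.Module):
    # Hypothetical skeleton: only illustrates where self.dwa could be created.
    def __init__(self, config):
        super().__init__()
        # ... embeddings, dropout and the transformer blocks are built here ...
        # The forward pass accumulates one representation per block plus the
        # embedding output, so the alpha matrix needs n_layer + 1 slots
        # (an inference from the hunks above, not from this commit).
        self.dwa = DWA(config.n_layer + 1) if getattr(config, "use_dwa", False) else None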
@@ -530,18 +536,18 @@ class MLP(nn.Module):
 
 
 class Rotary(torch.nn.Module):
-    def __init__(self, dim, base=
+    def __init__(self, dim, base=10000, learned=False):
         super().__init__()
         # Compute the base inverse frequencies as before.
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
-        # If learned is True, register as a parameter; otherwise, as a buffer.
-        if learned:
-            # Initialize randomly and register as a parameter.
-            self.inv_freq = torch.nn.Parameter(inv_freq)
-            nn.init.normal_(self.inv_freq, mean=0.0, std=0.02)
-        else:
-            self.register_buffer("inv_freq", inv_freq)
-        self.learned = learned # (optional) Save the flag if needed later
+        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        # # If learned is True, register as a parameter; otherwise, as a buffer.
+        # if learned:
+        #     # Initialize randomly and register as a parameter.
+        #     self.inv_freq = torch.nn.Parameter(inv_freq)
+        #     nn.init.normal_(self.inv_freq, mean=0.0, std=0.02)
+        # else:
+        #     self.register_buffer("inv_freq", inv_freq)
+        # self.learned = learned # (optional) Save the flag if needed later
 
     def forward(self, x):
         seq_len = x.shape[1]
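One consequence of the new constructor worth noting: self.inv_freq is now a plain tensor attribute, with the register_buffer / Parameter paths commented out. Plain attributes are not tracked by nn.Module, so they do not appear in state_dict() and are not moved by .to() / .cuda(). A small, self-contained illustration of that general PyTorch behaviour (not AuriStream code):

import torch
import torch.nn as nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.ones(4)
        self.plain = inv_freq                               # plain attribute, untracked
        self.register_buffer("tracked", inv_freq.clone())   # registered buffer, tracked

m = Demo()
print("plain" in m.state_dict())    # False: plain attributes are not serialized
print("tracked" in m.state_dict())  # True: buffers are saved and restored

if torch.cuda.is_available():
    m = m.cuda()
    print(m.plain.device)    # still cpu: .cuda() ignores plain attributes
    print(m.tracked.device)  # cuda:0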
@@ -553,6 +559,7 @@ class Rotary(torch.nn.Module):
         sin_cached = freqs.sin()
         return cos_cached[None, :, None, :], sin_cached[None, :, None, :]
 
+
 def apply_rotary_emb(x, cos, sin):
     assert x.ndim == 4 # multihead attention expected
     d = x.shape[3] // 2
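For orientation, the cos/sin tables returned above have shape (1, seq_len, 1, dim/2), which is meant to broadcast against a (batch, seq_len, n_head, head_dim) input whose last dimension apply_rotary_emb splits in half (d = x.shape[3] // 2). A shape-only sketch with random stand-ins for the cached tables, assuming that layout:

import torch

B, T, H, D = 2, 16, 4, 64
x = torch.randn(B, T, H, D)          # (batch, seq, heads, head_dim), ndim == 4
cos = torch.randn(1, T, 1, D // 2)   # stand-in for cos_cached[None, :, None, :]
sin = torch.randn(1, T, 1, D // 2)   # stand-in for sin_cached[None, :, None, :]

x1, x2 = x[..., : D // 2], x[..., D // 2 :]   # split the head dimension in half
print((x1 * cos).shape, (x2 * sin).shape)     # both (2, 16, 4, 32): broadcasting lines up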
@@ -577,4 +584,26 @@ class RMSNorm(nn.Module):
         output = self._norm(x.float()).type_as(x)
         if self.weight is not None:
             return output * self.weight
-        return output
+        return output
+
+
+class DWA(nn.Module):
+    """ Depth Weighted Average layer that averages representations across the layers of a transformer """
+    """ From: https://arxiv.org/pdf/2402.02622"""
+
+    def __init__(self, n_layers: int):
+        super().__init__()
+        self.alphas = nn.Parameter(torch.zeros(n_layers, n_layers))
+        self.alphas.data = torch.eye(n_layers)
+        self.accumulators = []
+
+    def init_accumulators(self, x):
+        self.accumulators = [x]
+        return x * self.alphas[0, 0]
+
+    def forward(self, x):
+        self.accumulators.append(x)
+        x = 0.0
+        for i in range(len(self.accumulators)):
+            x = x + self.alphas[i, len(self.accumulators)-1] * self.accumulators[i]
+        return x
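In the new forward pass, DWA re-weights every representation seen so far: after block k the hidden state becomes sum_i alphas[i, k] * h_i, where h_0 is the (dropped-out) embedding output and h_i is the output of block i. Because alphas is initialized to the identity matrix, the module starts out as a no-op, so the model is initially equivalent to one without DWA. A quick, self-contained sanity check of that property, assuming the DWA class above is in scope and using toy Linear layers as stand-in blocks:

import torch
import torch.nn as nn

torch.manual_seed(0)
n_blocks, batch, seq, dim = 4, 2, 5, 8
dwa = DWA(n_layers=n_blocks + 1)      # one slot per block + one for the embedding output
blocks = [nn.Linear(dim, dim) for _ in range(n_blocks)]

x = torch.randn(batch, seq, dim)
ref = x.clone()                        # baseline path without DWA

x = dwa.init_accumulators(x)           # seeds the running list of layer outputs
for block in blocks:
    x = block(x)
    x = dwa(x)                         # re-weights all outputs seen so far
    ref = block(ref)

print(torch.allclose(x, ref))          # True: identity alphas make DWA a no-op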
|