SequentialLearning
/

SuperLinear

mixture-of-experts

Model card Files Files and versions

razmars committed on Apr 27, 2025

Commit

25927a3

·

verified ·

1 Parent(s): 03ea7d0

Update modeling_super_linear.py

Files changed (1) hide show

modeling_super_linear.py +24 -2

modeling_super_linear.py CHANGED Viewed

@@ -208,9 +208,31 @@ class RLinear(nn.Module):
             final_scaling  = original_norm / new_norm if new_norm.item() != 0 else 1.0
             #final_scaling  = 1
             new_W          = new_W * final_scaling
-        self.zero_shot_Linear        = new_W
     def forward(self, x):
         # x: [Batch, Input length,Channel]
@@ -218,7 +240,7 @@ class RLinear(nn.Module):
         if x.shape[1] < self.seq_len:
             if self.zero_shot_Linear is None:
                 #print(F"new Lookkback : {x.shape[1]}")
-                self.transform_model(x.shape[1],2)
             x = x.clone()
             #x = x * (x.shape[1]/512)

             final_scaling  = original_norm / new_norm if new_norm.item() != 0 else 1.0
             #final_scaling  = 1
             new_W          = new_W * final_scaling
+            self.zero_shot_Linear        = new_W
+        else:
+            W = self.Linear.weight.detach()
+            target_indices = torch.linspace(0, self.seq_len - 1, steps=new_lookback, device=W.device)
+            source_indices = torch.arange(0, self.seq_len, device=W.device).float()
+            # Initialize the new weight matrix
+            new_W = torch.zeros((W.size(0), new_lookback), device=W.device)
+            # Linear interpolation for each row
+            for i in range(W.size(0)):
+                new_W[i] = torch.tensor([torch.sum(W[i] * (1 - torch.abs(idx - source_indices) / self.seq_len).clamp(min=0))
+                                        for idx in target_indices], device=W.device)
+            # Maintain the same norm as the original weights
+            original_norm = torch.norm(W, p=2)
+            new_norm = torch.norm(new_W, p=2)
+            final_scaling = original_norm / new_norm if new_norm.item() != 0 else 1.0
+            new_W = new_W * final_scaling
+            self.zero_shot_Linear = new_W
     def forward(self, x):
         # x: [Batch, Input length,Channel]
         if x.shape[1] < self.seq_len:
             if self.zero_shot_Linear is None:
                 #print(F"new Lookkback : {x.shape[1]}")
+                self.transform_model(x.shape[1],3)
             x = x.clone()
             #x = x * (x.shape[1]/512)