SequentialLearning
/

SuperLinear

@@ -371,71 +371,6 @@ class SparseNoisyMoE(nn.Module):
         load_balancing_loss = self.calculate_load_balancing_loss(self.gate_outputs, batch_size)
-        expert_probs = F.softmax(self.gate_outputs, dim=1)
-        expert_probs = expert_probs[1,:]
-        # Plot the expert probabilities
-        import matplotlib.pyplot as plt
-        # Get expert probabilities and convert to numpy
-        probs_np = expert_probs.detach().cpu().numpy()
-        # Create a nicer figure with a modern style
-        plt.style.use('ggplot')
-        plt.figure(figsize=(12, 8), dpi=120)
-        plt.subplot(111)
-        ax = plt.subplot(111)
-        # Create color gradient based on probability values
-        colors = plt.cm.viridis(probs_np)
-        # Plot bars with more attractive styling
-        bars = plt.bar(range(len(probs_np)), probs_np, color=colors, width=0.6,
-                       edgecolor='black', linewidth=0.5, alpha=0.85)
-        # Add value annotations on top of each bar
-        for i, bar in enumerate(bars):
-            height = bar.get_height()
-            plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
-                     f'{height:.3f}', ha='center', va='bottom', fontsize=9,
-                     rotation=0, fontweight='bold')
-        # Add expert names to x-axis
-        if hasattr(self, 'experts') and isinstance(getattr(self, 'experts', None), dict):
-            # If experts are stored in a dictionary with meaningful keys
-            expert_names = list(self.experts.keys())
-            plt.xticks(range(len(probs_np)), expert_names, rotation=45, ha='right')
-        else:
-            # Default numbering if expert names aren't available
-            plt.xticks(range(len(probs_np)), [f'Expert {i}' for i in range(len(probs_np))])
-        # Add grid for better readability
-        plt.grid(axis='y', linestyle='--', alpha=0.7)
-        # Add timestamp to title
-        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        plt.title(f'Expert Selection Probabilities\n{timestamp}', fontsize=14, fontweight='bold')
-        plt.xlabel('Expert Models', fontsize=12)
-        plt.ylabel('Selection Probability', fontsize=12)
-        # Highlight the most probable expert
-        max_idx = np.argmax(probs_np)
-        bars[max_idx].set_color('orangered')
-        bars[max_idx].set_edgecolor('black')
-        bars[max_idx].set_linewidth(1.5)
-        # Add stats in a text box
-        textstr = f'Max: {probs_np.max():.4f} (Expert {max_idx})\n'
-        textstr += f'Min: {probs_np.min():.4f}\n'
-        textstr += f'Mean: {probs_np.mean():.4f}'
-        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
-        plt.text(0.02, 0.97, textstr, transform=plt.gca().transAxes, fontsize=10,
-                 verticalalignment='top', bbox=props)
-        plt.tight_layout()
-        plt.savefig(F"expert_probabilities_{self.i}.png", bbox_inches='tight')
-        self.i+=1
-        plt.close()
-        print(expert_probs.shape)
         if get_prob:
             expert_probs = F.softmax(self.gate_outputs, dim=1)
             print(expert_probs.shape)
@@ -684,9 +619,6 @@ class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
         # 3. Inverse FFT to the shorter grid
         y = torch.fft.irfft(X_crop, n=target_len, dim=1)
-        # 4. Renormalise amplitudes:
-        #    irfft divides by `target_len`, whereas the forward rfft used length `L`.
-        #    Multiply by (target_len / L) so DC and low-freq amplitudes match input.
         return y

         load_balancing_loss = self.calculate_load_balancing_loss(self.gate_outputs, batch_size)
         if get_prob:
             expert_probs = F.softmax(self.gate_outputs, dim=1)
             print(expert_probs.shape)
         # 3. Inverse FFT to the shorter grid
         y = torch.fft.irfft(X_crop, n=target_len, dim=1)
         return y