tefoteknik committed on
Commit
98c160d
·
verified ·
1 Parent(s): 40c390b

Phase 7: Curriculum Learning (20K steps, BPC 1.78)

Browse files
Files changed (1) hide show
  1. src/models/layers.py +6 -6
src/models/layers.py CHANGED
@@ -1,15 +1,14 @@
1
  ## Developer: inkbytefo
2
- ## Modified: 2025-11-22
3
 
4
  import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
- from .memory import HebbianMemory # NEW IMPORT
8
 
9
  class SlidingWindowAttention(nn.Module):
10
  """
11
  Local Attention mechanism restricted to a sliding window.
12
- Using standard SDPA for stability.
13
  """
14
  def __init__(self, d_model: int, num_heads: int, window_size: int):
15
  super().__init__()
@@ -57,7 +56,8 @@ class HybridBlock(nn.Module):
57
  # Local Precision
58
  self.attn = SlidingWindowAttention(d_model, num_heads, window_size)
59
 
60
- # Global Context (Hebbian Memory) - Replaces LinearAttention
 
61
  self.memory = HebbianMemory(d_model, num_heads, dropout)
62
 
63
  self.out_proj = nn.Linear(d_model, d_model)
@@ -74,9 +74,9 @@ class HybridBlock(nn.Module):
74
  residual = x
75
  x_norm = self.norm1(x)
76
 
77
- # Parallel Branches
78
  attn_out = self.attn(x_norm)
79
- memory_out = self.memory(x_norm) # Using Hebbian Memory
80
 
81
  # Fusion
82
  x = residual + self.out_proj(attn_out + memory_out)
 
1
  ## Developer: inkbytefo
2
+ ## Modified: 2025-11-23
3
 
4
  import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
+ from .memory import HebbianMemory
8
 
9
  class SlidingWindowAttention(nn.Module):
10
  """
11
  Local Attention mechanism restricted to a sliding window.
 
12
  """
13
  def __init__(self, d_model: int, num_heads: int, window_size: int):
14
  super().__init__()
 
56
  # Local Precision
57
  self.attn = SlidingWindowAttention(d_model, num_heads, window_size)
58
 
59
+ # Global Context (Hebbian Memory)
60
+ # Replaces the static LinearAttention with dynamic Fast Weights
61
  self.memory = HebbianMemory(d_model, num_heads, dropout)
62
 
63
  self.out_proj = nn.Linear(d_model, d_model)
 
74
  residual = x
75
  x_norm = self.norm1(x)
76
 
77
+ # Parallel Branches: Local Attention + Global Hebbian Memory
78
  attn_out = self.attn(x_norm)
79
+ memory_out = self.memory(x_norm)
80
 
81
  # Fusion
82
  x = residual + self.out_proj(attn_out + memory_out)