Phase 7: Curriculum Learning (20K steps, BPC 1.78)
Browse files
- src/models/layers.py (+6 −6)
src/models/layers.py
CHANGED
|
@@ -1,15 +1,14 @@
|
|
| 1 |
## Developer: inkbytefo
|
| 2 |
-
## Modified: 2025-11-
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import torch.nn as nn
|
| 6 |
import torch.nn.functional as F
|
| 7 |
-
from .memory import HebbianMemory
|
| 8 |
|
| 9 |
class SlidingWindowAttention(nn.Module):
|
| 10 |
"""
|
| 11 |
Local Attention mechanism restricted to a sliding window.
|
| 12 |
-
Using standard SDPA for stability.
|
| 13 |
"""
|
| 14 |
def __init__(self, d_model: int, num_heads: int, window_size: int):
|
| 15 |
super().__init__()
|
|
@@ -57,7 +56,8 @@ class HybridBlock(nn.Module):
|
|
| 57 |
# Local Precision
|
| 58 |
self.attn = SlidingWindowAttention(d_model, num_heads, window_size)
|
| 59 |
|
| 60 |
-
# Global Context (Hebbian Memory)
|
|
|
|
| 61 |
self.memory = HebbianMemory(d_model, num_heads, dropout)
|
| 62 |
|
| 63 |
self.out_proj = nn.Linear(d_model, d_model)
|
|
@@ -74,9 +74,9 @@ class HybridBlock(nn.Module):
|
|
| 74 |
residual = x
|
| 75 |
x_norm = self.norm1(x)
|
| 76 |
|
| 77 |
-
# Parallel Branches
|
| 78 |
attn_out = self.attn(x_norm)
|
| 79 |
-
memory_out = self.memory(x_norm)
|
| 80 |
|
| 81 |
# Fusion
|
| 82 |
x = residual + self.out_proj(attn_out + memory_out)
|
|
|
|
| 1 |
## Developer: inkbytefo
|
| 2 |
+
## Modified: 2025-11-23
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import torch.nn as nn
|
| 6 |
import torch.nn.functional as F
|
| 7 |
+
from .memory import HebbianMemory
|
| 8 |
|
| 9 |
class SlidingWindowAttention(nn.Module):
|
| 10 |
"""
|
| 11 |
Local Attention mechanism restricted to a sliding window.
|
|
|
|
| 12 |
"""
|
| 13 |
def __init__(self, d_model: int, num_heads: int, window_size: int):
|
| 14 |
super().__init__()
|
|
|
|
| 56 |
# Local Precision
|
| 57 |
self.attn = SlidingWindowAttention(d_model, num_heads, window_size)
|
| 58 |
|
| 59 |
+
# Global Context (Hebbian Memory)
|
| 60 |
+
# Replaces the static LinearAttention with dynamic Fast Weights
|
| 61 |
self.memory = HebbianMemory(d_model, num_heads, dropout)
|
| 62 |
|
| 63 |
self.out_proj = nn.Linear(d_model, d_model)
|
|
|
|
| 74 |
residual = x
|
| 75 |
x_norm = self.norm1(x)
|
| 76 |
|
| 77 |
+
# Parallel Branches: Local Attention + Global Hebbian Memory
|
| 78 |
attn_out = self.attn(x_norm)
|
| 79 |
+
memory_out = self.memory(x_norm)
|
| 80 |
|
| 81 |
# Fusion
|
| 82 |
x = residual + self.out_proj(attn_out + memory_out)
|