Upload 14 files

Browse files

Files changed (6) hide show

AsteriskForCausalLM.py +113 -27
README.md +101 -8
chat_template.jinja +1 -1
config.json +2 -1
model.safetensors +2 -2
training_args.bin +1 -1

AsteriskForCausalLM.py CHANGED Viewed

@@ -36,6 +36,7 @@ class AsteriskConfig(LlamaConfig):
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
@@ -48,6 +49,7 @@ class AsteriskConfig(LlamaConfig):
         self.aspp_hidden_dim = aspp_hidden_dim
         self.aspp_num_steps = aspp_num_steps
         self.aspp_dropout = aspp_dropout
         # π-flow config
         self.pi_flow = pi_flow
         self.pi_flow_steps = pi_flow_steps
@@ -57,26 +59,31 @@ class AsteriskConfig(LlamaConfig):
 class ASPPOperator(nn.Module):
     """
-    Asterisk Operator (ASPP) - Point-wise Parallel Propagation
-    Simplified version WITHOUT neighbor gathering to reduce overfitting:
-    - Optional dimensionality reduction for efficiency
-    - Point-wise evolution: h_i^(t+1) = φ(h_i^(t))  [NO neighbors]
-    - Multi-step evolution for depth without added complexity
-    - Dropout for regularization
     Args:
         hidden_size: Dimension of hidden states (input/output)
         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
         num_steps: Number of evolution steps K (default: 2)
         dropout: Dropout rate for regularization (default: 0.1)
     """
-    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1):
         super().__init__()
         self.hidden_size = hidden_size
         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
         self.num_steps = num_steps
         # Projection to lower dimension (if specified)
         self.use_projection = (self.aspp_hidden_dim != hidden_size)
@@ -85,10 +92,22 @@ class ASPPOperator(nn.Module):
             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
             self.proj_dropout = nn.Dropout(dropout)
-        # Point-wise update function φ - NO neighbor gathering
-        # Much smaller: only processes current position
-        self.update_net = nn.Sequential(
-            nn.Linear(self.aspp_hidden_dim, self.aspp_hidden_dim * 2),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
@@ -96,7 +115,6 @@ class ASPPOperator(nn.Module):
         )
         # Learnable K-step parameter
-        # sigmoid(1.0) ≈ 0.73, giving k_steps ≈ 1.5 → 2 steps initially
         self.k_logit = nn.Parameter(torch.tensor(1.0))
         # Learnable residual scale
@@ -112,6 +130,8 @@ class ASPPOperator(nn.Module):
         Returns:
             evolved_states: [batch_size, seq_len, hidden_size]
         """
         # Project to lower dimension if needed
         if self.use_projection:
             h_t = self.down_proj(hidden_states)
@@ -122,12 +142,70 @@ class ASPPOperator(nn.Module):
         # Learnable number of steps
         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
-        # K-step point-wise evolution (NO neighbor gathering)
         for t in range(k_steps):
-            # Apply point-wise update rule φ
-            h_t_next = self.update_net(h_t)
-            # Scaled residual connection for stability
             h_t = h_t + self.residual_scale * h_t_next
             h_t = self.norm(h_t)
@@ -153,7 +231,7 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
     4. Feed-forward network
     """
-    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         # Initialize parent LlamaDecoderLayer
         super().__init__(config, layer_idx)
@@ -162,7 +240,8 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
             hidden_size=config.hidden_size,
             aspp_hidden_dim=aspp_hidden_dim,
             num_steps=aspp_num_steps,
-            dropout=aspp_dropout
         )
         # Gated fusion mechanism with dropout
@@ -182,7 +261,8 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
                 hidden_size=config.hidden_size,
                 aspp_hidden_dim=aspp_hidden_dim,
                 num_steps=aspp_num_steps,
-                dropout=aspp_dropout
             )
             # Learnable flow scale (per-layer)
@@ -276,7 +356,7 @@ class AsteriskLlamaModel(LlamaModel):
     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
     """
-    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         super().__init__(config)
         # Determine which layers to make hybrid (default: ALL layers)
@@ -295,7 +375,8 @@ class AsteriskLlamaModel(LlamaModel):
                     layer_idx=idx,
                     aspp_hidden_dim=aspp_hidden_dim,
                     aspp_num_steps=aspp_num_steps,
-                    aspp_dropout=aspp_dropout
                 )
         # Initialize weights
@@ -311,7 +392,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
     config_class = AsteriskConfig
-    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         # Read all ASPP parameters from config if not explicitly provided
         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
             hybrid_layer_indices = config.hybrid_layer_indices
@@ -321,11 +402,13 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_num_steps = config.aspp_num_steps
         if hasattr(config, 'aspp_dropout'):
             aspp_dropout = config.aspp_dropout
         super().__init__(config)
         # Replace model with Asterisk version
-        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
         # Store hybrid layer info in config for serialization
         self.config.hybrid_layer_indices = hybrid_layer_indices
@@ -341,6 +424,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
@@ -357,6 +441,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_hidden_dim: Internal dimension for ASPP (None = use model hidden_size)
             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
             pi_flow: Enable π-flow refinement step (default: False)
             pi_flow_steps: Number of flow refinement steps (default: 1)
             pi_flow_scale: Initial flow scale parameter (default: 0.2)
@@ -373,6 +458,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_hidden_dim=aspp_hidden_dim,
             aspp_num_steps=aspp_num_steps,
             aspp_dropout=aspp_dropout,
             pi_flow=pi_flow,
             pi_flow_steps=pi_flow_steps,
             pi_flow_scale=pi_flow_scale,
@@ -380,15 +466,15 @@ class AsteriskForCausalLM(LlamaForCausalLM):
         )
         # Create Asterisk model
-        asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
         # Transfer weights from base model (non-hybrid layers and embeddings)
         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
-        print(f"✓ Converted base model to Asterisk architecture")
         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
-        print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}")
         if pi_flow:
             print(f"  π-flow enabled: steps={pi_flow_steps}, scale={pi_flow_scale}, gate={pi_flow_use_gate}")

         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
+        aspp_num_neighbors: int = 8,  # NEW: number of semantic neighbors
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
         self.aspp_hidden_dim = aspp_hidden_dim
         self.aspp_num_steps = aspp_num_steps
         self.aspp_dropout = aspp_dropout
+        self.aspp_num_neighbors = aspp_num_neighbors
         # π-flow config
         self.pi_flow = pi_flow
         self.pi_flow_steps = pi_flow_steps
 class ASPPOperator(nn.Module):
     """
+    Asterisk Operator (ASPP) - Graph Propagation with Adjacency List (2D Grid)
+    Converts 1D sequence to 2D grid and performs 8-neighbor graph propagation:
+    - Adjacency list: 8 neighbors per position (棋盘8邻居)
+        ↖ ↑ ↗
+        ← ● →
+        ↙ ↓ ↘
+    - Learnable adjacency weights (randomly initialized)
+    - Padding for dynamic sequence lengths
+    - Message passing: h_i^(t+1) = φ(h_i^(t), Σ_j∈N(i) A_j * h_j^(t))
     Args:
         hidden_size: Dimension of hidden states (input/output)
         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
         num_steps: Number of evolution steps K (default: 2)
         dropout: Dropout rate for regularization (default: 0.1)
+        num_neighbors: Number of neighbors (default: 8, for 8-directional grid)
     """
+    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1, num_neighbors: int = 8):
         super().__init__()
         self.hidden_size = hidden_size
         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
         self.num_steps = num_steps
+        self.num_neighbors = num_neighbors
         # Projection to lower dimension (if specified)
         self.use_projection = (self.aspp_hidden_dim != hidden_size)
             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
             self.proj_dropout = nn.Dropout(dropout)
+        # 8-directional offsets for 2D grid neighbors (row, col)
+        # ↖(-1,-1) ↑(-1,0) ↗(-1,1)
+        # ←(0,-1)  ●(0,0)  →(0,1)
+        # ↙(1,-1)  ↓(1,0)  ↘(1,1)
+        self.register_buffer('neighbor_offsets', torch.tensor([
+            [-1, -1], [-1, 0], [-1, 1],  # top-left, top, top-right
+            [0, -1],           [0, 1],   # left, right
+            [1, -1],  [1, 0],  [1, 1]    # bottom-left, bottom, bottom-right
+        ], dtype=torch.long))  # [8, 2]
+        # Learnable adjacency weights for 8 directions (randomly initialized)
+        self.adjacency_weights = nn.Parameter(torch.randn(num_neighbors) * 0.1)
+        # Message aggregation function: combines self + neighbors
+        self.message_net = nn.Sequential(
+            nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim * 2),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
         )
         # Learnable K-step parameter
         self.k_logit = nn.Parameter(torch.tensor(1.0))
         # Learnable residual scale
         Returns:
             evolved_states: [batch_size, seq_len, hidden_size]
         """
+        batch_size, seq_len, _ = hidden_states.shape
         # Project to lower dimension if needed
         if self.use_projection:
             h_t = self.down_proj(hidden_states)
         # Learnable number of steps
         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
+        # K-step graph propagation with 2D grid adjacency list
         for t in range(k_steps):
+            # 1. Reshape 1D sequence to 2D grid
+            # Dynamic grid size: H ≈ W ≈ sqrt(seq_len)
+            H = int(torch.ceil(torch.sqrt(torch.tensor(seq_len, dtype=torch.float32))).item())
+            W = int(torch.ceil(torch.tensor(seq_len, dtype=torch.float32) / H).item())
+            grid_size = H * W
+            # Pad sequence to grid_size
+            if seq_len < grid_size:
+                padding = grid_size - seq_len
+                h_t_padded = F.pad(h_t, (0, 0, 0, padding), mode='constant', value=0)  # [B, H*W, D]
+            else:
+                h_t_padded = h_t
+            # Reshape to 2D grid: [B, H*W, D] -> [B, H, W, D]
+            h_grid = h_t_padded.view(batch_size, H, W, self.aspp_hidden_dim)
+            # 2. Add boundary padding for neighbor gathering (pad with zeros)
+            # Pad 1 row/col on each side: [B, H, W, D] -> [B, H+2, W+2, D]
+            h_grid_padded = F.pad(h_grid, (0, 0, 1, 1, 1, 1), mode='constant', value=0)
+            # 3. Gather neighbors using adjacency list (offsets)
+            # neighbor_offsets: [8, 2] with (row_offset, col_offset)
+            # Only use first num_neighbors offsets
+            neighbors_list = []
+            for offset in self.neighbor_offsets[:self.num_neighbors]:
+                # Offset is relative to center, but we need absolute indices in padded grid
+                # Center at (i+1, j+1) in padded grid, neighbor at (i+1+di, j+1+dj)
+                # Use roll to shift the grid
+                di, dj = offset[0].item(), offset[1].item()
+                # Create index tensors for gathering
+                # For each position (i,j) in original grid, get neighbor at (i+di, j+dj) in padded grid
+                row_indices = torch.arange(H, device=h_t.device).view(-1, 1).expand(H, W) + 1 + di
+                col_indices = torch.arange(W, device=h_t.device).view(1, -1).expand(H, W) + 1 + dj
+                # Gather neighbor features: [B, H, W, D]
+                neighbor = h_grid_padded[:, row_indices, col_indices, :]  # [B, H, W, D]
+                neighbors_list.append(neighbor)
+            # Stack neighbors: [B, H, W, num_neighbors, D]
+            neighbors = torch.stack(neighbors_list, dim=3)  # [B, H, W, num_neighbors, D]
+            # 4. Apply learnable adjacency weights
+            # adjacency_weights: [num_neighbors] -> normalize with softmax
+            adj_weights = F.softmax(self.adjacency_weights, dim=0)  # [num_neighbors]
+            # Weighted aggregation: [B, H, W, D]
+            aggregated_neighbors = torch.sum(neighbors * adj_weights.view(1, 1, 1, self.num_neighbors, 1), dim=3)
+            # 5. Message passing: combine self + neighbors
+            # Flatten back to sequence: [B, H, W, D] -> [B, H*W, D]
+            h_grid_flat = h_grid.view(batch_size, grid_size, self.aspp_hidden_dim)
+            aggregated_flat = aggregated_neighbors.view(batch_size, grid_size, self.aspp_hidden_dim)
+            # Concat and pass through message net
+            message_input = torch.cat([h_grid_flat, aggregated_flat], dim=-1)  # [B, H*W, 2D]
+            h_t_next = self.message_net(message_input)  # [B, H*W, D]
+            # Remove padding to restore original seq_len
+            h_t_next = h_t_next[:, :seq_len, :]  # [B, L, D]
+            # 6. Scaled residual connection for stability
             h_t = h_t + self.residual_scale * h_t_next
             h_t = self.norm(h_t)
     4. Feed-forward network
     """
+    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 8):
         # Initialize parent LlamaDecoderLayer
         super().__init__(config, layer_idx)
             hidden_size=config.hidden_size,
             aspp_hidden_dim=aspp_hidden_dim,
             num_steps=aspp_num_steps,
+            dropout=aspp_dropout,
+            num_neighbors=aspp_num_neighbors
         )
         # Gated fusion mechanism with dropout
                 hidden_size=config.hidden_size,
                 aspp_hidden_dim=aspp_hidden_dim,
                 num_steps=aspp_num_steps,
+                dropout=aspp_dropout,
+                num_neighbors=aspp_num_neighbors
             )
             # Learnable flow scale (per-layer)
     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
     """
+    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 8):
         super().__init__(config)
         # Determine which layers to make hybrid (default: ALL layers)
                     layer_idx=idx,
                     aspp_hidden_dim=aspp_hidden_dim,
                     aspp_num_steps=aspp_num_steps,
+                    aspp_dropout=aspp_dropout,
+                    aspp_num_neighbors=aspp_num_neighbors
                 )
         # Initialize weights
     config_class = AsteriskConfig
+    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 8):
         # Read all ASPP parameters from config if not explicitly provided
         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
             hybrid_layer_indices = config.hybrid_layer_indices
             aspp_num_steps = config.aspp_num_steps
         if hasattr(config, 'aspp_dropout'):
             aspp_dropout = config.aspp_dropout
+        if hasattr(config, 'aspp_num_neighbors'):
+            aspp_num_neighbors = config.aspp_num_neighbors
         super().__init__(config)
         # Replace model with Asterisk version
+        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)
         # Store hybrid layer info in config for serialization
         self.config.hybrid_layer_indices = hybrid_layer_indices
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
+        aspp_num_neighbors: int = 8,  # NEW: number of semantic neighbors
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
             aspp_hidden_dim: Internal dimension for ASPP (None = use model hidden_size)
             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
+            aspp_num_neighbors: Number of semantic neighbors for graph propagation (default: 8)
             pi_flow: Enable π-flow refinement step (default: False)
             pi_flow_steps: Number of flow refinement steps (default: 1)
             pi_flow_scale: Initial flow scale parameter (default: 0.2)
             aspp_hidden_dim=aspp_hidden_dim,
             aspp_num_steps=aspp_num_steps,
             aspp_dropout=aspp_dropout,
+            aspp_num_neighbors=aspp_num_neighbors,
             pi_flow=pi_flow,
             pi_flow_steps=pi_flow_steps,
             pi_flow_scale=pi_flow_scale,
         )
         # Create Asterisk model
+        asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)
         # Transfer weights from base model (non-hybrid layers and embeddings)
         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
+        print(f"✓ Converted base model to Asterisk architecture with Graph Propagation")
         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
+        print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}, neighbors={aspp_num_neighbors}")
         if pi_flow:
             print(f"  π-flow enabled: steps={pi_flow_steps}, scale={pi_flow_scale}, gate={pi_flow_use_gate}")

README.md CHANGED Viewed

@@ -100,12 +100,41 @@ class HybridASPPAttentionLayer:
     Combines ASPP operator with standard attention
     Components:
-    - ASPP operator: Local structured reasoning
     - Standard attention: Global context
     - Gated fusion: Dynamic balancing
     """
 ```
 **Fusion mechanism:**
 ```
 aspp_out = ASPP(hidden_states)
@@ -240,9 +269,9 @@ Total: ~10,148 training samples
 ### Training Configuration
 - **Starting Point**: Asterisk checkpoint (base ASPP-Attention model)
-- **Optimizer**: AdamW (lr=5e-4, weight_decay=0.1)
-- **Batch Size**: 2 per device, gradient accumulation=4 (effective batch=8)
-- **Epochs**: 2
 - **Scheduler**: Linear warmup (10% of steps)
 - **Mixed Precision**: bfloat16
 - **Gradient Checkpointing**: Enabled
@@ -263,9 +292,16 @@ pi_flow_use_gate = True     # Token-wise adaptive gating
 aspp_hidden_dim = 256       # Internal dimension (vs 576 model hidden_size)
 aspp_num_steps = 4          # Evolution steps for ASPP
 aspp_dropout = 0.2          # Regularization
 hybrid_layer_indices = None # All 30 layers
 ```
 ## Model Creation from Base Asterisk
 ```python
@@ -283,6 +319,10 @@ config.pi_flow_steps = 2
 config.pi_flow_scale = 1.0
 config.pi_flow_use_gate = True
 # Create model with π-flow
 model = AsteriskForCausalLM(config)
@@ -334,6 +374,40 @@ This creates a **hierarchical refinement cascade** enabling gradual convergence
 ## Implementation Details
 ### Return Type Handling
 Critical for Transformers compatibility:
@@ -386,22 +460,41 @@ Asterisk-Pi/
 ## Known Issues & Solutions
-### 1. Return Type Errors
 **Issue**: `AttributeError: 'tuple' object has no attribute 'dtype'`
 **Solution**: `HybridASPPAttentionLayer.forward()` must return `torch.Tensor` only, not tuple. This matches the `LlamaDecoderLayer` API in transformers 4.57.6.
-### 2. π-Flow in All Layers vs Final Layer
 **Initial approach**: π-flow only in final layer (limited expressiveness)
 **Current approach**: π-flow in all 30 hybrid layers for maximum refinement capability.
-### 3. Training Stability
 π-Flow can cause instability with high learning rates. Use:
-- Lower learning rate (5e-4 vs 2e-5 for base)
 - Gradient clipping (max_norm=1.0)
 - Conservative initial flow scale (0.2-1.0)

     Combines ASPP operator with standard attention
     Components:
+    - ASPP operator: Local structured reasoning with graph propagation
     - Standard attention: Global context
     - Gated fusion: Dynamic balancing
     """
 ```
+**ASPP Operator - Graph Propagation:**
+The ASPP operator converts the 1D sequence into a 2D grid and performs graph-based message passing:
+```
+Sequence [1, 2, 3, 4, ...] → 2D Grid:
+┌───┬───┬───┐
+│ 1 │ 2 │ 3 │
+├───┼───┼───┤
+│ 4 │ 5 │ 6 │
+└───┴───┴───┘
+8-directional neighbors (default):
+  ↖ ↑ ↗
+  ← ● →
+  ↙ ↓ ↘
+With aspp_num_neighbors = 4 (first four offsets in buffer order):
+  ↖ ↑ ↗
+  ← ●
+```
+**Key features:**
+- **Configurable neighbors**: `aspp_num_neighbors` (default: 8)
+- **Learnable adjacency weights**: Each direction has a learnable weight
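+- **K-step evolution**: Iterative message passing for up to `aspp_num_steps` steps (code default: 2; this checkpoint's config uses 4). The effective count is `max(1, int(sigmoid(k_logit) * aspp_num_steps))`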
+- **Dynamic grid**: Grid dimensions adapt to sequence length (H ≈ W ≈ √seq_len)
 **Fusion mechanism:**
 ```
 aspp_out = ASPP(hidden_states)
 ### Training Configuration
 - **Starting Point**: Asterisk checkpoint (base ASPP-Attention model)
+- **Optimizer**: AdamW (lr=1e-4, weight_decay=0.1)
+- **Batch Size**: 4 per device, gradient accumulation=4 (effective batch=16)
+- **Epochs**: 2.5
 - **Scheduler**: Linear warmup (10% of steps)
 - **Mixed Precision**: bfloat16
 - **Gradient Checkpointing**: Enabled
 aspp_hidden_dim = 256       # Internal dimension (vs 576 model hidden_size)
 aspp_num_steps = 4          # Evolution steps for ASPP
 aspp_dropout = 0.2          # Regularization
+aspp_num_neighbors = 8      # Number of 2D-grid neighbors used for graph propagation (default: 8)
 hybrid_layer_indices = None # All 30 layers
 ```
+**Graph Propagation Neighbors:**
+- Default: 8-directional grid (↖↑↗←→↙↓↘)
+- Configurable: Can use fewer neighbors (1–8); note that 4 selects the first four buffer offsets (↖ ↑ ↗ ←), not the four cardinal directions
+- The neighbor offsets are defined in a buffer, and only the first `num_neighbors` are used
+- Learnable adjacency weights adapt importance of each direction during training
 ## Model Creation from Base Asterisk
 ```python
 config.pi_flow_scale = 1.0
 config.pi_flow_use_gate = True
+# Optional: Configure ASPP graph propagation neighbors
+# config.aspp_num_neighbors = 8  # Default: 8 (full 8-directional grid)
+# config.aspp_num_neighbors = 4  # Alternative: first 4 buffer offsets only (↖ ↑ ↗ ←)
 # Create model with π-flow
 model = AsteriskForCausalLM(config)
 ## Implementation Details
+### ASPP Graph Propagation Configuration
+The ASPP operator supports configurable neighbor connectivity:
+```python
+class ASPPOperator(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        aspp_hidden_dim: Optional[int] = None,
+        num_steps: int = 2,
+        dropout: float = 0.1,
+        num_neighbors: int = 8  # Configurable: 1 to 8 (a prefix of the offsets buffer)
+    ):
+        # Neighbor offsets buffer (8 directions hardcoded)
+        self.register_buffer('neighbor_offsets', torch.tensor([
+            [-1, -1], [-1, 0], [-1, 1],  # ↖ ↑ ↗
+            [0, -1],           [0, 1],   # ← →
+            [1, -1],  [1, 0],  [1, 1]    # ↙ ↓ ↘
+        ]))
+        # Only first num_neighbors are used
+        self.num_neighbors = num_neighbors
+        # Learnable adjacency weights
+        self.adjacency_weights = nn.Parameter(torch.randn(num_neighbors) * 0.1)
+```
+**Usage:**
+- `num_neighbors=8`: Full 8-directional connectivity (default)
+- `num_neighbors=4`: First four offsets in buffer order (↖ ↑ ↗ ←); cardinal-only connectivity (↑←→↓) would require reordering the offsets buffer
+- The buffer always contains 8 offsets, but only the first `num_neighbors` are used
+- Adjacency weights are softmax-normalized for stability
 ### Return Type Handling
 Critical for Transformers compatibility:
 ## Known Issues & Solutions
+### 1. Neighbor Count Configuration (Fixed in Latest Version)
+**Previous issue**: ASPP operator hardcoded 8 neighbors in reshape operations, causing errors when using different neighbor counts.
+**Solution**: Updated implementation to use `self.num_neighbors` dynamically:
+```python
+# Fixed: Dynamic neighbor count
+for offset in self.neighbor_offsets[:self.num_neighbors]:  # Only use first N
+    # ...gather neighbors...
+# Fixed: Dynamic reshape
+aggregated_neighbors = torch.sum(
+    neighbors * adj_weights.view(1, 1, 1, self.num_neighbors, 1),
+    dim=3
+)
+```
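+Now supports any neighbor count from 1 to 8 without modification. Values above 8 are not supported: the buffer holds only eight offsets, so the adjacency weights can no longer be broadcast against the stacked neighbors.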
+### 2. Return Type Errors
 **Issue**: `AttributeError: 'tuple' object has no attribute 'dtype'`
 **Solution**: `HybridASPPAttentionLayer.forward()` must return `torch.Tensor` only, not tuple. This matches the `LlamaDecoderLayer` API in transformers 4.57.6.
+### 3. π-Flow in All Layers vs Final Layer
 **Initial approach**: π-flow only in final layer (limited expressiveness)
 **Current approach**: π-flow in all 30 hybrid layers for maximum refinement capability.
+### 4. Training Stability
 π-Flow can cause instability with high learning rates. Use:
+- Lower learning rate (1e-4 recommended for stability)
 - Gradient clipping (max_norm=1.0)
 - Conservative initial flow scale (0.2-1.0)

chat_template.jinja CHANGED Viewed

@@ -1,5 +1,5 @@
 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
-You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
 ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
 ' + message['content'] + '<|im_end|>' + '
 '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant

 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
+You are a helpful AI assistant named Asterisk, trained by NoesisLab<|im_end|>
 ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
 ' + message['content'] + '<|im_end|>' + '
 '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant

config.json CHANGED Viewed

@@ -2,8 +2,9 @@
   "architectures": [
     "AsteriskForCausalLM"
   ],
-  "aspp_dropout": 0.2,
   "aspp_hidden_dim": 256,
   "aspp_num_steps": 4,
   "attention_bias": false,
   "attention_dropout": 0.0,

   "architectures": [
     "AsteriskForCausalLM"
   ],
+  "aspp_dropout": 0.1,
   "aspp_hidden_dim": 256,
+  "aspp_num_neighbors": 8,
   "aspp_num_steps": 4,
   "attention_bias": false,
   "attention_dropout": 0.0,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd3411332c19c27ac340b99a92d91e0b93f224b62fa3e0cccf7777b4e126b802
-size 381107624

 version https://git-lfs.github.com/spec/v1
+oid sha256:41550e1413295a2b5e02127758543c905650e9c834c6b53e382238f4021f3668
+size 396858360

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:357a1e8bcbd247f80b9437f6d4dd9e81a29edbafaa6fea075a7380b6927773f4
 size 6353

 version https://git-lfs.github.com/spec/v1
+oid sha256:79668c78f13b1c865f88ffab2a80bc10c893e3262c29261ee8d447ec47b717d3
 size 6353