Upload 15 files

Browse files

Files changed (8) hide show

.gitattributes +1 -0
ARCHASTERISK.png +3 -0
AsteriskForCausalLM.py +27 -113
README.md +8 -101
chat_template.jinja +1 -1
config.json +1 -2
model.safetensors +2 -2
training_args.bin +1 -1

.gitattributes CHANGED Viewed

@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Gemini_Generated_Image_jvekprjvekprjvek.png filter=lfs diff=lfs merge=lfs -text
 Arch.png filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Gemini_Generated_Image_jvekprjvekprjvek.png filter=lfs diff=lfs merge=lfs -text
 Arch.png filter=lfs diff=lfs merge=lfs -text
+ARCHASTERISK.png filter=lfs diff=lfs merge=lfs -text

ARCHASTERISK.png ADDED Viewed

Git LFS Details

SHA256: 4b5e7b1e822d358c4fc6de0882b23db7cec6d501af638d4550aae34f961168ef
Pointer size: 132 Bytes
Size of remote file: 5.71 MB

AsteriskForCausalLM.py CHANGED Viewed

@@ -36,7 +36,6 @@ class AsteriskConfig(LlamaConfig):
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
-        aspp_num_neighbors: int = 8,  # NEW: number of semantic neighbors
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
@@ -49,7 +48,6 @@ class AsteriskConfig(LlamaConfig):
         self.aspp_hidden_dim = aspp_hidden_dim
         self.aspp_num_steps = aspp_num_steps
         self.aspp_dropout = aspp_dropout
-        self.aspp_num_neighbors = aspp_num_neighbors
         # π-flow config
         self.pi_flow = pi_flow
         self.pi_flow_steps = pi_flow_steps
@@ -59,31 +57,26 @@ class AsteriskConfig(LlamaConfig):
 class ASPPOperator(nn.Module):
     """
-    Asterisk Operator (ASPP) - Graph Propagation with Adjacency List (2D Grid)
-    Converts 1D sequence to 2D grid and performs 8-neighbor graph propagation:
-    - Adjacency list: 8 neighbors per position (棋盘8邻居)
-        ↖ ↑ ↗
-        ← ● →
-        ↙ ↓ ↘
-    - Learnable adjacency weights (randomly initialized)
-    - Padding for dynamic sequence lengths
-    - Message passing: h_i^(t+1) = φ(h_i^(t), Σ_j∈N(i) A_j * h_j^(t))
     Args:
         hidden_size: Dimension of hidden states (input/output)
         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
         num_steps: Number of evolution steps K (default: 2)
         dropout: Dropout rate for regularization (default: 0.1)
-        num_neighbors: Number of neighbors (default: 8, for 8-directional grid)
     """
-    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1, num_neighbors: int = 8):
         super().__init__()
         self.hidden_size = hidden_size
         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
         self.num_steps = num_steps
-        self.num_neighbors = num_neighbors
         # Projection to lower dimension (if specified)
         self.use_projection = (self.aspp_hidden_dim != hidden_size)
@@ -92,22 +85,10 @@ class ASPPOperator(nn.Module):
             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
             self.proj_dropout = nn.Dropout(dropout)
-        # 8-directional offsets for 2D grid neighbors (row, col)
-        # ↖(-1,-1) ↑(-1,0) ↗(-1,1)
-        # ←(0,-1)  ●(0,0)  →(0,1)
-        # ↙(1,-1)  ↓(1,0)  ↘(1,1)
-        self.register_buffer('neighbor_offsets', torch.tensor([
-            [-1, -1], [-1, 0], [-1, 1],  # top-left, top, top-right
-            [0, -1],           [0, 1],   # left, right
-            [1, -1],  [1, 0],  [1, 1]    # bottom-left, bottom, bottom-right
-        ], dtype=torch.long))  # [8, 2]
-        # Learnable adjacency weights for 8 directions (randomly initialized)
-        self.adjacency_weights = nn.Parameter(torch.randn(num_neighbors) * 0.1)
-        # Message aggregation function: combines self + neighbors
-        self.message_net = nn.Sequential(
-            nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim * 2),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
@@ -115,6 +96,7 @@ class ASPPOperator(nn.Module):
         )
         # Learnable K-step parameter
         self.k_logit = nn.Parameter(torch.tensor(1.0))
         # Learnable residual scale
@@ -130,8 +112,6 @@ class ASPPOperator(nn.Module):
         Returns:
             evolved_states: [batch_size, seq_len, hidden_size]
         """
-        batch_size, seq_len, _ = hidden_states.shape
         # Project to lower dimension if needed
         if self.use_projection:
             h_t = self.down_proj(hidden_states)
@@ -142,70 +122,12 @@ class ASPPOperator(nn.Module):
         # Learnable number of steps
         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
-        # K-step graph propagation with 2D grid adjacency list
         for t in range(k_steps):
-            # 1. Reshape 1D sequence to 2D grid
-            # Dynamic grid size: H ≈ W ≈ sqrt(seq_len)
-            H = int(torch.ceil(torch.sqrt(torch.tensor(seq_len, dtype=torch.float32))).item())
-            W = int(torch.ceil(torch.tensor(seq_len, dtype=torch.float32) / H).item())
-            grid_size = H * W
-            # Pad sequence to grid_size
-            if seq_len < grid_size:
-                padding = grid_size - seq_len
-                h_t_padded = F.pad(h_t, (0, 0, 0, padding), mode='constant', value=0)  # [B, H*W, D]
-            else:
-                h_t_padded = h_t
-            # Reshape to 2D grid: [B, H*W, D] -> [B, H, W, D]
-            h_grid = h_t_padded.view(batch_size, H, W, self.aspp_hidden_dim)
-            # 2. Add boundary padding for neighbor gathering (pad with zeros)
-            # Pad 1 row/col on each side: [B, H, W, D] -> [B, H+2, W+2, D]
-            h_grid_padded = F.pad(h_grid, (0, 0, 1, 1, 1, 1), mode='constant', value=0)
-            # 3. Gather neighbors using adjacency list (offsets)
-            # neighbor_offsets: [8, 2] with (row_offset, col_offset)
-            # Only use first num_neighbors offsets
-            neighbors_list = []
-            for offset in self.neighbor_offsets[:self.num_neighbors]:
-                # Offset is relative to center, but we need absolute indices in padded grid
-                # Center at (i+1, j+1) in padded grid, neighbor at (i+1+di, j+1+dj)
-                # Use roll to shift the grid
-                di, dj = offset[0].item(), offset[1].item()
-                # Create index tensors for gathering
-                # For each position (i,j) in original grid, get neighbor at (i+di, j+dj) in padded grid
-                row_indices = torch.arange(H, device=h_t.device).view(-1, 1).expand(H, W) + 1 + di
-                col_indices = torch.arange(W, device=h_t.device).view(1, -1).expand(H, W) + 1 + dj
-                # Gather neighbor features: [B, H, W, D]
-                neighbor = h_grid_padded[:, row_indices, col_indices, :]  # [B, H, W, D]
-                neighbors_list.append(neighbor)
-            # Stack neighbors: [B, H, W, num_neighbors, D]
-            neighbors = torch.stack(neighbors_list, dim=3)  # [B, H, W, num_neighbors, D]
-            # 4. Apply learnable adjacency weights
-            # adjacency_weights: [num_neighbors] -> normalize with softmax
-            adj_weights = F.softmax(self.adjacency_weights, dim=0)  # [num_neighbors]
-            # Weighted aggregation: [B, H, W, D]
-            aggregated_neighbors = torch.sum(neighbors * adj_weights.view(1, 1, 1, self.num_neighbors, 1), dim=3)
-            # 5. Message passing: combine self + neighbors
-            # Flatten back to sequence: [B, H, W, D] -> [B, H*W, D]
-            h_grid_flat = h_grid.view(batch_size, grid_size, self.aspp_hidden_dim)
-            aggregated_flat = aggregated_neighbors.view(batch_size, grid_size, self.aspp_hidden_dim)
-            # Concat and pass through message net
-            message_input = torch.cat([h_grid_flat, aggregated_flat], dim=-1)  # [B, H*W, 2D]
-            h_t_next = self.message_net(message_input)  # [B, H*W, D]
-            # Remove padding to restore original seq_len
-            h_t_next = h_t_next[:, :seq_len, :]  # [B, L, D]
-            # 6. Scaled residual connection for stability
             h_t = h_t + self.residual_scale * h_t_next
             h_t = self.norm(h_t)
@@ -231,7 +153,7 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
     4. Feed-forward network
     """
-    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 8):
         # Initialize parent LlamaDecoderLayer
         super().__init__(config, layer_idx)
@@ -240,8 +162,7 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
             hidden_size=config.hidden_size,
             aspp_hidden_dim=aspp_hidden_dim,
             num_steps=aspp_num_steps,
-            dropout=aspp_dropout,
-            num_neighbors=aspp_num_neighbors
         )
         # Gated fusion mechanism with dropout
@@ -261,8 +182,7 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
                 hidden_size=config.hidden_size,
                 aspp_hidden_dim=aspp_hidden_dim,
                 num_steps=aspp_num_steps,
-                dropout=aspp_dropout,
-                num_neighbors=aspp_num_neighbors
             )
             # Learnable flow scale (per-layer)
@@ -356,7 +276,7 @@ class AsteriskLlamaModel(LlamaModel):
     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
     """
-    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 8):
         super().__init__(config)
         # Determine which layers to make hybrid (default: ALL layers)
@@ -375,8 +295,7 @@ class AsteriskLlamaModel(LlamaModel):
                     layer_idx=idx,
                     aspp_hidden_dim=aspp_hidden_dim,
                     aspp_num_steps=aspp_num_steps,
-                    aspp_dropout=aspp_dropout,
-                    aspp_num_neighbors=aspp_num_neighbors
                 )
         # Initialize weights
@@ -392,7 +311,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
     config_class = AsteriskConfig
-    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 8):
         # Read all ASPP parameters from config if not explicitly provided
         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
             hybrid_layer_indices = config.hybrid_layer_indices
@@ -402,13 +321,11 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_num_steps = config.aspp_num_steps
         if hasattr(config, 'aspp_dropout'):
             aspp_dropout = config.aspp_dropout
-        if hasattr(config, 'aspp_num_neighbors'):
-            aspp_num_neighbors = config.aspp_num_neighbors
         super().__init__(config)
         # Replace model with Asterisk version
-        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)
         # Store hybrid layer info in config for serialization
         self.config.hybrid_layer_indices = hybrid_layer_indices
@@ -424,7 +341,6 @@ class AsteriskForCausalLM(LlamaForCausalLM):
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
-        aspp_num_neighbors: int = 8,  # NEW: number of semantic neighbors
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
@@ -441,7 +357,6 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_hidden_dim: Internal dimension for ASPP (None = use model hidden_size)
             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
-            aspp_num_neighbors: Number of semantic neighbors for graph propagation (default: 8)
             pi_flow: Enable π-flow refinement step (default: False)
             pi_flow_steps: Number of flow refinement steps (default: 1)
             pi_flow_scale: Initial flow scale parameter (default: 0.2)
@@ -458,7 +373,6 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_hidden_dim=aspp_hidden_dim,
             aspp_num_steps=aspp_num_steps,
             aspp_dropout=aspp_dropout,
-            aspp_num_neighbors=aspp_num_neighbors,
             pi_flow=pi_flow,
             pi_flow_steps=pi_flow_steps,
             pi_flow_scale=pi_flow_scale,
@@ -466,15 +380,15 @@ class AsteriskForCausalLM(LlamaForCausalLM):
         )
         # Create Asterisk model
-        asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)
         # Transfer weights from base model (non-hybrid layers and embeddings)
         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
-        print(f"✓ Converted base model to Asterisk architecture with Graph Propagation")
         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
-        print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}, neighbors={aspp_num_neighbors}")
         if pi_flow:
             print(f"  π-flow enabled: steps={pi_flow_steps}, scale={pi_flow_scale}, gate={pi_flow_use_gate}")

         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
         self.aspp_hidden_dim = aspp_hidden_dim
         self.aspp_num_steps = aspp_num_steps
         self.aspp_dropout = aspp_dropout
         # π-flow config
         self.pi_flow = pi_flow
         self.pi_flow_steps = pi_flow_steps
 class ASPPOperator(nn.Module):
     """
+    Asterisk Operator (ASPP) - Point-wise Parallel Propagation
+    Simplified version WITHOUT neighbor gathering to reduce overfitting:
+    - Optional dimensionality reduction for efficiency
+    - Point-wise evolution: h_i^(t+1) = φ(h_i^(t))  [NO neighbors]
+    - Multi-step evolution for depth without added complexity
+    - Dropout for regularization
     Args:
         hidden_size: Dimension of hidden states (input/output)
         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
         num_steps: Number of evolution steps K (default: 2)
         dropout: Dropout rate for regularization (default: 0.1)
     """
+    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1):
         super().__init__()
         self.hidden_size = hidden_size
         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
         self.num_steps = num_steps
         # Projection to lower dimension (if specified)
         self.use_projection = (self.aspp_hidden_dim != hidden_size)
             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
             self.proj_dropout = nn.Dropout(dropout)
+        # Point-wise update function φ - NO neighbor gathering
+        # Much smaller: only processes current position
+        self.update_net = nn.Sequential(
+            nn.Linear(self.aspp_hidden_dim, self.aspp_hidden_dim * 2),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
         )
         # Learnable K-step parameter
+        # sigmoid(1.0) ≈ 0.73; int() truncates, so num_steps=2 gives int(1.46) = 1 step initially (num_steps=4 gives int(2.92) = 2 steps)
         self.k_logit = nn.Parameter(torch.tensor(1.0))
         # Learnable residual scale
         Returns:
             evolved_states: [batch_size, seq_len, hidden_size]
         """
         # Project to lower dimension if needed
         if self.use_projection:
             h_t = self.down_proj(hidden_states)
         # Learnable number of steps
         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
+        # K-step point-wise evolution (NO neighbor gathering)
         for t in range(k_steps):
+            # Apply point-wise update rule φ
+            h_t_next = self.update_net(h_t)
+            # Scaled residual connection for stability
             h_t = h_t + self.residual_scale * h_t_next
             h_t = self.norm(h_t)
     4. Feed-forward network
     """
+    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         # Initialize parent LlamaDecoderLayer
         super().__init__(config, layer_idx)
             hidden_size=config.hidden_size,
             aspp_hidden_dim=aspp_hidden_dim,
             num_steps=aspp_num_steps,
+            dropout=aspp_dropout
         )
         # Gated fusion mechanism with dropout
                 hidden_size=config.hidden_size,
                 aspp_hidden_dim=aspp_hidden_dim,
                 num_steps=aspp_num_steps,
+                dropout=aspp_dropout
             )
             # Learnable flow scale (per-layer)
     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
     """
+    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         super().__init__(config)
         # Determine which layers to make hybrid (default: ALL layers)
                     layer_idx=idx,
                     aspp_hidden_dim=aspp_hidden_dim,
                     aspp_num_steps=aspp_num_steps,
+                    aspp_dropout=aspp_dropout
                 )
         # Initialize weights
     config_class = AsteriskConfig
+    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         # Read all ASPP parameters from config if not explicitly provided
         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
             hybrid_layer_indices = config.hybrid_layer_indices
             aspp_num_steps = config.aspp_num_steps
         if hasattr(config, 'aspp_dropout'):
             aspp_dropout = config.aspp_dropout
         super().__init__(config)
         # Replace model with Asterisk version
+        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
         # Store hybrid layer info in config for serialization
         self.config.hybrid_layer_indices = hybrid_layer_indices
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
             aspp_hidden_dim: Internal dimension for ASPP (None = use model hidden_size)
             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
             pi_flow: Enable π-flow refinement step (default: False)
             pi_flow_steps: Number of flow refinement steps (default: 1)
             pi_flow_scale: Initial flow scale parameter (default: 0.2)
             aspp_hidden_dim=aspp_hidden_dim,
             aspp_num_steps=aspp_num_steps,
             aspp_dropout=aspp_dropout,
             pi_flow=pi_flow,
             pi_flow_steps=pi_flow_steps,
             pi_flow_scale=pi_flow_scale,
         )
         # Create Asterisk model
+        asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
         # Transfer weights from base model (non-hybrid layers and embeddings)
         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
+        print(f"✓ Converted base model to Asterisk architecture")
         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
+        print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}")
         if pi_flow:
             print(f"  π-flow enabled: steps={pi_flow_steps}, scale={pi_flow_scale}, gate={pi_flow_use_gate}")

README.md CHANGED Viewed

@@ -100,41 +100,12 @@ class HybridASPPAttentionLayer:
     Combines ASPP operator with standard attention
     Components:
-    - ASPP operator: Local structured reasoning with graph propagation
     - Standard attention: Global context
     - Gated fusion: Dynamic balancing
     """
 ```
-**ASPP Operator - Graph Propagation:**
-The ASPP operator converts the 1D sequence into a 2D grid and performs graph-based message passing:
-```
-Sequence [1, 2, 3, 4, ...] → 2D Grid:
-┌───┬───┬───┐
-│ 1 │ 2 │ 3 │
-├───┼───┼───┤
-│ 4 │ 5 │ 6 │
-└───┴───┴───┘
-8-directional neighbors (default):
-  ↖ ↑ ↗
-  ← ● →
-  ↙ ↓ ↘
-4-directional neighbors (optional):
-    ↑
-  ← ● →
-    ↓
-```
-**Key features:**
-- **Configurable neighbors**: `aspp_num_neighbors` (default: 8)
-- **Learnable adjacency weights**: Each direction has a learnable weight
-- **K-step evolution**: Iterative message passing for `aspp_num_steps` (default: 4)
-- **Dynamic grid**: Grid dimensions adapt to sequence length (H ≈ W ≈ √seq_len)
 **Fusion mechanism:**
 ```
 aspp_out = ASPP(hidden_states)
@@ -269,9 +240,9 @@ Total: ~10,148 training samples
 ### Training Configuration
 - **Starting Point**: Asterisk checkpoint (base ASPP-Attention model)
-- **Optimizer**: AdamW (lr=1e-4, weight_decay=0.1)
-- **Batch Size**: 4 per device, gradient accumulation=4 (effective batch=16)
-- **Epochs**: 2.5
 - **Scheduler**: Linear warmup (10% of steps)
 - **Mixed Precision**: bfloat16
 - **Gradient Checkpointing**: Enabled
@@ -292,16 +263,9 @@ pi_flow_use_gate = True     # Token-wise adaptive gating
 aspp_hidden_dim = 256       # Internal dimension (vs 576 model hidden_size)
 aspp_num_steps = 4          # Evolution steps for ASPP
 aspp_dropout = 0.2          # Regularization
-aspp_num_neighbors = 8      # Number of semantic neighbors for graph propagation (default: 8)
 hybrid_layer_indices = None # All 30 layers
 ```
-**Graph Propagation Neighbors:**
-- Default: 8-directional grid (↖↑↗←→↙↓↘)
-- Configurable: Can use fewer neighbors (e.g., 4-directional: ↑←→↓)
-- The neighbor offsets are defined in a buffer, and only the first `num_neighbors` are used
-- Learnable adjacency weights adapt importance of each direction during training
 ## Model Creation from Base Asterisk
 ```python
@@ -319,10 +283,6 @@ config.pi_flow_steps = 2
 config.pi_flow_scale = 1.0
 config.pi_flow_use_gate = True
-# Optional: Configure ASPP graph propagation neighbors
-# config.aspp_num_neighbors = 8  # Default: 8 (full 8-directional grid)
-# config.aspp_num_neighbors = 4  # Alternative: 4 (cardinal directions only)
 # Create model with π-flow
 model = AsteriskForCausalLM(config)
@@ -374,40 +334,6 @@ This creates a **hierarchical refinement cascade** enabling gradual convergence
 ## Implementation Details
-### ASPP Graph Propagation Configuration
-The ASPP operator supports configurable neighbor connectivity:
-```python
-class ASPPOperator(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        aspp_hidden_dim: Optional[int] = None,
-        num_steps: int = 2,
-        dropout: float = 0.1,
-        num_neighbors: int = 8  # Configurable: 4, 8, etc.
-    ):
-        # Neighbor offsets buffer (8 directions hardcoded)
-        self.register_buffer('neighbor_offsets', torch.tensor([
-            [-1, -1], [-1, 0], [-1, 1],  # ↖ ↑ ↗
-            [0, -1],           [0, 1],   # ← →
-            [1, -1],  [1, 0],  [1, 1]    # ↙ ↓ ↘
-        ]))
-        # Only first num_neighbors are used
-        self.num_neighbors = num_neighbors
-        # Learnable adjacency weights
-        self.adjacency_weights = nn.Parameter(torch.randn(num_neighbors) * 0.1)
-```
-**Usage:**
-- `num_neighbors=8`: Full 8-directional connectivity (default)
-- `num_neighbors=4`: Cardinal directions only (↑←→↓)
-- The buffer always contains 8 offsets, but only the first `num_neighbors` are used
-- Adjacency weights are softmax-normalized for stability
 ### Return Type Handling
 Critical for Transformers compatibility:
@@ -460,41 +386,22 @@ Asterisk-Pi/
 ## Known Issues & Solutions
-### 1. Neighbor Count Configuration (Fixed in Latest Version)
-**Previous issue**: ASPP operator hardcoded 8 neighbors in reshape operations, causing errors when using different neighbor counts.
-**Solution**: Updated implementation to use `self.num_neighbors` dynamically:
-```python
-# Fixed: Dynamic neighbor count
-for offset in self.neighbor_offsets[:self.num_neighbors]:  # Only use first N
-    # ...gather neighbors...
-# Fixed: Dynamic reshape
-aggregated_neighbors = torch.sum(
-    neighbors * adj_weights.view(1, 1, 1, self.num_neighbors, 1),
-    dim=3
-)
-```
-Now supports any neighbor count (4, 8, etc.) without modification.
-### 2. Return Type Errors
 **Issue**: `AttributeError: 'tuple' object has no attribute 'dtype'`
 **Solution**: `HybridASPPAttentionLayer.forward()` must return `torch.Tensor` only, not tuple. This matches the `LlamaDecoderLayer` API in transformers 4.57.6.
-### 3. π-Flow in All Layers vs Final Layer
 **Initial approach**: π-flow only in final layer (limited expressiveness)
 **Current approach**: π-flow in all 30 hybrid layers for maximum refinement capability.
-### 4. Training Stability
 π-Flow can cause instability with high learning rates. Use:
-- Lower learning rate (1e-4 recommended for stability)
 - Gradient clipping (max_norm=1.0)
 - Conservative initial flow scale (0.2-1.0)

     Combines ASPP operator with standard attention
     Components:
+    - ASPP operator: Local structured reasoning
     - Standard attention: Global context
     - Gated fusion: Dynamic balancing
     """
 ```
 **Fusion mechanism:**
 ```
 aspp_out = ASPP(hidden_states)
 ### Training Configuration
 - **Starting Point**: Asterisk checkpoint (base ASPP-Attention model)
+- **Optimizer**: AdamW (lr=5e-4, weight_decay=0.1)
+- **Batch Size**: 2 per device, gradient accumulation=4 (effective batch=8)
+- **Epochs**: 2
 - **Scheduler**: Linear warmup (10% of steps)
 - **Mixed Precision**: bfloat16
 - **Gradient Checkpointing**: Enabled
 aspp_hidden_dim = 256       # Internal dimension (vs 576 model hidden_size)
 aspp_num_steps = 4          # Evolution steps for ASPP
 aspp_dropout = 0.2          # Regularization
 hybrid_layer_indices = None # All 30 layers
 ```
 ## Model Creation from Base Asterisk
 ```python
 config.pi_flow_scale = 1.0
 config.pi_flow_use_gate = True
 # Create model with π-flow
 model = AsteriskForCausalLM(config)
 ## Implementation Details
 ### Return Type Handling
 Critical for Transformers compatibility:
 ## Known Issues & Solutions
+### 1. Return Type Errors
 **Issue**: `AttributeError: 'tuple' object has no attribute 'dtype'`
 **Solution**: `HybridASPPAttentionLayer.forward()` must return `torch.Tensor` only, not tuple. This matches the `LlamaDecoderLayer` API in transformers 4.57.6.
+### 2. π-Flow in All Layers vs Final Layer
 **Initial approach**: π-flow only in final layer (limited expressiveness)
 **Current approach**: π-flow in all 30 hybrid layers for maximum refinement capability.
+### 3. Training Stability
 π-Flow can cause instability with high learning rates. Use:
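+- A carefully chosen learning rate (5e-4 here vs 2e-5 for base; note this is higher than the base rate, so reduce it if training becomes unstable)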
 - Gradient clipping (max_norm=1.0)
 - Conservative initial flow scale (0.2-1.0)

chat_template.jinja CHANGED Viewed

@@ -1,5 +1,5 @@
 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
-You are a helpful AI assistant named Asterisk, trained by NoesisLab<|im_end|>
 ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
 ' + message['content'] + '<|im_end|>' + '
 '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant

 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
+You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
 ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
 ' + message['content'] + '<|im_end|>' + '
 '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant

config.json CHANGED Viewed

@@ -2,9 +2,8 @@
   "architectures": [
     "AsteriskForCausalLM"
   ],
-  "aspp_dropout": 0.1,
   "aspp_hidden_dim": 256,
-  "aspp_num_neighbors": 8,
   "aspp_num_steps": 4,
   "attention_bias": false,
   "attention_dropout": 0.0,

   "architectures": [
     "AsteriskForCausalLM"
   ],
+  "aspp_dropout": 0.2,
   "aspp_hidden_dim": 256,
   "aspp_num_steps": 4,
   "attention_bias": false,
   "attention_dropout": 0.0,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41550e1413295a2b5e02127758543c905650e9c834c6b53e382238f4021f3668
-size 396858360

 version https://git-lfs.github.com/spec/v1
+oid sha256:cd3411332c19c27ac340b99a92d91e0b93f224b62fa3e0cccf7777b4e126b802
+size 381107624

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79668c78f13b1c865f88ffab2a80bc10c893e3262c29261ee8d447ec47b717d3
 size 6353

 version https://git-lfs.github.com/spec/v1
+oid sha256:357a1e8bcbd247f80b9437f6d4dd9e81a29edbafaa6fea075a7380b6927773f4
 size 6353