Upload 14 files

Browse files

Files changed (6) hide show

AsteriskForCausalLM.py +78 -26
README.md +114 -3
chat_template.jinja +1 -1
config.json +2 -1
model.safetensors +2 -2
training_args.bin +1 -1

AsteriskForCausalLM.py CHANGED Viewed

@@ -36,6 +36,7 @@ class AsteriskConfig(LlamaConfig):
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
@@ -48,6 +49,7 @@ class AsteriskConfig(LlamaConfig):
         self.aspp_hidden_dim = aspp_hidden_dim
         self.aspp_num_steps = aspp_num_steps
         self.aspp_dropout = aspp_dropout
         # π-flow config
         self.pi_flow = pi_flow
         self.pi_flow_steps = pi_flow_steps
@@ -57,26 +59,37 @@ class AsteriskConfig(LlamaConfig):
 class ASPPOperator(nn.Module):
     """
-    Asterisk Operator (ASPP) - Point-wise Parallel Propagation
-    Simplified version WITHOUT neighbor gathering to reduce overfitting:
-    - Optional dimensionality reduction for efficiency
-    - Point-wise evolution: h_i^(t+1) = φ(h_i^(t))  [NO neighbors]
-    - Multi-step evolution for depth without added complexity
-    - Dropout for regularization
     Args:
         hidden_size: Dimension of hidden states (input/output)
         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
         num_steps: Number of evolution steps K (default: 2)
         dropout: Dropout rate for regularization (default: 0.1)
     """
-    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1):
         super().__init__()
         self.hidden_size = hidden_size
         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
         self.num_steps = num_steps
         # Projection to lower dimension (if specified)
         self.use_projection = (self.aspp_hidden_dim != hidden_size)
@@ -85,10 +98,9 @@ class ASPPOperator(nn.Module):
             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
             self.proj_dropout = nn.Dropout(dropout)
-        # Point-wise update function φ - NO neighbor gathering
-        # Much smaller: only processes current position
-        self.update_net = nn.Sequential(
-            nn.Linear(self.aspp_hidden_dim, self.aspp_hidden_dim * 2),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
@@ -96,7 +108,6 @@ class ASPPOperator(nn.Module):
         )
         # Learnable K-step parameter
-        # sigmoid(1.0) ≈ 0.73, giving k_steps ≈ 1.5 → 2 steps initially
         self.k_logit = nn.Parameter(torch.tensor(1.0))
         # Learnable residual scale
@@ -105,6 +116,28 @@ class ASPPOperator(nn.Module):
         # Layer norm for stability
         self.norm = nn.LayerNorm(self.aspp_hidden_dim, eps=1e-5)
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """
         Args:
@@ -112,6 +145,8 @@ class ASPPOperator(nn.Module):
         Returns:
             evolved_states: [batch_size, seq_len, hidden_size]
         """
         # Project to lower dimension if needed
         if self.use_projection:
             h_t = self.down_proj(hidden_states)
@@ -122,12 +157,21 @@ class ASPPOperator(nn.Module):
         # Learnable number of steps
         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
-        # K-step point-wise evolution (NO neighbor gathering)
         for t in range(k_steps):
-            # Apply point-wise update rule φ
-            h_t_next = self.update_net(h_t)
-            # Scaled residual connection for stability
             h_t = h_t + self.residual_scale * h_t_next
             h_t = self.norm(h_t)
@@ -153,7 +197,7 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
     4. Feed-forward network
     """
-    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         # Initialize parent LlamaDecoderLayer
         super().__init__(config, layer_idx)
@@ -162,7 +206,8 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
             hidden_size=config.hidden_size,
             aspp_hidden_dim=aspp_hidden_dim,
             num_steps=aspp_num_steps,
-            dropout=aspp_dropout
         )
         # Gated fusion mechanism with dropout
@@ -182,7 +227,8 @@ class HybridASPPAttentionLayer(LlamaDecoderLayer):
                 hidden_size=config.hidden_size,
                 aspp_hidden_dim=aspp_hidden_dim,
                 num_steps=aspp_num_steps,
-                dropout=aspp_dropout
             )
             # Learnable flow scale (per-layer)
@@ -276,7 +322,7 @@ class AsteriskLlamaModel(LlamaModel):
     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
     """
-    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         super().__init__(config)
         # Determine which layers to make hybrid (default: ALL layers)
@@ -295,7 +341,8 @@ class AsteriskLlamaModel(LlamaModel):
                     layer_idx=idx,
                     aspp_hidden_dim=aspp_hidden_dim,
                     aspp_num_steps=aspp_num_steps,
-                    aspp_dropout=aspp_dropout
                 )
         # Initialize weights
@@ -311,7 +358,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
     config_class = AsteriskConfig
-    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
         # Read all ASPP parameters from config if not explicitly provided
         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
             hybrid_layer_indices = config.hybrid_layer_indices
@@ -321,11 +368,13 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_num_steps = config.aspp_num_steps
         if hasattr(config, 'aspp_dropout'):
             aspp_dropout = config.aspp_dropout
         super().__init__(config)
         # Replace model with Asterisk version
-        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
         # Store hybrid layer info in config for serialization
         self.config.hybrid_layer_indices = hybrid_layer_indices
@@ -341,6 +390,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
@@ -357,6 +407,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_hidden_dim: Internal dimension for ASPP (None = use model hidden_size)
             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
             pi_flow: Enable π-flow refinement step (default: False)
             pi_flow_steps: Number of flow refinement steps (default: 1)
             pi_flow_scale: Initial flow scale parameter (default: 0.2)
@@ -373,6 +424,7 @@ class AsteriskForCausalLM(LlamaForCausalLM):
             aspp_hidden_dim=aspp_hidden_dim,
             aspp_num_steps=aspp_num_steps,
             aspp_dropout=aspp_dropout,
             pi_flow=pi_flow,
             pi_flow_steps=pi_flow_steps,
             pi_flow_scale=pi_flow_scale,
@@ -380,15 +432,15 @@ class AsteriskForCausalLM(LlamaForCausalLM):
         )
         # Create Asterisk model
-        asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
         # Transfer weights from base model (non-hybrid layers and embeddings)
         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
-        print(f"✓ Converted base model to Asterisk architecture")
         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
-        print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}")
         if pi_flow:
             print(f"  π-flow enabled: steps={pi_flow_steps}, scale={pi_flow_scale}, gate={pi_flow_use_gate}")

         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
+        aspp_num_neighbors: int = 1,  # Fixed at 1 for Union-Find (only parent)
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
         self.aspp_hidden_dim = aspp_hidden_dim
         self.aspp_num_steps = aspp_num_steps
         self.aspp_dropout = aspp_dropout
+        self.aspp_num_neighbors = aspp_num_neighbors
         # π-flow config
         self.pi_flow = pi_flow
         self.pi_flow_steps = pi_flow_steps
 class ASPPOperator(nn.Module):
     """
+    Asterisk Operator (ASPP) - Union-Find Graph Propagation
+    Uses Union-Find (Disjoint Set Union) structure for dynamic parent connections:
+    - Each position maintains a parent pointer: parent[i]
+    - Initial structure: parent[i] = max(0, i-1) (linear chain)
+    - Message passing: aggregate self + parent features
+    - Can apply path compression for optimization
+    Advantages:
+    - O(n) complexity with simple indexing
+    - Dynamic grouping of related positions
+    - Efficient parent-only propagation (no complex gather)
+    - Nearly constant time find with path compression
+    Complexity: O(n) with α(n) ≈ O(1) per operation
+    Message passing: h_i^(t+1) = φ(h_i^(t), h_parent[i])
     Args:
         hidden_size: Dimension of hidden states (input/output)
         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
         num_steps: Number of evolution steps K (default: 2)
         dropout: Dropout rate for regularization (default: 0.1)
+        num_neighbors: Fixed at 1 (only parent) for Union-Find structure
     """
+    def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1, num_neighbors: int = 1):
         super().__init__()
         self.hidden_size = hidden_size
         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
         self.num_steps = num_steps
+        self.num_neighbors = 1  # Fixed: only parent
         # Projection to lower dimension (if specified)
         self.use_projection = (self.aspp_hidden_dim != hidden_size)
             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
             self.proj_dropout = nn.Dropout(dropout)
+        # Message aggregation function: combines self + parent
+        self.message_net = nn.Sequential(
+            nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim * 2),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
         )
         # Learnable K-step parameter
         self.k_logit = nn.Parameter(torch.tensor(1.0))
         # Learnable residual scale
         # Layer norm for stability
         self.norm = nn.LayerNorm(self.aspp_hidden_dim, eps=1e-5)
+    def compute_parent_indices(self, seq_len: int, device) -> torch.Tensor:
+        """
+        Compute parent index for each position using Union-Find structure
+        Simple implementation: parent[i] = i-1 (linear chain)
+        - Position 0 points to itself (root)
+        - All others point to previous position
+        Can be extended with dynamic union operations based on:
+        - Semantic similarity
+        - Positional heuristics
+        - Learned grouping
+        Returns: [seq_len] tensor of parent indices
+        """
+        # Initialize: parent[i] = max(0, i-1)
+        parent_indices = torch.arange(seq_len, device=device) - 1
+        parent_indices[0] = 0  # Root points to itself
+        parent_indices = torch.clamp(parent_indices, 0, seq_len - 1)
+        return parent_indices
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """
         Args:
         Returns:
             evolved_states: [batch_size, seq_len, hidden_size]
         """
+        batch_size, seq_len, _ = hidden_states.shape
         # Project to lower dimension if needed
         if self.use_projection:
             h_t = self.down_proj(hidden_states)
         # Learnable number of steps
         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
+        # K-step Union-Find graph propagation
         for t in range(k_steps):
+            # 1. Compute parent indices using Union-Find structure
+            parent_indices = self.compute_parent_indices(seq_len, h_t.device)  # [L]
+            # 2. Gather parent features (super simple indexing!)
+            # h_t: [B, L, D], parent_indices: [L]
+            # Just gather from parent positions
+            parent_features = h_t[:, parent_indices, :]  # [B, L, D]
+            # 3. Message passing: combine self + parent
+            message_input = torch.cat([h_t, parent_features], dim=-1)  # [B, L, 2D]
+            h_t_next = self.message_net(message_input)  # [B, L, D]
+            # 4. Scaled residual connection for stability
             h_t = h_t + self.residual_scale * h_t_next
             h_t = self.norm(h_t)
     4. Feed-forward network
     """
+    def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 1):
         # Initialize parent LlamaDecoderLayer
         super().__init__(config, layer_idx)
             hidden_size=config.hidden_size,
             aspp_hidden_dim=aspp_hidden_dim,
             num_steps=aspp_num_steps,
+            dropout=aspp_dropout,
+            num_neighbors=aspp_num_neighbors
         )
         # Gated fusion mechanism with dropout
                 hidden_size=config.hidden_size,
                 aspp_hidden_dim=aspp_hidden_dim,
                 num_steps=aspp_num_steps,
+                dropout=aspp_dropout,
+                num_neighbors=aspp_num_neighbors
             )
             # Learnable flow scale (per-layer)
     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
     """
+    def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 2):
         super().__init__(config)
         # Determine which layers to make hybrid (default: ALL layers)
                     layer_idx=idx,
                     aspp_hidden_dim=aspp_hidden_dim,
                     aspp_num_steps=aspp_num_steps,
+                    aspp_dropout=aspp_dropout,
+                    aspp_num_neighbors=aspp_num_neighbors
                 )
         # Initialize weights
     config_class = AsteriskConfig
+    def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1, aspp_num_neighbors: int = 2):
         # Read all ASPP parameters from config if not explicitly provided
         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
             hybrid_layer_indices = config.hybrid_layer_indices
             aspp_num_steps = config.aspp_num_steps
         if hasattr(config, 'aspp_dropout'):
             aspp_dropout = config.aspp_dropout
+        if hasattr(config, 'aspp_num_neighbors'):
+            aspp_num_neighbors = config.aspp_num_neighbors
         super().__init__(config)
         # Replace model with Asterisk version
+        self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)
         # Store hybrid layer info in config for serialization
         self.config.hybrid_layer_indices = hybrid_layer_indices
         aspp_hidden_dim: Optional[int] = None,
         aspp_num_steps: int = 2,
         aspp_dropout: float = 0.1,
+        aspp_num_neighbors: int = 1,  # Fixed at 1 for Union-Find (only parent)
         # π-flow parameters
         pi_flow: bool = False,
         pi_flow_steps: int = 1,
             aspp_hidden_dim: Internal dimension for ASPP (None = use model hidden_size)
             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
+            aspp_num_neighbors: Number of neighbors for Union-Find (fixed at 1: only parent)
             pi_flow: Enable π-flow refinement step (default: False)
             pi_flow_steps: Number of flow refinement steps (default: 1)
             pi_flow_scale: Initial flow scale parameter (default: 0.2)
             aspp_hidden_dim=aspp_hidden_dim,
             aspp_num_steps=aspp_num_steps,
             aspp_dropout=aspp_dropout,
+            aspp_num_neighbors=aspp_num_neighbors,
             pi_flow=pi_flow,
             pi_flow_steps=pi_flow_steps,
             pi_flow_scale=pi_flow_scale,
         )
         # Create Asterisk model
+        asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout, aspp_num_neighbors)
         # Transfer weights from base model (non-hybrid layers and embeddings)
         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
+        print(f"✓ Converted base model to Asterisk architecture with Graph Propagation")
         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
+        print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}, neighbors={aspp_num_neighbors}")
         if pi_flow:
             print(f"  π-flow enabled: steps={pi_flow_steps}, scale={pi_flow_scale}, gate={pi_flow_use_gate}")

README.md CHANGED Viewed

@@ -100,18 +100,129 @@ class HybridASPPAttentionLayer:
     Combines ASPP operator with standard attention
     Components:
-    - ASPP operator: Local structured reasoning
     - Standard attention: Global context
     - Gated fusion: Dynamic balancing
     """
 ```
 **Fusion mechanism:**
 ```
-aspp_out = ASPP(hidden_states)
-attn_out = Attention(hidden_states, mask, ...)
 gate = sigmoid(linear([aspp_out || attn_out]))
 fused = gate * aspp_out + (1 - gate) * attn_out
 ```
 ### 2. π-Flow Refinement (Per-Layer)

     Combines ASPP operator with standard attention
     Components:
+    - ASPP operator: Local structured reasoning with Union-Find graph propagation
     - Standard attention: Global context
     - Gated fusion: Dynamic balancing
     """
 ```
+#### ASPP Operator: Union-Find Graph Propagation
+The ASPP operator uses a **Union-Find (Disjoint Set Union)**-style parent-pointer structure for efficient graph-based message passing. Whereas traditional attention costs O(n²) and a skip-list costs O(n log n), this parent-only propagation costs **O(n) per step, with one constant-time parent lookup per position**.
+**Graph Structure - Union-Find Parent Chain:**
+```
+Position:  [0]  [1]  [2]  [3]  [4]  [5]  ...  [n-1]
+Parent:    [0] ← 0  ← 1  ← 2  ← 3  ← 4  ...  ← n-2
+           (root)
+- Position 0: points to itself (root of the tree)
+- Position i (i>0): points to position i-1 (parent)
+- Forms a linear chain structure for sequential token relationships
+```
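+This creates a **directed acyclic graph (DAG)** (apart from the root's pointer to itself) in which each position reads from its parent, so information flows from parents to children, i.e. from earlier to later positions, naturally capturing left-to-right sequential dependencies in language modeling.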
+**Graph Propagation Aggregation:**
+Each ASPP evolution step performs parent-based message passing:
+```python
+# Pseudocode for one ASPP propagation step (every position reads the previous step's states; the real code updates all positions in parallel)
+for position i in sequence:
+    # 1. Find parent using Union-Find structure
+    parent_idx = compute_parent_indices()[i]  # O(1) lookup: static chain, parent[i] = max(0, i-1) (no path compression yet)
+    # 2. Gather parent features
+    parent_features = hidden_states[parent_idx]
+    # 3. Message aggregation: combine self + parent
+    message_input = concat([hidden_states[i], parent_features])
+    # 4. Update via learned transformation
+    new_state = message_net(message_input)  # 2-layer MLP
+    # 5. Scaled residual connection
+    hidden_states[i] = hidden_states[i] + residual_scale * new_state
+    hidden_states[i] = layer_norm(hidden_states[i])
+```
+**Key properties of Union-Find propagation:**
+1. **O(n) Complexity**: Each position performs exactly one parent lookup and one aggregation
+   - No expensive attention computation (O(n²))
+   - No multi-level skip connections (O(n log n))
+   - Simple indexing operation: `parent_features = h[parent_indices]`
+2. **Hierarchical Information Flow**: After K steps, position i can access information from positions [i-K, i]
+   - K=1: immediate parent only
+   - K=2: grandparent (2 positions back)
+   - K=4 (the `aspp_num_steps` value in this model's config.json; the code default is 2): great-great-grandparent (4 positions back)
+   - Information propagates through the chain structure
+3. **Learnable Aggregation**: The `message_net` MLP learns how to combine self and parent features
+   - Input: `[self_features || parent_features]` (2D dimensions)
+   - Output: `D` dimensional update vector
+   - Dropout regularization for robustness
+4. **Path Compression Potential**: Can extend to dynamic parent reassignment
+   - Current implementation: static `parent[i] = i-1` chain
+   - Future extension: learn parent assignments based on semantic similarity
+   - Enables adaptive graph structure during forward pass
+**Union-Find vs. Other Graph Structures:**
+| Structure | Complexity | Receptive Field | Connections per Node |
+|-----------|------------|-----------------|----------------------|
+| **Full Attention** | O(n²) | Global | n-1 (all positions) |
+| **Skip-List** | O(n log n) | Multi-scale | O(log n) (multiple levels) |
+| **Union-Find** | O(n) | Local chain | 1 (parent only) |
+| **Dilated Conv** | O(n·k) | Sparse | k (fixed window) |
+Union-Find achieves the **lowest complexity** while maintaining effective information propagation through iterative K-step evolution.
+**Theoretical Foundation - Union-Find in Graph Algorithms:**
+Union-Find is a classic data structure for disjoint set operations:
+- **Find**: Determine which set an element belongs to (amortized O(α(n)) ≈ O(1) with path compression and union by rank)
+- **Union**: Merge two sets into one
+- **Applications**: Kruskal's MST algorithm, connected components, cycle detection
+In Asterisk-Pi:
+- Each token position is a node in the graph
+- Parent pointers define the tree structure
+- Message passing simulates "Find" operations (traversing to ancestors)
+- Can extend to dynamic "Union" operations (merging related tokens)
+**Multi-Step Propagation:**
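+With K=4 executed evolution steps (the code runs `max(1, int(sigmoid(k_logit) * aspp_num_steps))` steps, which can be fewer than the configured `aspp_num_steps`), information flow becomes: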
+```
+Step 1: Position i accesses parent i-1
+Step 2: Position i now has information from i-2 (via i-1)
+Step 3: Position i now has information from i-3 (propagated through chain)
+Step 4: Position i now has information from i-4 (fully propagated)
+Result: Each position has aggregated context from 4 previous positions
+        through efficient O(n) operations
+```
+This multi-step propagation is crucial for:
+- **Local context**: Recent tokens for coherence
+- **Gradient flow**: Direct paths for backpropagation
+- **Efficiency**: Linear cost instead of quadratic attention
 **Fusion mechanism:**
 ```
+aspp_out = ASPP(hidden_states)            # Union-Find graph propagation (O(n))
+attn_out = Attention(hidden_states, mask, ...)  # Global attention (O(n²))
 gate = sigmoid(linear([aspp_out || attn_out]))
 fused = gate * aspp_out + (1 - gate) * attn_out
+# Combines:
+# - Local structured reasoning (ASPP via Union-Find)
+# - Global contextual awareness (Attention)
 ```
 ### 2. π-Flow Refinement (Per-Layer)

chat_template.jinja CHANGED Viewed

@@ -1,5 +1,5 @@
 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
-You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
 ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
 ' + message['content'] + '<|im_end|>' + '
 '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant

 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
+You are a helpful AI assistant named Asterisk, trained by NoesisLab<|im_end|>
 ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
 ' + message['content'] + '<|im_end|>' + '
 '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant

config.json CHANGED Viewed

@@ -2,8 +2,9 @@
   "architectures": [
     "AsteriskForCausalLM"
   ],
-  "aspp_dropout": 0.2,
   "aspp_hidden_dim": 256,
   "aspp_num_steps": 4,
   "attention_bias": false,
   "attention_dropout": 0.0,

   "architectures": [
     "AsteriskForCausalLM"
   ],
+  "aspp_dropout": 0.1,
   "aspp_hidden_dim": 256,
+  "aspp_num_neighbors": 1,
   "aspp_num_steps": 4,
   "attention_bias": false,
   "attention_dropout": 0.0,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd3411332c19c27ac340b99a92d91e0b93f224b62fa3e0cccf7777b4e126b802
-size 381107624

 version https://git-lfs.github.com/spec/v1
+oid sha256:c7c7f75c4ede6e9a2f8ef54b5c5c5b0d29c773eda3c8467426fb957edf075bb5
+size 396836528

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:357a1e8bcbd247f80b9437f6d4dd9e81a29edbafaa6fea075a7380b6927773f4
 size 6353

 version https://git-lfs.github.com/spec/v1
+oid sha256:4bf59bb69a5946540bcc9c4c08fc6cf0c903b2923e07239b606e38965dcb26a5
 size 6353