OzTianlu committed
Commit e6277d9 · verified · 1 Parent(s): 9af979a

Upload 12 files
AsteriskForCausalLM.py ADDED
@@ -0,0 +1,376 @@
+ """
+ Hybrid ASPP-Attention Architecture (Asterisk Model)
+ Combines Adjacency-Structured Parallel Propagation (ASPP) with standard attention mechanisms
+ to enhance model expressiveness while maintaining efficiency.
+
+ Architecture Design:
+ - Hybrid layers: Standard attention + ASPP operator in parallel
+ - Gate mechanism for dynamic fusion
+ - Knowledge distillation from SmolLM2-135M base model
+ """
+
+ import torch
+ import torch.nn as nn
+ from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
+ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+ from transformers import AutoConfig, AutoModelForCausalLM
+ from typing import Optional, Tuple, List
+
+
+ class AsteriskConfig(LlamaConfig):
+     """
+     Configuration class for the Asterisk model.
+     Inherits from LlamaConfig with a custom model_type.
+     """
+     model_type = "asterisk"
+
+     def __init__(
+         self,
+         hybrid_layer_indices: Optional[List[int]] = None,
+         aspp_hidden_dim: Optional[int] = None,
+         aspp_num_steps: int = 2,
+         aspp_dropout: float = 0.1,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.hybrid_layer_indices = hybrid_layer_indices
+         self.aspp_hidden_dim = aspp_hidden_dim
+         self.aspp_num_steps = aspp_num_steps
+         self.aspp_dropout = aspp_dropout
+
+
+ class ASPPOperator(nn.Module):
+     """
+     Asterisk Operator (ASPP) - Point-wise Parallel Propagation
+
+     Simplified version WITHOUT neighbor gathering to reduce overfitting:
+     - Optional dimensionality reduction for efficiency
+     - Point-wise evolution: h_i^(t+1) = φ(h_i^(t)) [NO neighbors]
+     - Multi-step evolution for depth without added complexity
+     - Dropout for regularization
+
+     Args:
+         hidden_size: Dimension of hidden states (input/output)
+         aspp_hidden_dim: Internal dimension for ASPP (default: None, use hidden_size)
+         num_steps: Number of evolution steps K (default: 2)
+         dropout: Dropout rate for regularization (default: 0.1)
+     """
+
+     def __init__(self, hidden_size: int, aspp_hidden_dim: Optional[int] = None, num_steps: int = 2, dropout: float = 0.1):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.aspp_hidden_dim = aspp_hidden_dim or hidden_size
+         self.num_steps = num_steps
+
+         # Projection to a lower dimension (if specified)
+         self.use_projection = (self.aspp_hidden_dim != hidden_size)
+         if self.use_projection:
+             self.down_proj = nn.Linear(hidden_size, self.aspp_hidden_dim)
+             self.up_proj = nn.Linear(self.aspp_hidden_dim, hidden_size)
+             self.proj_dropout = nn.Dropout(dropout)
+
+         # Point-wise update function φ - NO neighbor gathering
+         # Much smaller: only processes the current position
+         self.update_net = nn.Sequential(
+             nn.Linear(self.aspp_hidden_dim, self.aspp_hidden_dim * 2),
+             nn.SiLU(),
+             nn.Dropout(dropout),
+             nn.Linear(self.aspp_hidden_dim * 2, self.aspp_hidden_dim),
+             nn.Dropout(dropout),
+         )
+
+         # Learnable K-step parameter
+         # sigmoid(1.0) ≈ 0.73; with num_steps=2 this gives int(1.46) = 1 step initially
+         self.k_logit = nn.Parameter(torch.tensor(1.0))
+
+         # Learnable residual scale
+         self.residual_scale = nn.Parameter(torch.tensor(0.1))
+
+         # Layer norm for stability
+         self.norm = nn.LayerNorm(self.aspp_hidden_dim, eps=1e-5)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         """
+         Args:
+             hidden_states: [batch_size, seq_len, hidden_size]
+         Returns:
+             evolved_states: [batch_size, seq_len, hidden_size]
+         """
+         # Project to the lower dimension if needed
+         if self.use_projection:
+             h_t = self.down_proj(hidden_states)
+             h_t = self.proj_dropout(h_t)
+         else:
+             h_t = hidden_states
+
+         # Learnable number of steps
+         k_steps = max(1, int(torch.sigmoid(self.k_logit) * self.num_steps))
+
+         # K-step point-wise evolution (NO neighbor gathering)
+         for t in range(k_steps):
+             # Apply the point-wise update rule φ
+             h_t_next = self.update_net(h_t)
+
+             # Scaled residual connection for stability
+             h_t = h_t + self.residual_scale * h_t_next
+             h_t = self.norm(h_t)
+
+         # Project back to the original dimension if needed
+         if self.use_projection:
+             h_t = self.up_proj(h_t)
+             h_t = self.proj_dropout(h_t)
+
+         return h_t
+
+
+ class HybridASPPAttentionLayer(LlamaDecoderLayer):
+     """
+     Hybrid layer combining the ASPP operator and standard attention.
+     Inherits from LlamaDecoderLayer to maintain compatibility.
+
+     Architecture:
+     1. Parallel branches:
+        - ASPP operator for local structured reasoning
+        - Standard LlamaAttention for global context
+     2. Gated fusion of both outputs
+     3. Feed-forward network
+     """
+
+     def __init__(self, config: LlamaConfig, layer_idx: int, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
+         # Initialize the parent LlamaDecoderLayer
+         super().__init__(config, layer_idx)
+
+         # Add the ASPP branch
+         self.aspp_operator = ASPPOperator(
+             hidden_size=config.hidden_size,
+             aspp_hidden_dim=aspp_hidden_dim,
+             num_steps=aspp_num_steps,
+             dropout=aspp_dropout
+         )
+
+         # Gated fusion mechanism with dropout
+         self.fusion_gate = nn.Sequential(
+             nn.Linear(config.hidden_size * 2, config.hidden_size),
+             nn.Dropout(aspp_dropout),
+             nn.Sigmoid()
+         )
+
+         # Initialize the gate to be balanced (output 0.5 initially)
+         with torch.no_grad():
+             self.fusion_gate[0].bias.fill_(0.0)  # sigmoid(0) = 0.5
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values=None,
+         use_cache: Optional[bool] = False,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         **kwargs,
+     ) -> torch.Tensor:
+         """
+         Override LlamaDecoderLayer.forward to add the ASPP branch.
+         Returns a single tensor to match the LlamaDecoderLayer API in transformers 4.57.6.
+         """
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+
+         # ASPP branch
+         aspp_output = self.aspp_operator(hidden_states)
+
+         # Attention branch - use the parent's self_attn
+         attn_outputs = self.self_attn(
+             hidden_states,
+             position_embeddings,
+             attention_mask=attention_mask,
+             past_key_values=past_key_values,
+             cache_position=cache_position,
+         )
+         attn_output = attn_outputs[0]
+
+         # Gated fusion
+         fusion_input = torch.cat([aspp_output, attn_output], dim=-1)
+         gate = self.fusion_gate(fusion_input)
+
+         # Combine with gating: gate * ASPP + (1 - gate) * Attention
+         fused_output = gate * aspp_output + (1 - gate) * attn_output
+
+         # Residual connection
+         hidden_states = residual + fused_output
+
+         # MLP block (use the parent's mlp)
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+
+         # Return a single tensor like LlamaDecoderLayer
+         return hidden_states
+
+
+ class AsteriskLlamaModel(LlamaModel):
+     """
+     Asterisk-Llama model with the full hybrid ASPP-Attention architecture.
+
+     All layers use hybrid ASPP+Attention by default for maximum expressiveness.
+     """
+
+     def __init__(self, config: LlamaConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
+         super().__init__(config)
+
+         # Determine which layers to make hybrid (default: ALL layers)
+         if hybrid_layer_indices is None:
+             # Use ALL layers as hybrid (full hybrid architecture)
+             num_layers = config.num_hidden_layers
+             hybrid_layer_indices = list(range(num_layers))
+
+         self.hybrid_layer_indices = hybrid_layer_indices
+
+         # Replace the specified layers with hybrid layers
+         for idx in hybrid_layer_indices:
+             if idx < len(self.layers):
+                 self.layers[idx] = HybridASPPAttentionLayer(
+                     config,
+                     layer_idx=idx,
+                     aspp_hidden_dim=aspp_hidden_dim,
+                     aspp_num_steps=aspp_num_steps,
+                     aspp_dropout=aspp_dropout
+                 )
+
+         # Initialize weights
+         self.post_init()
+
+
+ class AsteriskForCausalLM(LlamaForCausalLM):
+     """
+     Asterisk Causal LM with the Hybrid ASPP-Attention architecture.
+
+     Registered as: AsteriskForCausalLM
+     """
+
+     config_class = AsteriskConfig
+
+     def __init__(self, config: AsteriskConfig, hybrid_layer_indices: Optional[List[int]] = None, aspp_hidden_dim: Optional[int] = None, aspp_num_steps: int = 2, aspp_dropout: float = 0.1):
+         # Read all ASPP parameters from the config if not explicitly provided
+         if hybrid_layer_indices is None and hasattr(config, 'hybrid_layer_indices'):
+             hybrid_layer_indices = config.hybrid_layer_indices
+         if aspp_hidden_dim is None and hasattr(config, 'aspp_hidden_dim'):
+             aspp_hidden_dim = config.aspp_hidden_dim
+         if hasattr(config, 'aspp_num_steps'):
+             aspp_num_steps = config.aspp_num_steps
+         if hasattr(config, 'aspp_dropout'):
+             aspp_dropout = config.aspp_dropout
+
+         super().__init__(config)
+
+         # Replace the inner model with the Asterisk version
+         self.model = AsteriskLlamaModel(config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
+
+         # Store hybrid layer info in the config for serialization
+         self.config.hybrid_layer_indices = hybrid_layer_indices
+
+         # Initialize weights
+         self.post_init()
+
+     @classmethod
+     def from_pretrained_base(
+         cls,
+         base_model_path: str,
+         hybrid_layer_indices: Optional[List[int]] = None,
+         aspp_hidden_dim: Optional[int] = None,
+         aspp_num_steps: int = 2,
+         aspp_dropout: float = 0.1,
+         **kwargs
+     ):
+         """
+         Load a base model and convert it to the Asterisk architecture.
+
+         Args:
+             base_model_path: Path to the base SmolLM2 model
+             hybrid_layer_indices: Which layers to make hybrid (None for all)
+             aspp_hidden_dim: Internal dimension for ASPP (None = use the model hidden_size)
+             aspp_num_steps: Number of evolution steps K for ASPP (default: 2)
+             aspp_dropout: Dropout rate for ASPP regularization (default: 0.1)
+         """
+         # Load the base model
+         base_model = LlamaForCausalLM.from_pretrained(base_model_path, **kwargs)
+         base_config = base_model.config
+
+         # Create an Asterisk config from the base config with the ASPP params
+         asterisk_config = AsteriskConfig(
+             **base_config.to_dict(),
+             hybrid_layer_indices=hybrid_layer_indices,
+             aspp_hidden_dim=aspp_hidden_dim,
+             aspp_num_steps=aspp_num_steps,
+             aspp_dropout=aspp_dropout
+         )
+
+         # Create the Asterisk model
+         asterisk_model = cls(asterisk_config, hybrid_layer_indices, aspp_hidden_dim, aspp_num_steps, aspp_dropout)
+
+         # Transfer weights from the base model (non-hybrid layers and embeddings)
+         asterisk_model.load_state_dict(base_model.state_dict(), strict=False)
+
+         print("✓ Converted base model to Asterisk architecture")
+         print(f"  Hybrid layers: {asterisk_model.model.hybrid_layer_indices}")
+         aspp_dim_str = f"{aspp_hidden_dim}" if aspp_hidden_dim else f"{base_config.hidden_size} (full)"
+         print(f"  ASPP config: dim={aspp_dim_str}, steps={aspp_num_steps}, dropout={aspp_dropout}")
+
+         return asterisk_model, base_model
+
+
+ # Register the model for AutoModel
+ AutoConfig.register("asterisk", AsteriskConfig)
+ AutoModelForCausalLM.register(AsteriskConfig, AsteriskForCausalLM)
+
+
+ def get_model_info(model):
+     """Print model architecture information"""
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+     print(f"  • Total parameters: {total_params:,}")
+     print(f"  • Trainable parameters: {trainable_params:,}")
+     print(f"  • Model size: {total_params * 4 / 1024**2:.2f} MB (fp32)")
+
+     if isinstance(model, AsteriskForCausalLM):
+         print(f"  • Hybrid layer indices: {model.model.hybrid_layer_indices}")
+         print(f"  • Number of hybrid layers: {len(model.model.hybrid_layer_indices)}")
+
+
+ # Example usage
+ if __name__ == "__main__":
+     print("=" * 80)
+     print("Asterisk Architecture - ASPP + Standard Attention")
+     print("=" * 80)
+
+     # Configuration
+     base_model_path = "SmolLM2-135M-Instruct"
+
+     # Create the Asterisk model
+     print("\n🔧 Creating Asterisk model...")
+     asterisk_model, base_model = AsteriskForCausalLM.from_pretrained_base(
+         base_model_path,
+         hybrid_layer_indices=None,   # None selects ALL layers (full hybrid)
+         aspp_num_steps=2,            # Reduced from 3
+         aspp_dropout=0.1,            # Added dropout
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+     )
+
+     print("\n📊 Base model info:")
+     get_model_info(base_model)
+
+     print("\n📊 Asterisk model info:")
+     get_model_info(asterisk_model)
+
+     print("\n✨ Model ready for training!")
README.md CHANGED
@@ -1,3 +1,324 @@
- ---
- license: apache-2.0
- ---
+ ---
+ library_name: transformers
+ model_name: Asterisk-135M
+ base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+ tags:
+ - aspp
+ - hybrid-architecture
+ - graph-reasoning
+ - sft
+ - trl
+ license: apache-2.0
+ language:
+ - en
+ ---
+
+ # Asterisk-135M: Hybrid ASPP-Attention Architecture
+
+ **Asterisk** is a research implementation that combines the **ASPP (Adjacency-Structured Parallel Propagation)** operator with standard attention mechanisms to enhance the SmolLM2-135M model. Its hybrid architecture fuses graph-based local reasoning (ASPP) with global attention for improved expressiveness on structured reasoning tasks.
+
+ ## Model Description
+
+ - **Base Model**: [SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct)
+ - **Architecture**: Hybrid ASPP-Attention (30 hybrid layers)
+ - **Parameters**: 171.2M (35M additional ASPP parameters)
+ - **Training**: Supervised fine-tuning on the Capybara dataset
+ - **Framework**: Transformers 4.57.6, TRL 0.27.0
+
+ ### Key Innovation: The Asterisk Operator (★-operator)
+
+ The **Asterisk Operator** performs local parallel state evolution through point-wise transformations:
+
+ ```
+ h_i^(t+1) = φ(h_i^(t))   [K-step iterative evolution]
+ ```
+
+ The result is then gated and fused with the standard Llama attention output:
+
+ ```
+ output = gate * ASPP(x) + (1 - gate) * Attention(x)
+ ```
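+
+ Concretely, the gate is a per-position, per-channel sigmoid over the concatenated branch outputs, exactly as in `HybridASPPAttentionLayer`. A minimal sketch of the fusion step (shapes assumed to be `[batch, seq_len, hidden]`; dropout omitted):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ hidden = 576
+ fusion_gate = nn.Sequential(nn.Linear(hidden * 2, hidden), nn.Sigmoid())
+
+ aspp_out = torch.randn(1, 8, hidden)   # stand-in for ASPP(x)
+ attn_out = torch.randn(1, 8, hidden)   # stand-in for Attention(x)
+
+ gate = fusion_gate(torch.cat([aspp_out, attn_out], dim=-1))  # values in (0, 1)
+ fused = gate * aspp_out + (1 - gate) * attn_out
+ ```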
+
+ ## Architecture
+
+ ### 1. ASPPOperator (Point-wise Parallel Propagation)
+
+ ```python
+ class ASPPOperator:
+     """
+     Simplified ASPP without neighbor gathering to reduce overfitting
+
+     Forward pass:
+     1. Optional dimensionality reduction: h_t = down_proj(hidden_states)
+     2. K-step evolution: h_t = h_t + α * φ(h_t)   [K times]
+     3. Layer normalization after each step
+     4. Optional projection back: output = up_proj(h_t)
+
+     Parameters:
+     - hidden_size: 576 (model dimension)
+     - aspp_hidden_dim: 256 (internal ASPP dimension)
+     - aspp_num_steps: 8 (evolution iterations)
+     - aspp_dropout: 0.2
+     """
+ ```
+
+ **Pseudocode:**
+ ```
+ function ASPP(hidden_states):
+     # Optional dimensionality reduction
+     if use_projection:
+         h_t ← down_proj(hidden_states)
+         h_t ← dropout(h_t)
+     else:
+         h_t ← hidden_states
+
+     # Learnable number of steps
+     k_steps ← max(1, int(sigmoid(k_logit) * num_steps))
+
+     # K-step point-wise evolution
+     for t = 1 to k_steps:
+         # Point-wise update: φ(h_t) = MLP(h_t)
+         h_t_next ← update_net(h_t)
+
+         # Scaled residual connection
+         h_t ← h_t + residual_scale * h_t_next
+         h_t ← layer_norm(h_t)
+
+     # Project back to the original dimension
+     if use_projection:
+         h_t ← up_proj(h_t)
+         h_t ← dropout(h_t)
+
+     return h_t
+ ```
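+
+ For a quick shape check, the operator can be exercised on its own. A small sketch using the `ASPPOperator` class shipped in `AsteriskForCausalLM.py` with this checkpoint's hyperparameters (random input, eval mode so dropout is inactive):
+
+ ```python
+ import torch
+ from AsteriskForCausalLM import ASPPOperator
+
+ op = ASPPOperator(hidden_size=576, aspp_hidden_dim=256, num_steps=8, dropout=0.2).eval()
+
+ x = torch.randn(2, 16, 576)   # [batch_size, seq_len, hidden_size]
+ with torch.no_grad():
+     y = op(x)
+ assert y.shape == x.shape     # ASPP preserves the model dimension
+ ```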
+
+ ### 2. HybridASPPAttentionLayer
+
+ ```python
+ class HybridASPPAttentionLayer(LlamaDecoderLayer):
+     """
+     Extends LlamaDecoderLayer with a parallel ASPP branch
+
+     Architecture:
+     1. Input LayerNorm
+     2. Parallel branches:
+        - ASPP operator for local structured reasoning
+        - Standard LlamaAttention for global context
+     3. Gated fusion: gate * ASPP + (1 - gate) * Attention
+     4. Residual connection
+     5. Feed-forward MLP
+     """
+ ```
+
+ **Pseudocode:**
+ ```
+ function HybridLayer(hidden_states, attention_mask, ...):
+     residual ← hidden_states
+     hidden_states ← input_layernorm(hidden_states)
+
+     # Parallel branches
+     aspp_output ← aspp_operator(hidden_states)
+     attn_output ← self_attention(hidden_states, attention_mask, ...)
+
+     # Gated fusion
+     fusion_input ← concat([aspp_output, attn_output])
+     gate ← sigmoid(linear(dropout(fusion_input)))
+     fused_output ← gate * aspp_output + (1 - gate) * attn_output
+
+     # Residual connection
+     hidden_states ← residual + fused_output
+
+     # MLP block
+     residual ← hidden_states
+     hidden_states ← post_attention_layernorm(hidden_states)
+     hidden_states ← mlp(hidden_states)
+     hidden_states ← residual + hidden_states
+
+     return hidden_states
+ ```
+
+ ### 3. AsteriskForCausalLM
+
+ ```python
+ class AsteriskForCausalLM(LlamaForCausalLM):
+     """
+     Main model class with custom model_type "asterisk"
+
+     Configuration:
+     - hybrid_layer_indices: None (all 30 layers are hybrid)
+     - aspp_hidden_dim: 256 (reduces overfitting)
+     - aspp_num_steps: 8 (learnable, actual steps ≈ 6)
+     - aspp_dropout: 0.2
+     """
+ ```
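+
+ A minimal sketch of instantiating the architecture directly from its config (hyperparameters taken from this checkpoint's `config.json`; weights are randomly initialized, so this is only a structural check, not a way to load the trained model):
+
+ ```python
+ from AsteriskForCausalLM import AsteriskConfig, AsteriskForCausalLM
+
+ config = AsteriskConfig(
+     hidden_size=576,
+     num_hidden_layers=30,
+     num_attention_heads=9,
+     num_key_value_heads=3,
+     intermediate_size=1536,
+     vocab_size=49152,
+     hybrid_layer_indices=None,   # None = all layers hybrid
+     aspp_hidden_dim=256,
+     aspp_num_steps=8,
+     aspp_dropout=0.2,
+ )
+ model = AsteriskForCausalLM(config)  # randomly initialized
+ ```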
+
+ ## Evaluation Results
+
+ Evaluated with LM-Evaluation-Harness using `limit=50` per task:
+
+ | Task | Metric | Score | Stderr |
+ |------|--------|-------|--------|
+ | **MMLU** | acc | **0.2376** | ±0.0037 |
+ | - Humanities | acc | 0.2472 | ±0.0067 |
+ | - STEM | acc | 0.2245 | ±0.0074 |
+ | - Social Sciences | acc | 0.2327 | ±0.0076 |
+ | - Other | acc | 0.2430 | ±0.0077 |
+ | **GSM8K** | exact_match | **0.0240** | ±0.0048 |
+ | **HellaSwag** | acc_norm | **0.4430** | ±0.0157 |
+ | **ARC-Easy** | acc_norm | **0.5450** | ±0.0158 |
+ | **PIQA** | acc_norm | **0.6770** | ±0.0148 |
+ | **WinoGrande** | acc | **0.5210** | ±0.0158 |
+
+ **Note**: These are preliminary results with sample limits. Full evaluation is pending.
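+
+ The exact harness invocation was not recorded with the checkpoint; a command along these lines should reproduce the setup (flags per the current lm-evaluation-harness CLI):
+
+ ```bash
+ lm_eval --model hf \
+   --model_args pretrained=path/to/Asterisk,trust_remote_code=True,dtype=bfloat16 \
+   --tasks mmlu,gsm8k,hellaswag,arc_easy,piqa,winogrande \
+   --limit 50
+ ```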
+
+ ## Quick Start
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+
+ # Load the model and tokenizer
+ model = AutoModelForCausalLM.from_pretrained(
+     "path/to/Asterisk",
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ tokenizer = AutoTokenizer.from_pretrained("path/to/Asterisk")
+
+ # Generate text
+ messages = [{"role": "user", "content": "Explain quantum computing in simple terms."}]
+ inputs = tokenizer.apply_chat_template(
+     messages, add_generation_prompt=True, return_tensors="pt"
+ ).to(model.device)
+
+ outputs = model.generate(
+     inputs,
+     max_new_tokens=256,
+     temperature=0.7,
+     do_sample=True,
+ )
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```
+
+ ## Training Details
+
+ ### Training Configuration
+ - **Dataset**: Capybara (conversational instruction-following)
+ - **Optimizer**: AdamW (lr=2e-5, weight_decay=0.01)
+ - **Batch Size**: 4 per device, gradient accumulation = 4 (effective batch = 16)
+ - **Epochs**: 2
+ - **Scheduler**: Cosine with warmup (100 steps)
+ - **Mixed Precision**: bfloat16
+ - **Gradient Checkpointing**: Enabled (see the TRL sketch below)
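+
+ The training script itself is not included in this checkpoint; a minimal TRL sketch matching the configuration above (the Capybara Hub id is assumed, `asterisk_model` comes from `from_pretrained_base`, and the dataset may need mapping to the trainer's expected conversational format):
+
+ ```python
+ from datasets import load_dataset
+ from trl import SFTConfig, SFTTrainer
+
+ train_dataset = load_dataset("LDJnr/Capybara", split="train")  # assumed dataset id
+
+ args = SFTConfig(
+     output_dir="asterisk-sft",
+     learning_rate=2e-5,
+     weight_decay=0.01,
+     per_device_train_batch_size=4,
+     gradient_accumulation_steps=4,   # effective batch size 16
+     num_train_epochs=2,
+     lr_scheduler_type="cosine",
+     warmup_steps=100,
+     bf16=True,
+     gradient_checkpointing=True,
+ )
+
+ trainer = SFTTrainer(model=asterisk_model, args=args, train_dataset=train_dataset)
+ trainer.train()
+ ```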
+
+ ### ASPP Configuration
+ ```python
+ aspp_hidden_dim = 256        # Internal dimension (vs 576 model hidden_size)
+ aspp_num_steps = 8           # Max evolution steps (learnable)
+ aspp_dropout = 0.2           # Regularization
+ hybrid_layer_indices = None  # All 30 layers
+ ```
+
+ ## Model Creation from Base
+
+ ```python
+ import torch
+ from AsteriskForCausalLM import AsteriskForCausalLM
+
+ # Create an Asterisk model from the SmolLM2 base
+ model, base_model = AsteriskForCausalLM.from_pretrained_base(
+     "HuggingFaceTB/SmolLM2-135M-Instruct",
+     hybrid_layer_indices=None,   # None = all layers
+     aspp_hidden_dim=256,         # Internal ASPP dimension
+     aspp_num_steps=8,            # K-step evolution
+     aspp_dropout=0.2,            # Dropout rate
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+ )
+
+ # Base model weights are transferred inside from_pretrained_base;
+ # the ASPP parameters remain randomly initialized.
+ ```
+
+ ## Theoretical Background
+
+ ### Universality (Theorem 2.1)
+ ASPP can simulate any Message-Passing Neural Network (MPNN) function on finite graphs in D steps, where D is the graph diameter.
+
+ ### Convergence (Theorem 2.2)
+ Exponential convergence to fixed points with rate c = 0.76 under Lipschitz continuity.
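+
+ In our notation (assuming the theorem's Lipschitz condition yields a contraction with factor c, and h* denotes the fixed point), the claim reads:
+
+ ```
+ ‖h^(t) − h*‖ ≤ c^t · ‖h^(0) − h*‖,   c = 0.76 < 1
+ ```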
+
+ ### Turing Completeness
+ Proven via cyclic tag system simulation: ASPP can compute any Turing-computable function given sufficient depth.
+
+ **Implementation Note**: This implementation simplifies theoretical ASPP to point-wise evolution (no neighbor gathering) to reduce overfitting while retaining the benefits of iterative refinement.
+
+ ## Files in Checkpoint
+
+ ```
+ Asterisk/
+ ├── AsteriskForCausalLM.py     # Model implementation (required for trust_remote_code)
+ ├── config.json                # Model configuration with auto_map
+ ├── model.safetensors          # Model weights
+ ├── tokenizer.json             # Tokenizer
+ ├── generation_config.json     # Generation settings
+ └── README.md                  # This file
+ ```
+
+ ## Dependencies
+
+ ```bash
+ pip install "torch>=2.0.0"
+ pip install "transformers>=4.40.0"
+ pip install "trl>=0.8.0"
+ pip install "datasets>=2.14.0"
+ pip install "accelerate>=0.25.0"
+ pip install bitsandbytes
+ ```
+
+ ## Citations
+
+ If you use this model, please cite:
+
+ ```bibtex
+ @misc{asterisk2026,
+   title={Asterisk: Hybrid ASPP-Attention Architecture for Enhanced Language Modeling},
+   author={NoesisLab},
+   year={2026},
+   publisher={Hugging Face},
+ }
+ ```
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+   title={{TRL: Transformer Reinforcement Learning}},
+   author={Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+   year={2020},
+   journal={GitHub repository},
+   publisher={GitHub},
+   howpublished={\url{https://github.com/huggingface/trl}}
+ }
+ ```
+
+ ```bibtex
+ @article{allal2024SmolLM2,
+   title={SmolLM2 - with great data, comes great performance},
+   author={Allal, Loubna Ben and Lozhkov, Anton and Penedo, Guilherme and Wolf, Thomas and von Werra, Leandro},
+   year={2024}
+ }
+ ```
+
+ ## License
+
+ This model inherits the Apache 2.0 license from SmolLM2-135M-Instruct.
+
+ ## Framework Versions
+
+ - **TRL**: 0.27.0
+ - **Transformers**: 4.57.6
+ - **PyTorch**: 2.8.0+cu128
+ - **Datasets**: 4.5.0
+ - **Tokenizers**: 0.22.2
+
+ ## Acknowledgments
+
+ Built on top of [SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct) by Hugging Face. Training framework powered by [TRL](https://github.com/huggingface/trl).
chat_template.jinja ADDED
@@ -0,0 +1,6 @@
+ {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
+ You are a helpful AI assistant named Asterisk, trained by NoesisLab<|im_end|>
+ ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
+ ' + message['content'] + '<|im_end|>' + '
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "architectures": [
+     "AsteriskForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "AsteriskForCausalLM.AsteriskConfig",
+     "AutoModelForCausalLM": "AsteriskForCausalLM.AsteriskForCausalLM"
+   },
+   "aspp_dropout": 0.2,
+   "aspp_hidden_dim": 256,
+   "aspp_num_steps": 8,
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 576,
+   "hybrid_layer_indices": null,
+   "initializer_range": 0.041666666666666664,
+   "intermediate_size": 1536,
+   "is_llama_config": true,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "model_type": "asterisk",
+   "num_attention_heads": 9,
+   "num_hidden_layers": 30,
+   "num_key_value_heads": 3,
+   "pad_token_id": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_interleaved": false,
+   "rope_scaling": null,
+   "rope_theta": 100000,
+   "tie_word_embeddings": true,
+   "transformers.js_config": {
+     "kv_cache_dtype": {
+       "fp16": "float16",
+       "q4f16": "float16"
+     }
+   },
+   "transformers_version": "4.57.6",
+   "use_cache": true,
+   "vocab_size": 49152
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": [
+     2
+   ],
+   "pad_token_id": 2,
+   "transformers_version": "4.57.6"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3af701c6eb2735e0c54417aa3eb6d2460ee92de8b646e22c6fe7106388611fdb
+ size 684933848
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": {
+     "content": "<|im_start|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<repo_name>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<file_sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<jupyter_script>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "extra_special_tokens": {},
+   "model_max_length": 8192,
+   "pad_token": "<|im_end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49e41e7530752cf0d15a3251e00703b28ccee977859c7f621eca6e31227608ca
+ size 6353
vocab.json ADDED
The diff for this file is too large to render. See raw diff