Text Generation
Transformers
PyTorch
English
taonet_mini_t2
taonet
taotern
ssm
state-space-model
dplr
custom_code
experimental
Instructions to use TaoTern/TaoNet-mini-T2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TaoTern/TaoNet-mini-T2 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="TaoTern/TaoNet-mini-T2", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("TaoTern/TaoNet-mini-T2", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use TaoTern/TaoNet-mini-T2 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "TaoTern/TaoNet-mini-T2"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/TaoTern/TaoNet-mini-T2
- SGLang
How to use TaoTern/TaoNet-mini-T2 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "TaoTern/TaoNet-mini-T2" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "TaoTern/TaoNet-mini-T2" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use TaoTern/TaoNet-mini-T2 with Docker Model Runner:
docker model run hf.co/TaoTern/TaoNet-mini-T2
| """ | |
| DeepSeek-style Multi-head Latent Attention (MLA) with RoPE. | |
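| Key features: | |
| 1. KV compression to latent space (reduce KV memory) | |
| 2. Q is projected at full dimension; only the first d_head_latent channels of each head are used for the attention scores | |
| 3. RoPE positional embeddings on Q and K | |
| 4. Grouped Query Attention (GQA): a gqa_groups argument is accepted and stored, but it is not applied in the forward pass | |
| 5. Learnable head combination weights: a head_weights parameter is registered, but it is not applied in the forward pass | |
| 6. Numerical stability via pre-norm and scaling | |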
| """ | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import math | |
| def _residual_rms_norm(x, enabled=False, target=1.0, eps=1e-6, cap=None): | |
| if not enabled and cap is None: | |
| return x | |
| rms = x.float().square().mean(dim=-1, keepdim=True).add(eps).sqrt() | |
| if enabled: | |
| scale = target / rms | |
| else: | |
| cap_tensor = torch.tensor(float(cap), dtype=rms.dtype, device=rms.device) | |
| scale = torch.minimum(torch.ones_like(rms), cap_tensor / rms) | |
| return x * scale.to(dtype=x.dtype) | |
class RotaryEmbedding(nn.Module):
    """Angle table for rotary position embeddings (RoPE) with length-based rescaling.

    ``forward`` produces the rotation angles for positions ``0 .. seq_len-1``.
    Positions are first divided by ``rope_scale``, then multiplied by the
    inverse frequencies ``1 / 10000 ** (2i / dim)``.

    The "YaRN" option in this file is a single uniform factor applied to all
    angles: ``(seq_len / max_seq_length) ** (1 / yarn_alpha)``. The angles are
    divided by that factor whenever ``yarn_alpha != 1.0`` or
    ``seq_len > max_seq_length``; otherwise they are returned as computed.
    Because the factor depends on ``seq_len``, it is also applied (and is then
    smaller than 1 for positive ``yarn_alpha``) to sequences shorter than
    ``max_seq_length`` when ``yarn_alpha != 1.0``.

    Parameters:
        dim: Rotary dimension (must be even).
        rope_scale: Divisor applied to the integer positions (default: 40).
        max_seq_length: Reference sequence length for the rescaling (default: 1024).
        yarn_alpha: Exponent control for the rescaling (default: 1.0).
    """

    def __init__(self, dim, rope_scale=40.0, max_seq_length=1024, yarn_alpha=1.0):
        super().__init__()
        assert dim % 2 == 0, "Dimension must be even for rotary embeddings"

        self.dim = dim
        self.rope_scale = rope_scale
        self.max_seq_length = max_seq_length
        self.yarn_alpha = yarn_alpha

        # One inverse frequency per rotated pair: dim // 2 entries.
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))

    def _apply_yarn_scaling(self, freqs, seq_len):
        """Divide the angle table by the length-dependent factor when it applies.

        Args:
            freqs: Angle tensor as produced in ``forward`` ([seq_len, dim // 2]).
            seq_len: Current sequence length.

        Returns:
            ``freqs`` unchanged when ``yarn_alpha == 1.0`` and
            ``seq_len <= max_seq_length``; otherwise
            ``freqs / (seq_len / max_seq_length) ** (1 / yarn_alpha)``.
        """
        rescale = self.yarn_alpha != 1.0 or seq_len > self.max_seq_length
        if not rescale:
            return freqs

        exponent = 1.0 / self.yarn_alpha
        stretch = (seq_len / self.max_seq_length) ** exponent
        return freqs / stretch

    def forward(self, seq_len, device):
        """Build the rotation angles for a sequence of length ``seq_len``.

        Args:
            seq_len: Number of positions to generate.
            device: Device on which the position index tensor is created.

        Returns:
            Tensor of shape [seq_len, dim]: the [seq_len, dim // 2] angle table
            concatenated with itself along the last dimension.
        """
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        positions = positions / self.rope_scale

        # Outer product of positions and inverse frequencies -> [seq_len, dim // 2]
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        angles = self._apply_yarn_scaling(angles, seq_len)

        # Both halves share the same angles, matching the rotate-half layout.
        return torch.cat((angles, angles), dim=-1)
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For ``x = [a, b]`` (split in the middle of the last axis) this returns
    ``[-b, a]``.
    """
    front, back = torch.chunk(x, 2, dim=-1)
    return torch.cat([back.neg(), front], dim=-1)
def apply_rotary(x, cos, sin):
    """Rotate the leading channels of ``x`` with the given cos/sin tables.

    Args:
        x: Tensor whose last dimension holds the channels to rotate, e.g.
            [B, n_heads, seq_len, head_dim].
        cos: Cosine table, [seq_len, rot_dim] or already broadcastable
            (e.g. [1, 1, seq_len, rot_dim]).
        sin: Sine table with the same layout as ``cos``.

    Returns:
        Tensor shaped like ``x``. The first ``min(rot_dim, x.shape[-1])``
        channels are rotated; any remaining channels pass through unchanged.
    """
    # 2-D tables get two leading singleton axes so they broadcast over batch/heads.
    if cos.dim() == 2:
        cos, sin = cos[None, None], sin[None, None]

    # Tables wider than x are truncated to x's channel count.
    width = x.shape[-1]
    cos = cos[..., :width]
    sin = sin[..., :width]

    n_rot = cos.shape[-1]
    rotating, passthrough = x[..., :n_rot], x[..., n_rot:]

    # Rotate-half form: pair channel i with channel i + n_rot // 2.
    front, back = rotating.chunk(2, dim=-1)
    swapped = torch.cat((-back, front), dim=-1)
    rotated = (rotating * cos) + (swapped * sin)

    if passthrough.shape[-1] > 0:
        return torch.cat([rotated, passthrough], dim=-1)
    return rotated
class DeepSeekMLA(nn.Module):
    """Multi-head attention with a full-width query and low-dimensional K/V.

    Forward pass, step by step:
        1. LayerNorm the input.
        2. Project to Q ([B, seq_len, d_model]) and to K, V
           ([B, seq_len, d_latent_kv]).
        3. Split into ``n_heads`` heads (Q: ``d_head_full`` channels per head,
           K/V: ``d_head_latent`` channels per head).
        4. Apply RoPE to the first ``d_rope`` channels of every Q and K head
           (skipped when ``d_rope == 0``).
        5. Keep only the first ``d_head_latent`` channels of each Q head and
           compute softmax(Q K^T / sqrt(d_head_latent)) V.
        6. Concatenate the heads and project back to ``d_model``.

    No causal mask is built here; masking is entirely determined by the
    ``attention_mask`` argument of ``forward``.

    NOTE(review): ``gqa_groups`` is stored and ``head_weights`` is registered
    as a parameter, but neither is read in ``forward`` -- K/V always use
    ``n_heads`` heads and no per-head weighting is applied. ``head_weights``
    is kept so existing state dicts continue to load.

    NOTE(review): the code presumably expects ``d_rope <= d_head_latent``
    (Q is truncated to ``d_head_latent`` channels after the rotation);
    this is not validated -- confirm against the model config.

    Parameters:
        d_model: Model dimension.
        d_latent_kv: Total K/V dimension (must be divisible by ``n_heads``).
        n_heads: Number of attention heads (must divide ``d_model``).
        d_rope: Number of leading per-head channels that receive RoPE.
        dropout: Dropout probability for attention weights and the output.
        gqa_groups: Accepted for configuration compatibility; currently unused.
        rope_scale: Forwarded to ``RotaryEmbedding``.
        max_seq_length: Forwarded to ``RotaryEmbedding``.
        yarn_alpha: Forwarded to ``RotaryEmbedding``.
    """

    def __init__(self, d_model, d_latent_kv, n_heads, d_rope, dropout=0.1, gqa_groups=1,
                 rope_scale=40.0, max_seq_length=1024, yarn_alpha=1.0):
        super().__init__()
        self.d_model = d_model
        self.d_latent_kv = d_latent_kv
        self.n_heads = n_heads
        self.d_rope = d_rope
        self.gqa_groups = gqa_groups

        assert d_model % n_heads == 0, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
        assert d_latent_kv % n_heads == 0, f"d_latent_kv ({d_latent_kv}) must be divisible by n_heads ({n_heads})"

        self.d_head_full = d_model // n_heads        # per-head width of Q
        self.d_head_latent = d_latent_kv // n_heads  # per-head width of K/V

        # Score scaling used by the manual fallback path; identical to the
        # default scaling of the fused kernel (1 / sqrt(last dim of Q)).
        self.scale = 1.0 / math.sqrt(self.d_head_latent)

        # NOTE: sub-modules are created in a fixed order so that parameter
        # initialisation and state-dict ordering stay stable.
        self.norm = nn.LayerNorm(d_model)

        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_latent_kv, bias=False)
        self.v_proj = nn.Linear(d_model, d_latent_kv, bias=False)

        self.rotary = RotaryEmbedding(
            d_rope,
            rope_scale=rope_scale,
            max_seq_length=max_seq_length,
            yarn_alpha=yarn_alpha,
        )

        self.out_proj = nn.Linear(d_latent_kv, d_model, bias=False)

        # Registered but not applied in forward (see class docstring).
        self.head_weights = nn.Parameter(torch.ones(n_heads))

        self.attn_dropout = nn.Dropout(dropout)
        self.proj_dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None):
        """Run attention over ``x``.

        Args:
            x: [B, seq_len, d_model]
            attention_mask: Optional mask where non-zero means "attend" and
                zero means "masked". Either [B, seq_len] (key padding mask,
                broadcast over heads and queries) or a tensor already
                broadcastable to [B, n_heads, seq_len, seq_len], e.g.
                [B, 1, seq_len, seq_len].

        Returns:
            out: [B, seq_len, d_model]
        """
        B, seq_len, _ = x.shape
        device = x.device

        x_norm = self.norm(x)

        q = self.q_proj(x_norm)  # [B, seq_len, d_model]
        k = self.k_proj(x_norm)  # [B, seq_len, d_latent_kv]
        v = self.v_proj(x_norm)  # [B, seq_len, d_latent_kv]

        # ------------------------------------------------------------------
        # Split into heads: [B, seq_len, H * d] -> [B, H, seq_len, d]
        # ------------------------------------------------------------------
        q = q.view(B, seq_len, self.n_heads, self.d_head_full).transpose(1, 2)
        k = k.view(B, seq_len, self.n_heads, self.d_head_latent).transpose(1, 2)
        v = v.view(B, seq_len, self.n_heads, self.d_head_latent).transpose(1, 2)

        # ------------------------------------------------------------------
        # RoPE on the first d_rope channels of Q and K
        # ------------------------------------------------------------------
        if self.d_rope > 0:
            rotary_emb = self.rotary(seq_len, device)              # [seq_len, d_rope]
            cos = torch.cos(rotary_emb).unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, d_rope]
            sin = torch.sin(rotary_emb).unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, d_rope]

            q_rope = apply_rotary(q[..., :self.d_rope], cos, sin)
            q = torch.cat([q_rope, q[..., self.d_rope:]], dim=-1)

            k_rope = apply_rotary(k[..., :self.d_rope], cos, sin)
            k = torch.cat([k_rope, k[..., self.d_rope:]], dim=-1)

        # ------------------------------------------------------------------
        # Attention
        # ------------------------------------------------------------------
        # Q is wider than K; only its first d_head_latent channels take part
        # in the dot product.
        q_for_attn = q[..., :self.d_head_latent]  # [B, H, seq_len, d_head_latent]

        # Boolean mask convention for the attention call: True = attend.
        attn_mask_bool = None
        if attention_mask is not None:
            if attention_mask.dim() == 2:
                # [B, seq_len] -> [B, 1, 1, seq_len]
                attn_mask_bool = attention_mask.bool().unsqueeze(1).unsqueeze(1)
            else:
                attn_mask_bool = attention_mask.bool()

        # Attention dropout is only active in training mode.
        dropout_p = self.attn_dropout.p if self.training else 0.0

        if hasattr(F, "scaled_dot_product_attention"):
            # The `scale` keyword is deliberately not passed: it only exists
            # from PyTorch 2.1 on, so passing it raises TypeError on 2.0 even
            # though this branch is taken there. The default scaling,
            # 1 / sqrt(q_for_attn.size(-1)), equals self.scale.
            out_heads = F.scaled_dot_product_attention(
                q_for_attn, k, v,
                attn_mask=attn_mask_bool,
                dropout_p=dropout_p,
            )  # [B, H, seq_len, d_head_latent]
        else:
            # Manual fallback for PyTorch builds without the fused op.
            scores = torch.matmul(q_for_attn, k.transpose(-2, -1)) * self.scale
            if attn_mask_bool is not None:
                scores = scores.masked_fill(~attn_mask_bool, torch.finfo(scores.dtype).min)
            attn_weights = F.softmax(scores, dim=-1)
            if dropout_p > 0.0:
                attn_weights = F.dropout(attn_weights, p=dropout_p, training=True)
            out_heads = torch.matmul(attn_weights, v)

        # ------------------------------------------------------------------
        # Merge heads and project back
        # ------------------------------------------------------------------
        # [B, H, seq_len, d_head_latent] -> [B, seq_len, d_latent_kv]
        out_concat = out_heads.transpose(1, 2).reshape(B, seq_len, self.d_latent_kv)

        out = self.out_proj(out_concat)  # [B, seq_len, d_model]
        out = self.proj_dropout(out)
        return out
class AttentionBlock(nn.Module):
    """Transformer-style block: attention sub-layer followed by a SwiGLU FFN.

    Structure::

        x --+--> DeepSeekMLA --> Dropout --+--> (+) --> residual RMS norm/cap
            |                              |
            +------------------------------+
                                                |
        h --+--> LayerNorm --> SwiGLU FFN --> Dropout --+--> (+) --> residual RMS norm/cap --> out
            |                                           |
            +-------------------------------------------+

    The attention sub-layer receives the un-normalised residual stream (it is
    normalised inside ``DeepSeekMLA``). After each residual addition the
    stream is passed through ``_residual_rms_norm``, which is a no-op unless
    ``residual_rms_norm`` is set or ``residual_rms_cap`` is given.

    Parameters:
        d_model, d_latent_kv, n_heads, d_rope, gqa_groups, rope_scale,
        max_seq_length, yarn_alpha: Forwarded to ``DeepSeekMLA``.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability (attention module and both residual branches).
        residual_rms_norm: Enable RMS normalisation of the residual stream.
        residual_rms_target: Target RMS when normalisation is enabled.
        residual_rms_cap: Optional RMS ceiling used when normalisation is disabled.
        residual_rms_eps: Epsilon for the RMS computation.
    """

    def __init__(self, d_model, d_latent_kv, n_heads, d_rope, d_ff, dropout=0.1, gqa_groups=1,
                 rope_scale=40.0, max_seq_length=1024, yarn_alpha=1.0,
                 residual_rms_norm=False, residual_rms_target=1.0, residual_rms_cap=None,
                 residual_rms_eps=1e-6):
        super().__init__()

        # Residual-stream conditioning settings.
        self.residual_rms_norm = residual_rms_norm
        self.residual_rms_target = residual_rms_target
        self.residual_rms_cap = residual_rms_cap
        self.residual_rms_eps = residual_rms_eps

        # Attention sub-layer.
        self.mla = DeepSeekMLA(
            d_model, d_latent_kv, n_heads, d_rope, dropout, gqa_groups,
            rope_scale=rope_scale,
            max_seq_length=max_seq_length,
            yarn_alpha=yarn_alpha,
        )

        # SwiGLU feed-forward sub-layer: out(value(x) * silu(gate(x))).
        self.ff_norm = nn.LayerNorm(d_model)
        self.ff_gate = nn.Linear(d_model, d_ff, bias=False)
        self.ff_value = nn.Linear(d_model, d_ff, bias=False)
        self.ff_out = nn.Linear(d_ff, d_model, bias=False)

        self.dropout = nn.Dropout(dropout)

    def _condition_residual(self, stream):
        """Apply the configured RMS normalisation / cap to the residual stream."""
        return _residual_rms_norm(
            stream,
            self.residual_rms_norm,
            self.residual_rms_target,
            self.residual_rms_eps,
            self.residual_rms_cap,
        )

    def forward(self, x, attention_mask=None):
        """Apply the attention and feed-forward sub-layers.

        Args:
            x: [B, seq_len, d_model]
            attention_mask: [B, seq_len] or [B, 1, seq_len, seq_len],
                passed through to the attention module.

        Returns:
            out: [B, seq_len, d_model]
        """
        # --- attention branch ---
        attended = self.mla(x, attention_mask)
        hidden = self._condition_residual(x + self.dropout(attended))

        # --- feed-forward branch (SwiGLU) ---
        normed = self.ff_norm(hidden)
        gate = self.ff_gate(normed)
        value = self.ff_value(normed)
        ffn = self.ff_out(value * F.silu(gate))

        return self._condition_residual(hidden + self.dropout(ffn))