Text Generation
Transformers
PyTorch
Safetensors
English
axiom
causal-lm
base-model
custom-architecture
tiktoken
custom_code
Instructions to use user-anto/Axiom-Dense-380M-Base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use user-anto/Axiom-Dense-380M-Base with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="user-anto/Axiom-Dense-380M-Base", trust_remote_code=True)# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("user-anto/Axiom-Dense-380M-Base", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use user-anto/Axiom-Dense-380M-Base with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "user-anto/Axiom-Dense-380M-Base" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "user-anto/Axiom-Dense-380M-Base", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/user-anto/Axiom-Dense-380M-Base
- SGLang
How to use user-anto/Axiom-Dense-380M-Base with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "user-anto/Axiom-Dense-380M-Base" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "user-anto/Axiom-Dense-380M-Base", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "user-anto/Axiom-Dense-380M-Base" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "user-anto/Axiom-Dense-380M-Base", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use user-anto/Axiom-Dense-380M-Base with Docker Model Runner:
docker model run hf.co/user-anto/Axiom-Dense-380M-Base
| import math | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.utils.checkpoint import checkpoint | |
| from typing import Optional | |
| from config import ModelConfig | |
| # RMSNorm | |
| class RMSNorm(nn.Module): | |
| def __init__(self, dim: int, eps: float = 1e-5): | |
| super().__init__() | |
| self.eps = eps | |
| self.weight = nn.Parameter(torch.ones(dim)) | |
| def forward(self, x: torch.Tensor) -> torch.Tensor: | |
| rms = x.float().pow(2).mean(-1, keepdim=True).add(self.eps).rsqrt() | |
| return (x.float() * rms).to(x.dtype) * self.weight | |
| # RoPE | |
| def precompute_rope_freqs(head_dim: int, max_seq_len: int, theta: float = 10000.0): | |
| freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim)) | |
| t = torch.arange(max_seq_len) | |
| angles = torch.outer(t, freqs) | |
| return angles.cos(), angles.sin() | |
| def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: | |
| # x: (B, T, n_heads, head_dim) | |
| x_even = x[..., ::2].float() | |
| x_odd = x[..., 1::2].float() | |
| # (T, head_dim/2) -> (1, T, 1, head_dim/2) for broadcasting | |
| cos = cos[: x.shape[1]].unsqueeze(0).unsqueeze(2) | |
| sin = sin[: x.shape[1]].unsqueeze(0).unsqueeze(2) | |
| out_even = x_even * cos - x_odd * sin | |
| out_odd = x_even * sin + x_odd * cos | |
| x_rot = torch.stack((out_even, out_odd), dim=-1).flatten(-2) | |
| return x_rot.to(x.dtype) | |
| # GQA | |
| class GQAttention(nn.Module): | |
| def __init__(self, cfg: ModelConfig): | |
| super().__init__() | |
| assert cfg.n_heads % cfg.n_kv_heads == 0 | |
| self.n_heads = cfg.n_heads | |
| self.n_kv_heads = cfg.n_kv_heads | |
| self.n_rep = cfg.n_heads // cfg.n_kv_heads | |
| self.head_dim = cfg.dim // cfg.n_heads | |
| self.wq = nn.Linear(cfg.dim, cfg.n_heads * self.head_dim, bias=False) | |
| self.wk = nn.Linear(cfg.dim, cfg.n_kv_heads * self.head_dim, bias=False) | |
| self.wv = nn.Linear(cfg.dim, cfg.n_kv_heads * self.head_dim, bias=False) | |
| self.wo = nn.Linear(cfg.n_heads * self.head_dim, cfg.dim, bias=False) | |
| self.dropout_p = cfg.dropout | |
| def forward( | |
| self, | |
| x: torch.Tensor, | |
| cos: torch.Tensor, | |
| sin: torch.Tensor, | |
| cache_k: Optional[torch.Tensor] = None, | |
| cache_v: Optional[torch.Tensor] = None, | |
| return_cache: bool = False, | |
| ): | |
| B, T, _ = x.shape | |
| q = self.wq(x).view(B, T, self.n_heads, self.head_dim) | |
| k = self.wk(x).view(B, T, self.n_kv_heads, self.head_dim) | |
| v = self.wv(x).view(B, T, self.n_kv_heads, self.head_dim) | |
| q = apply_rope(q, cos, sin) | |
| k = apply_rope(k, cos, sin) | |
| if cache_k is not None: | |
| k = torch.cat([cache_k, k], dim=1) | |
| v = torch.cat([cache_v, v], dim=1) | |
| new_cache_k, new_cache_v = (k, v) if return_cache else (None, None) | |
| # Expand KV heads → Q heads | |
| k = k.repeat_interleave(self.n_rep, dim=2) | |
| v = v.repeat_interleave(self.n_rep, dim=2) | |
| # (B, n_heads, T, head_dim) for SDPA | |
| q = q.transpose(1, 2) | |
| k = k.transpose(1, 2) | |
| v = v.transpose(1, 2) | |
| # Flash / memory-efficient attention — never materialises (B,H,T,T) score matrix | |
| out = F.scaled_dot_product_attention( | |
| q, k, v, | |
| dropout_p=self.dropout_p if self.training else 0.0, | |
| is_causal=(cache_k is None), # causal during training; non-causal with cache | |
| ) | |
| out = out.transpose(1, 2).contiguous().view(B, T, -1) | |
| return self.wo(out), new_cache_k, new_cache_v | |
| # SwiGLU FFN | |
| class SwiGLU(nn.Module): | |
| def __init__(self, cfg: ModelConfig): | |
| super().__init__() | |
| hidden = int(cfg.dim * cfg.ffn_dim_multiplier) | |
| hidden = (hidden + 255) & ~255 | |
| self.w1 = nn.Linear(cfg.dim, hidden, bias=False) | |
| self.w2 = nn.Linear(hidden, cfg.dim, bias=False) | |
| self.w3 = nn.Linear(cfg.dim, hidden, bias=False) | |
| self.dropout = nn.Dropout(cfg.dropout) | |
| def forward(self, x: torch.Tensor) -> torch.Tensor: | |
| return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x))) | |
| # Transformer Block | |
| class TransformerBlock(nn.Module): | |
| def __init__(self, cfg: ModelConfig): | |
| super().__init__() | |
| self.attn_norm = RMSNorm(cfg.dim, cfg.norm_eps) | |
| self.attn = GQAttention(cfg) | |
| self.ffn_norm = RMSNorm(cfg.dim, cfg.norm_eps) | |
| self.ffn = SwiGLU(cfg) | |
| def _forward(self, x, cos, sin, cache_k, cache_v, return_cache): | |
| attn_out, nck, ncv = self.attn( | |
| self.attn_norm(x), cos, sin, cache_k, cache_v, return_cache=return_cache | |
| ) | |
| x = x + attn_out | |
| x = x + self.ffn(self.ffn_norm(x)) | |
| return x, nck, ncv | |
| def forward(self, x, cos, sin, cache_k=None, cache_v=None, use_grad_ckpt=False, return_cache=False): | |
| if use_grad_ckpt and self.training: | |
| # gradient checkpointing: recompute activations on backward instead of storing them | |
| # cache is None during training so we pass dummy tensors to satisfy checkpoint API | |
| def ckpt_fn(x, cos, sin): | |
| out, _, _ = self._forward(x, cos, sin, None, None, False) | |
| return out | |
| x = checkpoint(ckpt_fn, x, cos, sin, use_reentrant=False) | |
| return x, None, None | |
| return self._forward(x, cos, sin, cache_k, cache_v, return_cache) | |
| # LLM Definition | |
| class LLM(nn.Module): | |
| def __init__(self, cfg: ModelConfig): | |
| super().__init__() | |
| self.cfg = cfg | |
| self.embed = nn.Embedding(cfg.vocab_size, cfg.dim) | |
| self.layers = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)]) | |
| self.norm = RMSNorm(cfg.dim, cfg.norm_eps) | |
| self.lm_head = nn.Linear(cfg.dim, cfg.vocab_size, bias=False) | |
| self.embed.weight = self.lm_head.weight # weight tying | |
| head_dim = cfg.dim // cfg.n_heads | |
| cos, sin = precompute_rope_freqs(head_dim, cfg.max_seq_len * 2, cfg.rope_theta) | |
| self.register_buffer("rope_cos", cos) | |
| self.register_buffer("rope_sin", sin) | |
| self.apply(self._init_weights) | |
| def _init_weights(self, m): | |
| if isinstance(m, nn.Linear): | |
| nn.init.normal_(m.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.cfg.n_layers)) | |
| elif isinstance(m, nn.Embedding): | |
| nn.init.normal_(m.weight, mean=0.0, std=0.02) | |
| def forward( | |
| self, | |
| idx: torch.Tensor, | |
| targets: Optional[torch.Tensor] = None, | |
| cache: Optional[list] = None, | |
| use_grad_ckpt: bool = False, | |
| return_cache: bool = False, | |
| ): | |
| B, T = idx.shape | |
| x = self.embed(idx) | |
| pos_start = 0 if (cache is None or cache[0][0] is None) else cache[0][0].shape[1] | |
| cos = self.rope_cos[pos_start: pos_start + T] | |
| sin = self.rope_sin[pos_start: pos_start + T] | |
| need_cache = return_cache or (cache is not None) | |
| new_cache = [] if need_cache else None | |
| for i, layer in enumerate(self.layers): | |
| ck, cv = cache[i] if cache else (None, None) | |
| x, nck, ncv = layer( | |
| x, | |
| cos, | |
| sin, | |
| ck, | |
| cv, | |
| use_grad_ckpt=use_grad_ckpt, | |
| return_cache=need_cache, | |
| ) | |
| if need_cache: | |
| new_cache.append((nck, ncv)) | |
| x = self.norm(x) | |
| logits = self.lm_head(x) | |
| loss = None | |
| if targets is not None: | |
| loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) | |
| return logits, loss, new_cache | |
| def param_count(self) -> int: | |
| return sum(p.numel() for p in self.parameters()) | |
| def probe_attention_entropy(self, idx: torch.Tensor, max_probe_len: int = 256) -> float: | |
| """ | |
| Estimate mean causal attention entropy from layer 0 on a short token window. | |
| Lower entropy means sharper/more concentrated attention. | |
| """ | |
| if idx.ndim != 2: | |
| raise ValueError(f"idx must be shape (B, T), got {tuple(idx.shape)}") | |
| if idx.shape[1] == 0: | |
| return float("nan") | |
| probe_len = min(int(max_probe_len), int(idx.shape[1])) | |
| idx = idx[:, -probe_len:] | |
| B, T = idx.shape | |
| x = self.embed(idx) | |
| cos = self.rope_cos[:T] | |
| sin = self.rope_sin[:T] | |
| layer0 = self.layers[0] | |
| attn = layer0.attn | |
| h = layer0.attn_norm(x) | |
| q = attn.wq(h).view(B, T, attn.n_heads, attn.head_dim) | |
| k = attn.wk(h).view(B, T, attn.n_kv_heads, attn.head_dim) | |
| q = apply_rope(q, cos, sin) | |
| k = apply_rope(k, cos, sin) | |
| k = k.repeat_interleave(attn.n_rep, dim=2) | |
| q = q.transpose(1, 2).float() # (B, H, T, D) | |
| k = k.transpose(1, 2).float() # (B, H, T, D) | |
| scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(attn.head_dim) | |
| causal_mask = torch.triu( | |
| torch.ones((T, T), device=scores.device, dtype=torch.bool), diagonal=1 | |
| ) | |
| scores = scores.masked_fill(causal_mask, float("-inf")) | |
| probs = torch.softmax(scores, dim=-1) | |
| entropy = -(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1) | |
| return float(entropy.mean().item()) | |
| def generate( | |
| self, | |
| idx: torch.Tensor, | |
| max_new_tokens: int, | |
| temperature: float = 0.8, | |
| top_p: float = 0.9, | |
| repetition_penalty: float = 1.1, | |
| no_repeat_ngram_size: int = 3, | |
| ): | |
| cache = None | |
| for _ in range(max_new_tokens): | |
| idx_cond = idx[:, -self.cfg.max_seq_len:] if cache is None else idx[:, -1:] | |
| logits, _, cache = self(idx_cond, cache=cache, return_cache=True) | |
| logits = logits[:, -1, :] | |
| # Discourage copying previously generated tokens. | |
| if repetition_penalty > 1.0: | |
| for b in range(idx.size(0)): | |
| used = idx[b].unique() | |
| used_logits = logits[b, used] | |
| logits[b, used] = torch.where( | |
| used_logits > 0, used_logits / repetition_penalty, used_logits * repetition_penalty | |
| ) | |
| # Block tokens that would create repeated n-grams. | |
| if no_repeat_ngram_size and no_repeat_ngram_size > 1 and idx.size(1) >= no_repeat_ngram_size - 1: | |
| n = int(no_repeat_ngram_size) | |
| for b in range(idx.size(0)): | |
| seq = idx[b].tolist() | |
| prefix = tuple(seq[-(n - 1) :]) | |
| banned = set() | |
| for i in range(len(seq) - n + 1): | |
| if tuple(seq[i : i + n - 1]) == prefix: | |
| banned.add(seq[i + n - 1]) | |
| if banned: | |
| logits[b, list(banned)] = float("-inf") | |
| if temperature == 0.0: | |
| next_tok = torch.argmax(logits, dim=-1, keepdim=True) | |
| else: | |
| logits = logits / temperature | |
| probs = F.softmax(logits, dim=-1) | |
| sorted_probs, sorted_idx = torch.sort(probs, descending=True) | |
| cumsum = sorted_probs.cumsum(-1) | |
| sorted_probs[cumsum - sorted_probs > top_p] = 0.0 | |
| sorted_probs /= sorted_probs.sum(-1, keepdim=True) | |
| next_tok = sorted_idx.gather(-1, torch.multinomial(sorted_probs, 1)) | |
| idx = torch.cat([idx, next_tok], dim=1) | |
| return idx | |