Instructions to use user-anto/Axiom-Dense-380M-Base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use user-anto/Axiom-Dense-380M-Base with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="user-anto/Axiom-Dense-380M-Base", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("user-anto/Axiom-Dense-380M-Base", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use user-anto/Axiom-Dense-380M-Base with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "user-anto/Axiom-Dense-380M-Base"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "user-anto/Axiom-Dense-380M-Base",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/user-anto/Axiom-Dense-380M-Base

SGLang

How to use user-anto/Axiom-Dense-380M-Base with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "user-anto/Axiom-Dense-380M-Base" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "user-anto/Axiom-Dense-380M-Base",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "user-anto/Axiom-Dense-380M-Base" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "user-anto/Axiom-Dense-380M-Base",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use user-anto/Axiom-Dense-380M-Base with Docker Model Runner:
```
docker model run hf.co/user-anto/Axiom-Dense-380M-Base
```

Axiom-Dense-380M-Base / model.py

user-anto

Upload MiniLM base checkpoint

dd3a77b verified 11 days ago

raw

history blame contribute delete

11.5 kB

	import math
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.checkpoint import checkpoint
	from typing import Optional
	from config import ModelConfig


	# RMSNorm
	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last dimension with a learned gain.

	    The statistics are computed in float32; the normalised values are cast back
	    to the input dtype before being multiplied by ``weight``.
	    """

	    def __init__(self, dim: int, eps: float = 1e-5):
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(dim))
	        self.eps = eps

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Return ``x`` scaled by ``1 / sqrt(mean(x**2) + eps)`` and by ``weight``."""
	        x32 = x.float()
	        mean_sq = x32.pow(2).mean(-1, keepdim=True)
	        inv_rms = torch.rsqrt(mean_sq + self.eps)
	        normed = (x32 * inv_rms).to(x.dtype)
	        return normed * self.weight


	# RoPE
	def precompute_rope_freqs(head_dim: int, max_seq_len: int, theta: float = 10000.0):
	    """Build the cosine and sine lookup tables for rotary position embeddings.

	    One inverse frequency ``theta ** -(i / head_dim)`` is produced for every
	    ``i`` in ``range(0, head_dim, 2)``; row ``p`` of each table holds the
	    cos / sin of ``p`` times those frequencies.

	    Returns:
	        ``(cos, sin)``, each with ``max_seq_len`` rows.
	    """
	    exponents = torch.arange(0, head_dim, 2).float() / head_dim
	    inv_freq = 1.0 / (theta ** exponents)
	    positions = torch.arange(max_seq_len)
	    angles = torch.outer(positions, inv_freq)
	    return torch.cos(angles), torch.sin(angles)


	def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
	    """Rotate adjacent (even, odd) feature pairs of ``x`` by position-dependent angles.

	    ``x`` is laid out as (B, T, n_heads, head_dim). ``cos`` and ``sin`` are
	    truncated to the first ``T`` rows and broadcast over batch and heads. The
	    arithmetic runs in float32 and the result is cast back to ``x.dtype``.
	    """
	    seq_len = x.shape[1]

	    # (T, head_dim/2) -> (1, T, 1, head_dim/2) so the tables broadcast over B and heads.
	    cos_b = cos[:seq_len].unsqueeze(0).unsqueeze(2)
	    sin_b = sin[:seq_len].unsqueeze(0).unsqueeze(2)

	    even = x[..., ::2].float()
	    odd = x[..., 1::2].float()

	    rotated_even = even * cos_b - odd * sin_b
	    rotated_odd = even * sin_b + odd * cos_b

	    # Re-interleave: stacking on a new last axis and flattening restores the
	    # original even/odd ordering of the feature dimension.
	    interleaved = torch.stack((rotated_even, rotated_odd), dim=-1).flatten(-2)
	    return interleaved.to(x.dtype)


	# GQA
	class GQAttention(nn.Module):
	    """Grouped-query self-attention with rotary embeddings and an optional KV cache.

	    ``n_kv_heads`` key/value heads are shared by ``n_heads`` query heads; each
	    KV head is repeated ``n_heads // n_kv_heads`` times before attention.
	    """

	    def __init__(self, cfg: ModelConfig):
	        super().__init__()
	        assert cfg.n_heads % cfg.n_kv_heads == 0
	        self.n_heads = cfg.n_heads
	        self.n_kv_heads = cfg.n_kv_heads
	        self.n_rep = cfg.n_heads // cfg.n_kv_heads
	        self.head_dim = cfg.dim // cfg.n_heads

	        self.wq = nn.Linear(cfg.dim, cfg.n_heads * self.head_dim, bias=False)
	        self.wk = nn.Linear(cfg.dim, cfg.n_kv_heads * self.head_dim, bias=False)
	        self.wv = nn.Linear(cfg.dim, cfg.n_kv_heads * self.head_dim, bias=False)
	        self.wo = nn.Linear(cfg.n_heads * self.head_dim, cfg.dim, bias=False)
	        self.dropout_p = cfg.dropout

	    def forward(
	        self,
	        x: torch.Tensor,
	        cos: torch.Tensor,
	        sin: torch.Tensor,
	        cache_k: Optional[torch.Tensor] = None,
	        cache_v: Optional[torch.Tensor] = None,
	        return_cache: bool = False,
	    ):
	        """Attend over ``x`` (B, T, dim), optionally extending a KV cache.

	        Args:
	            x: input activations, unpacked as (B, T, dim).
	            cos, sin: rotary tables for the positions of the ``T`` new tokens.
	            cache_k, cache_v: previously computed keys / values laid out as
	                (B, T_past, n_kv_heads, head_dim); the new keys / values are
	                appended along dim 1.
	            return_cache: when true, also return the (un-repeated) keys and
	                values including the new tokens.

	        Returns:
	            ``(out, new_cache_k, new_cache_v)``; the cache entries are ``None``
	            unless ``return_cache`` is set.
	        """
	        B, T, _ = x.shape
	        q = self.wq(x).view(B, T, self.n_heads, self.head_dim)
	        k = self.wk(x).view(B, T, self.n_kv_heads, self.head_dim)
	        v = self.wv(x).view(B, T, self.n_kv_heads, self.head_dim)

	        q = apply_rope(q, cos, sin)
	        k = apply_rope(k, cos, sin)

	        has_past = cache_k is not None
	        if has_past:
	            k = torch.cat([cache_k, k], dim=1)
	            v = torch.cat([cache_v, v], dim=1)
	        new_cache_k, new_cache_v = (k, v) if return_cache else (None, None)

	        # Expand KV heads -> Q heads
	        k = k.repeat_interleave(self.n_rep, dim=2)
	        v = v.repeat_interleave(self.n_rep, dim=2)

	        # (B, n_heads, T, head_dim) for SDPA
	        q = q.transpose(1, 2)
	        k = k.transpose(1, 2)
	        v = v.transpose(1, 2)

	        # With a cache and a single new token every key is in the past, so no
	        # mask is needed. With a cache and several new tokens, the new tokens
	        # must still be causal among themselves: query i (absolute position
	        # S - T + i) may only see keys 0 .. S - T + i. `is_causal=True` cannot
	        # express this when T != S, so an explicit boolean mask is built.
	        attn_mask = None
	        if has_past and T > 1:
	            S = k.shape[2]
	            attn_mask = torch.ones(T, S, dtype=torch.bool, device=q.device).tril(diagonal=S - T)

	        out = F.scaled_dot_product_attention(
	            q, k, v,
	            attn_mask=attn_mask,
	            dropout_p=self.dropout_p if self.training else 0.0,
	            is_causal=not has_past,  # plain causal mask when there is no cache
	        )

	        out = out.transpose(1, 2).contiguous().view(B, T, -1)
	        return self.wo(out), new_cache_k, new_cache_v


	# SwiGLU FFN
	class SwiGLU(nn.Module):
	    """Gated feed-forward block: ``dropout(w2(silu(w1(x)) * w3(x)))``.

	    The hidden width is ``int(dim * ffn_dim_multiplier)`` rounded up to the
	    next multiple of 256.
	    """

	    def __init__(self, cfg: ModelConfig):
	        super().__init__()
	        requested = int(cfg.dim * cfg.ffn_dim_multiplier)
	        hidden = ((requested + 255) // 256) * 256
	        self.w1 = nn.Linear(cfg.dim, hidden, bias=False)
	        self.w2 = nn.Linear(hidden, cfg.dim, bias=False)
	        self.w3 = nn.Linear(cfg.dim, hidden, bias=False)
	        self.dropout = nn.Dropout(cfg.dropout)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the gated projection to ``x`` and return a tensor of the same width."""
	        gate = F.silu(self.w1(x))
	        value = self.w3(x)
	        projected = self.w2(gate * value)
	        return self.dropout(projected)


	# Transformer Block
	class TransformerBlock(nn.Module):
	    """Pre-norm transformer layer: attention then feed-forward, each with a residual add."""

	    def __init__(self, cfg: ModelConfig):
	        super().__init__()
	        self.attn_norm = RMSNorm(cfg.dim, cfg.norm_eps)
	        self.attn = GQAttention(cfg)
	        self.ffn_norm = RMSNorm(cfg.dim, cfg.norm_eps)
	        self.ffn = SwiGLU(cfg)

	    def _forward(self, x, cos, sin, cache_k, cache_v, return_cache):
	        """Run attention and feed-forward; return ``(hidden, new_cache_k, new_cache_v)``."""
	        normed = self.attn_norm(x)
	        attn_out, new_k, new_v = self.attn(
	            normed, cos, sin, cache_k, cache_v, return_cache=return_cache
	        )
	        hidden = x + attn_out
	        hidden = hidden + self.ffn(self.ffn_norm(hidden))
	        return hidden, new_k, new_v

	    def forward(self, x, cos, sin, cache_k=None, cache_v=None, use_grad_ckpt=False, return_cache=False):
	        """Apply the block, optionally under gradient checkpointing.

	        Checkpointing is used only when ``use_grad_ckpt`` is set and the module
	        is in training mode. On that path the supplied cache and
	        ``return_cache`` are ignored and both cache outputs are ``None``.
	        """
	        if not (use_grad_ckpt and self.training):
	            return self._forward(x, cos, sin, cache_k, cache_v, return_cache)

	        # Gradient checkpointing: activations are recomputed on backward instead
	        # of being stored. Only tensors go through the checkpoint call; the
	        # cache arguments are fixed to None inside the wrapped function.
	        def run_without_cache(h, c, s):
	            hidden, _, _ = self._forward(h, c, s, None, None, False)
	            return hidden

	        hidden = checkpoint(run_without_cache, x, cos, sin, use_reentrant=False)
	        return hidden, None, None


	# LLM Definition
	class LLM(nn.Module):
	    """Decoder-only language model.

	    Token embedding -> ``cfg.n_layers`` transformer blocks -> RMSNorm -> output
	    projection. The embedding and the output projection share one weight tensor.
	    """

	    def __init__(self, cfg: ModelConfig):
	        super().__init__()
	        self.cfg = cfg
	        self.embed = nn.Embedding(cfg.vocab_size, cfg.dim)
	        self.layers = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)])
	        self.norm = RMSNorm(cfg.dim, cfg.norm_eps)
	        self.lm_head = nn.Linear(cfg.dim, cfg.vocab_size, bias=False)
	        self.embed.weight = self.lm_head.weight  # weight tying

	        head_dim = cfg.dim // cfg.n_heads
	        # The RoPE tables hold 2 * max_seq_len positions.
	        cos, sin = precompute_rope_freqs(head_dim, cfg.max_seq_len * 2, cfg.rope_theta)
	        self.register_buffer("rope_cos", cos)
	        self.register_buffer("rope_sin", sin)

	        # NOTE(review): `embed` and `lm_head` share one tensor, so whichever of
	        # the two `_init_weights` visits last decides its final std - confirm
	        # this is intended.
	        self.apply(self._init_weights)

	    def _init_weights(self, m):
	        """Initialise Linear weights with a depth-scaled std and Embedding weights with std 0.02."""
	        if isinstance(m, nn.Linear):
	            nn.init.normal_(m.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.cfg.n_layers))
	        elif isinstance(m, nn.Embedding):
	            nn.init.normal_(m.weight, mean=0.0, std=0.02)

	    def forward(
	        self,
	        idx: torch.Tensor,
	        targets: Optional[torch.Tensor] = None,
	        cache: Optional[list] = None,
	        use_grad_ckpt: bool = False,
	        return_cache: bool = False,
	    ):
	        """Compute logits (and optionally the loss) for token ids ``idx`` of shape (B, T).

	        Args:
	            idx: token ids, shape (B, T).
	            targets: optional target ids with as many elements as ``idx``; when
	                given, the mean cross-entropy over all positions is returned.
	            cache: optional per-layer list of ``(k, v)`` pairs from an earlier
	                call; the new tokens are positioned after the cached ones.
	            use_grad_ckpt: forwarded to every layer.
	            return_cache: request the updated cache even when ``cache`` is None.

	        Returns:
	            ``(logits, loss, new_cache)``. ``loss`` is ``None`` without targets;
	            ``new_cache`` is ``None`` unless a cache was passed or requested.

	        Raises:
	            ValueError: if the requested positions exceed the RoPE tables.
	        """
	        B, T = idx.shape
	        x = self.embed(idx)

	        # An empty list, or a cache whose entries are None, counts as "no past".
	        has_past = bool(cache) and cache[0][0] is not None
	        pos_start = cache[0][0].shape[1] if has_past else 0
	        pos_end = pos_start + T
	        rope_len = self.rope_cos.shape[0]
	        if pos_end > rope_len:
	            # Slicing past the table would silently return too few rows and
	            # fail later with an unrelated shape error.
	            raise ValueError(
	                f"positions {pos_start}..{pos_end - 1} exceed the RoPE table length {rope_len}"
	            )
	        cos = self.rope_cos[pos_start:pos_end]
	        sin = self.rope_sin[pos_start:pos_end]

	        need_cache = return_cache or (cache is not None)
	        new_cache = [] if need_cache else None
	        for i, layer in enumerate(self.layers):
	            ck, cv = cache[i] if cache else (None, None)
	            x, nck, ncv = layer(
	                x,
	                cos,
	                sin,
	                ck,
	                cv,
	                use_grad_ckpt=use_grad_ckpt,
	                return_cache=need_cache,
	            )
	            if need_cache:
	                new_cache.append((nck, ncv))

	        x = self.norm(x)
	        logits = self.lm_head(x)

	        loss = None
	        if targets is not None:
	            # reshape (not view): targets are often a non-contiguous slice such
	            # as tokens[:, 1:], which view(-1) rejects.
	            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))

	        return logits, loss, new_cache

	    def param_count(self) -> int:
	        """Return the total number of elements across ``self.parameters()``."""
	        return sum(p.numel() for p in self.parameters())

	    @torch.no_grad()
	    def probe_attention_entropy(self, idx: torch.Tensor, max_probe_len: int = 256) -> float:
	        """
	        Estimate mean causal attention entropy from layer 0 on a short token window.
	        Lower entropy means sharper/more concentrated attention.

	        Only the last ``max_probe_len`` tokens of ``idx`` are used. Returns NaN
	        for an empty sequence and raises ``ValueError`` if ``idx`` is not 2-D.
	        """
	        if idx.ndim != 2:
	            raise ValueError(f"idx must be shape (B, T), got {tuple(idx.shape)}")
	        if idx.shape[1] == 0:
	            return float("nan")

	        probe_len = min(int(max_probe_len), int(idx.shape[1]))
	        idx = idx[:, -probe_len:]
	        B, T = idx.shape

	        x = self.embed(idx)
	        cos = self.rope_cos[:T]
	        sin = self.rope_sin[:T]
	        layer0 = self.layers[0]
	        attn = layer0.attn

	        # Recompute layer-0 queries and keys by hand so the full (T, T) score
	        # matrix is available; the fused attention call does not expose it.
	        h = layer0.attn_norm(x)
	        q = attn.wq(h).view(B, T, attn.n_heads, attn.head_dim)
	        k = attn.wk(h).view(B, T, attn.n_kv_heads, attn.head_dim)
	        q = apply_rope(q, cos, sin)
	        k = apply_rope(k, cos, sin)

	        k = k.repeat_interleave(attn.n_rep, dim=2)

	        q = q.transpose(1, 2).float()  # (B, H, T, D)
	        k = k.transpose(1, 2).float()  # (B, H, T, D)

	        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(attn.head_dim)
	        causal_mask = torch.triu(
	            torch.ones((T, T), device=scores.device, dtype=torch.bool), diagonal=1
	        )
	        scores = scores.masked_fill(causal_mask, float("-inf"))

	        probs = torch.softmax(scores, dim=-1)
	        # clamp avoids log(0) on the masked (zero-probability) entries
	        entropy = -(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1)
	        return float(entropy.mean().item())

	    @torch.no_grad()
	    def generate(
	        self,
	        idx: torch.Tensor,
	        max_new_tokens: int,
	        temperature: float = 0.8,
	        top_p: float = 0.9,
	        repetition_penalty: float = 1.1,
	        no_repeat_ngram_size: int = 3,
	    ):
	        """Autoregressively append ``max_new_tokens`` tokens to ``idx`` (B, T).

	        Args:
	            idx: prompt token ids, shape (B, T).
	            max_new_tokens: number of tokens to append.
	            temperature: ``0.0`` selects greedy argmax; otherwise logits are
	                divided by it before nucleus sampling.
	            top_p: cumulative-probability cutoff for nucleus sampling.
	            repetition_penalty: values above 1.0 down-weight every token id
	                already present in the sequence (prompt included).
	            no_repeat_ngram_size: when greater than 1, tokens that would repeat
	                an n-gram of this size are banned.

	        Returns:
	            The input ids with the generated tokens concatenated along dim 1.
	        """
	        cache = None
	        rope_len = self.rope_cos.shape[0]
	        for _ in range(max_new_tokens):
	            # The RoPE tables are finite. Once the cached sequence fills them,
	            # drop the cache and re-prime from the most recent max_seq_len
	            # tokens instead of indexing past the end of the tables.
	            if cache and cache[0][0] is not None and cache[0][0].shape[1] >= rope_len:
	                cache = None

	            idx_cond = idx[:, -self.cfg.max_seq_len:] if cache is None else idx[:, -1:]
	            logits, _, cache = self(idx_cond, cache=cache, return_cache=True)
	            logits = logits[:, -1, :]

	            # Discourage re-emitting tokens already present in the sequence.
	            if repetition_penalty > 1.0:
	                for b in range(idx.size(0)):
	                    used = idx[b].unique()
	                    used_logits = logits[b, used]
	                    # Dividing positive logits and multiplying negative ones
	                    # both move the logit towards "less likely".
	                    logits[b, used] = torch.where(
	                        used_logits > 0, used_logits / repetition_penalty, used_logits * repetition_penalty
	                    )

	            # Block tokens that would create repeated n-grams.
	            if no_repeat_ngram_size and no_repeat_ngram_size > 1 and idx.size(1) >= no_repeat_ngram_size - 1:
	                n = int(no_repeat_ngram_size)
	                for b in range(idx.size(0)):
	                    seq = idx[b].tolist()
	                    prefix = tuple(seq[-(n - 1):])
	                    banned = set()
	                    # Every earlier occurrence of the current (n-1)-token
	                    # suffix bans the token that followed it.
	                    for i in range(len(seq) - n + 1):
	                        if tuple(seq[i: i + n - 1]) == prefix:
	                            banned.add(seq[i + n - 1])
	                    if banned:
	                        logits[b, list(banned)] = float("-inf")

	            if temperature == 0.0:
	                next_tok = torch.argmax(logits, dim=-1, keepdim=True)
	            else:
	                logits = logits / temperature
	                probs = F.softmax(logits, dim=-1)
	                sorted_probs, sorted_idx = torch.sort(probs, descending=True)
	                cumsum = sorted_probs.cumsum(-1)
	                # Zero every token whose preceding cumulative mass already
	                # exceeds top_p; the top-1 token is always kept.
	                sorted_probs[cumsum - sorted_probs > top_p] = 0.0
	                sorted_probs /= sorted_probs.sum(-1, keepdim=True)
	                next_tok = sorted_idx.gather(-1, torch.multinomial(sorted_probs, 1))

	            idx = torch.cat([idx, next_tok], dim=1)
	        return idx