sathishphdai commited on
Commit
abedca1
·
verified ·
1 Parent(s): 190836e

Upload folder using huggingface_hub

Browse files
Files changed (10) hide show
  1. README.md +50 -0
  2. chat.py +90 -0
  3. config.json +18 -0
  4. config.py +102 -0
  5. model.py +223 -0
  6. model.safetensors +3 -0
  7. pytorch_model.bin +3 -0
  8. system_admin_tokenizer.json +0 -0
  9. tokenizer.json +0 -0
  10. tokenizer_config.json +8 -0
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: [en]
3
+ license: mit
4
+ tags:
5
+ - sysadmin
6
+ - linux
7
+ - windows-server
8
+ - networking
9
+ - security
10
+ - slm
11
+ - llama-style
12
+ - rope
13
+ - 5m-context
14
+ - from-scratch
15
+ - 1b-params
16
+ pipeline_tag: text-generation
17
+ ---
18
+
19
+ # System Admin-SLM: Role-Based Small Language Model
20
+
21
+ A **LLaMA-style transformer** (~1016.6M params, ~1.02B) trained from scratch for the **System Admin** role.
22
+ Supports up to **5M token context** via RoPE; trained with gradient checkpointing for memory efficiency.
23
+
24
+ ## Architecture
25
+ | Component | Value |
26
+ |-----------|-------|
27
+ | Architecture | LLaMA-style (RoPE + RMSNorm + SwiGLU) |
28
+ | Parameters | ~1016.6M (~1.02B) |
29
+ | Layers | 32 |
30
+ | Heads | 20 |
31
+ | Embedding | 1600 |
32
+ | Max Context | 5,000,000 tokens |
33
+ | Max Output | 5,000,000 tokens |
34
+ | Vocab | 18,841 BPE |
35
+ | Model Size | ~4 GB (fp32) |
36
+
37
+ ## Training
38
+ - Best eval loss: 5.795391702651978
39
+ - Trained with gradient checkpointing on Apple M4 (MPS)
40
+ - 3 epochs, batch_size=1, grad_accum=16
41
+
42
+ ## Usage
43
+ ```python
44
+ from huggingface_hub import hf_hub_download
45
+ from tokenizers import Tokenizer
46
+
47
+ model_path = hf_hub_download("sathishphdai/system-admin-slm-5m", "model.safetensors")
48
+ tokenizer_path = hf_hub_download("sathishphdai/system-admin-slm-5m", "system_admin_tokenizer.json")
49
+ tokenizer = Tokenizer.from_file(tokenizer_path)
50
+ ```
chat.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Interactive chat and demo inference for Role SLM."""
3
+
4
+ import torch
5
+ from tokenizers import Tokenizer
6
+ from config import cfg
7
+ from model import RoleSLM
8
+
9
+
10
def load_model(checkpoint_name="best_model.pt"):
    """Load a trained RoleSLM checkpoint and its tokenizer.

    Returns (model, tokenizer, device) with the model moved to the configured
    device and put in eval mode.
    Raises FileNotFoundError if the checkpoint file is missing.
    """
    device = torch.device(cfg.device)
    ckpt_path = cfg.checkpoint_dir / checkpoint_name
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # checkpoints from a trusted source.
    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
    # Overwrite the global config with the values the checkpoint was trained
    # with, so the architecture rebuilt below matches the saved weights.
    for key, val in ckpt.get("config", {}).items():
        if hasattr(cfg, key):
            setattr(cfg, key, val)

    model = RoleSLM()
    model.load_state_dict(ckpt["model_state_dict"], strict=False)
    model = model.to(device)
    model.eval()

    tok_path = cfg.tokenizer_dir / cfg.tokenizer_filename
    tokenizer = Tokenizer.from_file(str(tok_path))
    print(f"Model loaded: {model.count_parameters()/1e6:.2f}M params")
    return model, tokenizer, device
30
+
31
+
32
def generate_response(model, tokenizer, device, prompt, max_tokens=None,
                      temperature=0.8, top_k=50, top_p=0.9):
    """Encode *prompt*, sample a continuation from *model*, and decode it.

    Returns only the newly generated text with special tokens stripped.
    """
    if not max_tokens:
        max_tokens = min(cfg.max_new_tokens, 512)
    token_ids = list(tokenizer.encode(prompt).ids)
    # Drop a trailing <eos> (id 3) so the model continues the prompt.
    if token_ids and token_ids[-1] == 3:
        token_ids.pop()
    input_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    prompt_len = input_ids.shape[1]

    with torch.no_grad():
        output_ids = model.generate(input_ids, max_new_tokens=max_tokens,
                                    temperature=temperature, top_k=top_k, top_p=top_p)

    generated = output_ids[0][prompt_len:].tolist()
    text = tokenizer.decode(generated)
    for marker in ("<eos>", "<bos>", "<pad>"):
        text = text.replace(marker, "")
    return text.strip()
50
+
51
+
52
# Representative sysadmin prompts used by the "demo" command.
DEMO_PROMPTS = [
    'Linux system administration involves',
    'Server hardening best practices include',
    'Automated configuration management using',
    'Network troubleshooting steps include',
    'System monitoring tools help administrators by',
]
53
+
54
+
55
def demo_generation(model, tokenizer, device):
    """Run every DEMO_PROMPTS entry through the model and print the results."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Demo: {cfg.domain_name}-SLM Inference")
    print(f"{banner}\n")
    for idx, prompt in enumerate(DEMO_PROMPTS, 1):
        print(f"[{idx}] Prompt: {prompt}")
        reply = generate_response(model, tokenizer, device, prompt, max_tokens=256)
        print(f" Response: {reply}\n")
63
+
64
+
65
def interactive_chat():
    """REPL-style chat loop; 'quit' exits, 'demo' runs the canned prompts."""
    print("Loading model...")
    model, tokenizer, device = load_model()
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"{cfg.domain_name}-SLM Interactive Chat (type 'quit' to exit, 'demo' for demos)")
    print(f"{banner}\n")
    while True:
        # Ctrl-C anywhere in the loop body (including mid-generation) exits.
        try:
            user_input = input("You: ").strip()
            if not user_input:
                continue
            command = user_input.lower()
            if command == "quit":
                print("Goodbye!")
                break
            if command == "demo":
                demo_generation(model, tokenizer, device)
                continue
            reply = generate_response(model, tokenizer, device, user_input, max_tokens=512)
            print(f"SLM: {reply}\n")
        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
87
+
88
+
89
# Script entry point: start the interactive REPL.
if __name__ == "__main__":
    interactive_chat()
config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RoleSLM"
4
+ ],
5
+ "model_type": "system_admin-slm",
6
+ "domain": "System Admin",
7
+ "vocab_size": 18841,
8
+ "n_layer": 32,
9
+ "n_head": 20,
10
+ "n_embd": 1600,
11
+ "block_size": 512,
12
+ "dropout": 0.05,
13
+ "bias": false,
14
+ "ffn_multiplier": 2.667,
15
+ "max_position_embeddings": 5000000,
16
+ "rope_theta": 5000000.0,
17
+ "n_parameters": 1016566400
18
+ }
config.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Configuration for System-Admin-SLM: A Role-Based SLM for System Admin.
4
+ ~1B params, LLaMA-style architecture with RoPE — supports up to 5M token context.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+
12
@dataclass
class SLMConfig:
    """All hyperparameters and paths in one place."""

    # ── Project paths ──────────────────────────────────────────────
    project_dir: Path = Path(__file__).resolve().parent
    # The three directories below default to subdirectories of project_dir;
    # they are filled in (and created on disk) by __post_init__.
    data_dir: Optional[Path] = field(default=None)
    tokenizer_dir: Optional[Path] = field(default=None)
    checkpoint_dir: Optional[Path] = field(default=None)

    # ── Domain ─────────────────────────────────────────────────────
    domain_name: str = "System Admin"
    domain_slug: str = "system_admin"
    tokenizer_filename: str = "system_admin_tokenizer.json"

    # ── Tokenizer ──────────────────────────────────────────────────
    # NOTE(review): 32_768 is the training-time BPE budget; the shipped
    # config.json reports an actual vocab of 18,841 — checkpoints appear to
    # override this at load time (see chat.load_model). Verify before reuse.
    vocab_size: int = 32_768
    min_frequency: int = 2
    special_tokens: list = field(
        default_factory=lambda: [
            "<pad>", "<unk>", "<bos>", "<eos>",
            "<|system|>", "<|user|>", "<|assistant|>",
        ]
    )

    # ── Model (~1B params, LLaMA-style with RoPE) ─────────────────
    n_layer: int = 32
    n_head: int = 20
    n_embd: int = 1600
    block_size: int = 1_000_000  # 1M input token context window
    dropout: float = 0.05
    bias: bool = False
    ffn_multiplier: float = 2.667

    # ── RoPE ───────────────────────────────────────────────────────
    max_position_embeddings: int = 5_000_000  # 5M context window via RoPE
    rope_theta: float = 5_000_000.0  # Scaled for 5M context window

    # ── Sliding Window ─────────────────────────────────────────────
    sliding_window: Optional[int] = None

    # ── Gradient Checkpointing (essential for 1B on 24GB) ──────────
    gradient_checkpointing: bool = True

    # ── Training ───────────────────────────────────────────────────
    batch_size: int = 1
    gradient_accumulation_steps: int = 16
    learning_rate: float = 2e-4
    weight_decay: float = 0.1
    max_epochs: int = 3
    dataset_stride: int = 512  # Training stride
    warmup_steps: int = 100
    grad_clip: float = 1.0
    eval_interval: int = 50
    eval_samples: int = 10
    log_interval: int = 10
    device: str = "auto"  # resolved to cuda/mps/cpu in __post_init__

    # ── Generation ─────────────────────────────────────────────────
    max_new_tokens: int = 5_000_000  # 5M max output tokens
    temperature: float = 0.8
    top_k: int = 50
    top_p: float = 0.9

    # ── HuggingFace ────────────────────────────────────────────────
    hf_repo_name: str = "system-admin-slm-5m"
    hf_model_card_tags: list = field(default_factory=lambda: ['sysadmin', 'linux', 'windows-server', 'networking', 'security', 'slm', 'llama-style', 'rope', '5m-context', 'from-scratch', '1b-params'])

    def __post_init__(self):
        """Fill in derived paths, create directories, and resolve 'auto' device.

        NOTE(review): instantiating the config creates directories on disk —
        a side effect that happens at import time via the module-level `cfg`.
        """
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        if self.tokenizer_dir is None:
            self.tokenizer_dir = self.project_dir / "tokenizer"
        if self.checkpoint_dir is None:
            self.checkpoint_dir = self.project_dir / "checkpoints"

        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.tokenizer_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

        if self.device == "auto":
            # Local import keeps the config module importable without torch
            # at definition time.
            import torch
            if torch.cuda.is_available():
                self.device = "cuda"
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
100
+
101
+
102
# Shared singleton config, imported as `from config import cfg` elsewhere.
cfg = SLMConfig()
model.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ model.py — Role SLM Transformer (~1B params) with RoPE + Gradient Checkpointing
4
+ ================================================================================
5
+ Supports context lengths up to 5M tokens via:
6
+ * RoPE (no fixed position embedding table)
7
+ * RMSNorm (more efficient than LayerNorm)
8
+ * SwiGLU activation (better training dynamics)
9
+ * Flash Attention via PyTorch scaled_dot_product_attention
10
+ * Gradient checkpointing for memory-efficient training on 24GB
11
+ """
12
+
13
+ import math
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from torch.utils.checkpoint import checkpoint as grad_checkpoint
18
+ from typing import Optional, Tuple
19
+ from config import cfg
20
+
21
+
22
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean-centering, no bias).

    Each feature vector is scaled by the inverse of its RMS, then multiplied
    by a learned per-dimension gain. The statistics are computed in float32
    for numerical stability and the result is cast back to the input dtype.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_fp32 = x.float()
        inv_rms = torch.rsqrt(x_fp32.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return (x_fp32 * inv_rms).type_as(x) * self.weight
31
+
32
+
33
def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, device=None):
    """Precompute cos/sin tables for rotary position embeddings.

    Returns two (max_seq_len, dim // 2) tensors holding cos(m * w_i) and
    sin(m * w_i) for every position m and inverse frequency w_i.
    """
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device).float() / dim))
    positions = torch.arange(max_seq_len, device=device).float()
    angles = torch.outer(positions, inv_freq)
    return angles.cos(), angles.sin()
38
+
39
+
40
def apply_rope(x, cos, sin):
    """Rotate (B, H, T, D) query/key tensors by the precomputed RoPE tables.

    Uses the "rotate half" layout: the head dimension is split into two
    halves that act as the (real, imaginary) parts of each rotated pair.
    """
    seq_len, head_dim = x.shape[2], x.shape[3]
    cos_t = cos[:seq_len].unsqueeze(0).unsqueeze(0)
    sin_t = sin[:seq_len].unsqueeze(0).unsqueeze(0)
    half = head_dim // 2
    re, im = x[..., :half], x[..., half:]
    rotated_re = re * cos_t - im * sin_t
    rotated_im = im * cos_t + re * sin_t
    return torch.cat([rotated_re, rotated_im], dim=-1)
48
+
49
+
50
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings.

    Uses PyTorch's fused scaled_dot_product_attention when available, with a
    manual masked-softmax fallback for older PyTorch versions.
    """

    def __init__(self):
        super().__init__()
        assert cfg.n_embd % cfg.n_head == 0
        self.n_head = cfg.n_head
        self.head_dim = cfg.n_embd // cfg.n_head
        self.q_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.k_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.v_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.out_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.resid_drop = nn.Dropout(cfg.dropout)

    def forward(self, x, rope_cos, rope_sin):
        B, T, C = x.shape
        # Project and reshape to (B, n_head, T, head_dim).
        q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        # Position information is injected into queries and keys only.
        q = apply_rope(q, rope_cos, rope_sin)
        k = apply_rope(k, rope_cos, rope_sin)
        if hasattr(F, 'scaled_dot_product_attention'):
            y = F.scaled_dot_product_attention(q, k, v,
                dropout_p=cfg.dropout if self.training else 0.0, is_causal=True)
        else:
            scale = 1.0 / math.sqrt(self.head_dim)
            att = (q @ k.transpose(-2, -1)) * scale
            mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
            att = att.masked_fill(mask.unsqueeze(0).unsqueeze(0), float('-inf'))
            att = F.softmax(att, dim=-1)
            # Fix: apply attention dropout in the fallback path too, matching
            # the SDPA branch above (the original fallback skipped it, so the
            # two paths diverged during training).
            att = F.dropout(att, p=cfg.dropout, training=self.training)
            y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_drop(self.out_proj(y))
81
+
82
+
83
class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward network: down(silu(gate(x)) * up(x)), with dropout."""

    def __init__(self):
        super().__init__()
        hidden = int(cfg.n_embd * getattr(cfg, 'ffn_multiplier', 2.667))
        if hidden % 64:
            hidden += 64 - hidden % 64  # round up to a multiple of 64
        self.gate_proj = nn.Linear(cfg.n_embd, hidden, bias=False)
        self.up_proj = nn.Linear(cfg.n_embd, hidden, bias=False)
        self.down_proj = nn.Linear(hidden, cfg.n_embd, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x):
        gated = F.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.dropout(self.down_proj(gated))
95
+
96
+
97
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention then SwiGLU FFN, each residual."""

    def __init__(self):
        super().__init__()
        self.attn_norm = RMSNorm(cfg.n_embd)
        self.attn = CausalSelfAttention()
        self.ffn_norm = RMSNorm(cfg.n_embd)
        self.ffn = SwiGLUFFN()

    def forward(self, x, rope_cos, rope_sin):
        h = x + self.attn(self.attn_norm(x), rope_cos, rope_sin)
        return h + self.ffn(self.ffn_norm(h))
109
+
110
+
111
class RoleSLM(nn.Module):
    """Role-Based Small Language Model — ~1B params, LLaMA-style with gradient checkpointing."""

    def __init__(self):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.n_embd)
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([TransformerBlock() for _ in range(cfg.n_layer)])
        self.norm = RMSNorm(cfg.n_embd)
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
        self.tok_emb.weight = self.lm_head.weight  # Weight tying

        self.use_checkpointing = getattr(cfg, 'gradient_checkpointing', True)

        head_dim = cfg.n_embd // cfg.n_head
        max_pos = getattr(cfg, 'max_position_embeddings', 1_000_000)
        rope_theta = getattr(cfg, 'rope_theta', 10000.0)
        # Only a modest RoPE table is precomputed up front; _extend_rope
        # grows it lazily if a longer sequence is ever seen.
        precompute_len = min(max_pos, cfg.block_size * 2)
        cos, sin = precompute_rope_freqs(head_dim, precompute_len, theta=rope_theta)
        self.register_buffer("rope_cos", cos, persistent=False)
        self.register_buffer("rope_sin", sin, persistent=False)
        self._rope_max_len = precompute_len
        self._rope_theta = rope_theta
        self._head_dim = head_dim
        self.apply(self._init_weights)

        n_params = sum(p.numel() for p in self.parameters())
        print(f"{cfg.domain_name}-SLM initialized: {n_params/1e6:.2f}M parameters ({n_params/1e9:.3f}B)")
        print(f" Architecture: {cfg.n_layer}L / {cfg.n_head}H / {cfg.n_embd}D")
        print(f" Gradient checkpointing: {self.use_checkpointing}")
        print(f" Max context: {max_pos:,} tokens (via RoPE)")
        print(f" Estimated model size: {n_params * 4 / 1e9:.2f} GB (fp32)")

    def _init_weights(self, module):
        """GPT-2-style init: normal(0, 0.02) for linear/embedding weights."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _extend_rope(self, seq_len, device):
        """Grow the cached RoPE tables (at least doubling) to cover seq_len."""
        if seq_len > self._rope_max_len:
            new_len = max(seq_len, self._rope_max_len * 2)
            cos, sin = precompute_rope_freqs(self._head_dim, new_len,
                                             theta=self._rope_theta, device=device)
            self.rope_cos = cos
            self.rope_sin = sin
            self._rope_max_len = new_len

    def _block_forward(self, block, x, rope_cos, rope_sin):
        """Wrapper for gradient checkpointing."""
        return block(x, rope_cos, rope_sin)

    def forward(self, idx, targets=None):
        """Return (logits, loss); loss is None unless targets are given.

        idx: (B, T) int64 token ids. targets, if given, are shifted labels of
        the same shape; positions labeled -1 are ignored by the loss.
        """
        B, T = idx.shape
        device = idx.device
        self._extend_rope(T, device)
        x = self.drop(self.tok_emb(idx))
        rope_cos = self.rope_cos[:T].to(device)
        rope_sin = self.rope_sin[:T].to(device)
        for block in self.blocks:
            if self.use_checkpointing and self.training:
                x = grad_checkpoint(self._block_forward, block, x, rope_cos, rope_sin,
                                    use_reentrant=False)
            else:
                x = block(x, rope_cos, rope_sin)
        x = self.norm(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=0.8, top_k=50, top_p=0.9,
                 eos_token_id=3):
        """Autoregressively sample up to max_new_tokens continuations of idx.

        temperature == 0 selects greedy decoding; otherwise top-k then
        nucleus (top-p) filtering is applied before multinomial sampling.
        eos_token_id (default 3, the tokenizer's <eos> — was hard-coded in
        the original) stops generation early for single-sequence batches.
        """
        prev_checkpointing = self.use_checkpointing
        self.use_checkpointing = False  # No checkpointing during generation
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx if idx.size(1) <= cfg.block_size else idx[:, -cfg.block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]
                if temperature == 0:
                    idx_next = logits.argmax(dim=-1, keepdim=True)
                else:
                    logits = logits / temperature
                    if top_k > 0:
                        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                        logits[logits < v[:, [-1]]] = float('-inf')
                    if top_p < 1.0:
                        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                        sorted_indices_to_remove = cumulative_probs > top_p
                        # Shift right so the first token crossing the
                        # threshold is still kept.
                        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                        sorted_indices_to_remove[..., 0] = 0
                        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                        logits[indices_to_remove] = float('-inf')
                    probs = F.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
                # Fix: only early-stop for batch size 1 — the original
                # unconditional .item() raised RuntimeError for batched input.
                if idx.size(0) == 1 and idx_next.item() == eos_token_id:
                    break
        finally:
            # Fix: restore the flag even if generation raises; the original
            # left checkpointing disabled on error.
            self.use_checkpointing = prev_checkpointing
        return idx

    def count_parameters(self):
        """Total parameter count (the tied embedding/head matrix counts once)."""
        return sum(p.numel() for p in self.parameters())
217
+
218
+
219
# Smoke test: build the model and run one tiny forward pass with dummy ids.
if __name__ == "__main__":
    model = RoleSLM()
    x = torch.randint(0, cfg.vocab_size, (1, 32))
    logits, loss = model(x, x)
    print(f"Test forward: logits={logits.shape}, loss={loss.item():.4f}")
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c37efec5f714363958d855c92cb131c00aa8dc31ea09c83f013ffd641a297990
3
+ size 4186878264
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:217f370c5c5db93d56c09bd8d623cc8af8cc0355d87da08662e14cd154582d5c
3
+ size 4066376683
system_admin_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "bos_token": "<bos>",
4
+ "eos_token": "<eos>",
5
+ "unk_token": "<unk>",
6
+ "pad_token": "<pad>",
7
+ "model_max_length": 5000000
8
+ }