Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

README.md +49 -0
chat.py +90 -0
config.json +18 -0
config.py +102 -0
model.py +223 -0
model.safetensors +3 -0
pytorch_model.bin +3 -0
software_engineer_tokenizer.json +0 -0
tokenizer.json +0 -0
tokenizer_config.json +8 -0

README.md ADDED Viewed

	@@ -0,0 +1,49 @@

+---
+language: [en]
+license: mit
+tags:
+- software-engineering
+- programming
+- algorithms
+- system-design
+- slm
+- llama-style
+- rope
+- 1m-context
+- from-scratch
+- 1b-params
+pipeline_tag: text-generation
+---
+# Software Engineer-SLM: Role-Based Small Language Model
+A **LLaMA-style transformer** (~989.9M params, ~0.99B) trained from scratch for the **Software Engineer** role.
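+RoPE means there is no fixed position-embedding table, so the repository advertises up to **1M token context**; note, however, that the shipped config uses `block_size = 512` and `generate()` in `model.py` crops the prompt to the last `block_size` tokens. Gradient checkpointing is used to reduce memory during training.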
+## Architecture
+| Component | Value |
+|-----------|-------|
+| Architecture | LLaMA-style (RoPE + RMSNorm + SwiGLU) |
+| Parameters | ~989.9M (~0.99B) |
+| Layers | 32 |
+| Heads | 20 |
+| Embedding | 1600 |
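+| Max Context | 100,000,000,000 tokens (`max_position_embeddings`, the configured RoPE position limit; the model window `block_size` is 512) |
+| Max Output | 1,000,000 tokens (`max_new_tokens` config default; `chat.py` caps a response at 512) |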
+| Vocab | 2,180 BPE |
+| Model Size | ~4 GB (fp32) |
+## Training
+- Best eval loss: 0.301249697804451
+- Trained with gradient checkpointing on Apple M4 (MPS)
+- 5 epochs, batch_size=1, grad_accum=16
+## Usage
+```python
+from huggingface_hub import hf_hub_download
+from tokenizers import Tokenizer
+model_path = hf_hub_download("sathishphdai/software-engineer-slm-1m", "model.safetensors")
+tokenizer_path = hf_hub_download("sathishphdai/software-engineer-slm-1m", "software_engineer_tokenizer.json")
+tokenizer = Tokenizer.from_file(tokenizer_path)
+```

chat.py ADDED Viewed

	@@ -0,0 +1,90 @@

+#!/usr/bin/env python3
+"""Interactive chat and demo inference for Role SLM."""
+import torch
+from tokenizers import Tokenizer
+from config import cfg
+from model import RoleSLM
+def load_model(checkpoint_name="best_model.pt"):
+    device = torch.device(cfg.device)
+    ckpt_path = cfg.checkpoint_dir / checkpoint_name
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
+    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
+    for key, val in ckpt.get("config", {}).items():
+        if hasattr(cfg, key):
+            setattr(cfg, key, val)
+    model = RoleSLM()
+    model.load_state_dict(ckpt["model_state_dict"], strict=False)
+    model = model.to(device)
+    model.eval()
+    tok_path = cfg.tokenizer_dir / cfg.tokenizer_filename
+    tokenizer = Tokenizer.from_file(str(tok_path))
+    print(f"Model loaded: {model.count_parameters()/1e6:.2f}M params")
+    return model, tokenizer, device
+def generate_response(model, tokenizer, device, prompt, max_tokens=None,
+                      temperature=0.8, top_k=50, top_p=0.9):
+    max_tokens = max_tokens or min(cfg.max_new_tokens, 512)
+    encoded = tokenizer.encode(prompt)
+    ids = encoded.ids
+    if ids and ids[-1] == 3:
+        ids = ids[:-1]
+    input_ids = torch.tensor([ids], dtype=torch.long, device=device)
+    input_len = input_ids.shape[1]
+    with torch.no_grad():
+        output_ids = model.generate(input_ids, max_new_tokens=max_tokens,
+                                    temperature=temperature, top_k=top_k, top_p=top_p)
+    new_tokens = output_ids[0][input_len:].tolist()
+    response = tokenizer.decode(new_tokens)
+    response = response.replace("<eos>", "").replace("<bos>", "").replace("<pad>", "").strip()
+    return response
+DEMO_PROMPTS = ['Object-oriented design principles include', 'Microservices architecture benefits include', 'The SOLID principles in software engineering are', 'Database indexing improves query performance by', 'RESTful API design best practices include']
+def demo_generation(model, tokenizer, device):
+    print(f"\n{'='*60}")
+    print(f"Demo: {cfg.domain_name}-SLM Inference")
+    print(f"{'='*60}\n")
+    for i, prompt in enumerate(DEMO_PROMPTS, 1):
+        print(f"[{i}] Prompt: {prompt}")
+        response = generate_response(model, tokenizer, device, prompt, max_tokens=256)
+        print(f"    Response: {response}\n")
+def interactive_chat():
+    print("Loading model...")
+    model, tokenizer, device = load_model()
+    print(f"\n{'='*60}")
+    print(f"{cfg.domain_name}-SLM Interactive Chat (type 'quit' to exit, 'demo' for demos)")
+    print(f"{'='*60}\n")
+    while True:
+        try:
+            user_input = input("You: ").strip()
+            if not user_input:
+                continue
+            if user_input.lower() == "quit":
+                print("Goodbye!")
+                break
+            if user_input.lower() == "demo":
+                demo_generation(model, tokenizer, device)
+                continue
+            response = generate_response(model, tokenizer, device, user_input, max_tokens=512)
+            print(f"SLM: {response}\n")
+        except KeyboardInterrupt:
+            print("\nGoodbye!")
+            break
+# Script entry point: start the interactive chat only when this file is run directly,
+# not when it is imported.
+if __name__ == "__main__":
+    interactive_chat()

config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "architectures": [
+    "RoleSLM"
+  ],
+  "model_type": "software_engineer-slm",
+  "domain": "Software Engineer",
+  "vocab_size": 2180,
+  "n_layer": 32,
+  "n_head": 20,
+  "n_embd": 1600,
+  "block_size": 512,
+  "dropout": 0.05,
+  "bias": false,
+  "ffn_multiplier": 2.667,
+  "max_position_embeddings": 100000000000,
+  "rope_theta": 50000000000.0,
+  "n_parameters": 989908800
+}

config.py ADDED Viewed

	@@ -0,0 +1,102 @@

+#!/usr/bin/env python3
+"""
+Configuration for Software-Engineer-SLM: A Role-Based SLM for Software Engineer.
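+~1B params, LLaMA-style architecture with RoPE. The repository advertises up to 1M token context; the configured RoPE limit (`max_position_embeddings`) is 100B positions, while the model window is `block_size` (512 tokens).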
+"""
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+@dataclass
+class SLMConfig:
+    """All hyperparameters and paths in one place."""
+    # ── Project paths ──────────────────────────────────────────────
+    project_dir: Path = Path(__file__).resolve().parent
+    data_dir: Path = field(default=None)
+    tokenizer_dir: Path = field(default=None)
+    checkpoint_dir: Path = field(default=None)
+    # ── Domain ─────────────────────────────────────────────────────
+    domain_name: str = "Software Engineer"
+    domain_slug: str = "software_engineer"
+    tokenizer_filename: str = "software_engineer_tokenizer.json"
+    # ── Tokenizer ──────────────────────────────────────────────────
+    vocab_size: int = 32_768
+    min_frequency: int = 2
+    special_tokens: list = field(
+        default_factory=lambda: [
+            "<pad>", "<unk>", "<bos>", "<eos>",
+            "<|system|>", "<|user|>", "<|assistant|>",
+        ]
+    )
+    # ── Model (~1B params, LLaMA-style with RoPE) ─────────────────
+    n_layer: int = 32
+    n_head: int = 20
+    n_embd: int = 1600
+    block_size: int = 512
+    dropout: float = 0.05
+    bias: bool = False
+    ffn_multiplier: float = 2.667
+    # ── RoPE ───────────────────────────────────────────────────────
+    max_position_embeddings: int = 100_000_000_000  # 100B tokens via RoPE
+    rope_theta: float = 50_000_000_000.0  # Scaled for 100B context
+    # ── Sliding Window ─────────────────────────────────────────────
+    sliding_window: Optional[int] = None
+    # ── Gradient Checkpointing (essential for 1B on 24GB) ──────────
+    gradient_checkpointing: bool = True
+    # ── Training ───────────────────────────────────────────────────
+    batch_size: int = 1
+    gradient_accumulation_steps: int = 16
+    learning_rate: float = 2e-4
+    weight_decay: float = 0.1
+    max_epochs: int = 5
+    dataset_stride: int = 256
+    warmup_steps: int = 100
+    grad_clip: float = 1.0
+    eval_interval: int = 50
+    eval_samples: int = 10
+    log_interval: int = 10
+    device: str = "auto"
+    # ── Generation ─────────────────────────────────────────────────
+    max_new_tokens: int = 1_000_000  # 1M output tokens
+    temperature: float = 0.8
+    top_k: int = 50
+    top_p: float = 0.9
+    # ── HuggingFace ────────────────────────────────────────────────
+    hf_repo_name: str = "software-engineer-slm-1m"
+    hf_model_card_tags: list = field(default_factory=lambda: ['software-engineering', 'programming', 'algorithms', 'system-design', 'slm', 'llama-style', 'rope', '1m-context', 'from-scratch', '1b-params'])
+    def __post_init__(self):
+        if self.data_dir is None:
+            self.data_dir = self.project_dir / "data"
+        if self.tokenizer_dir is None:
+            self.tokenizer_dir = self.project_dir / "tokenizer"
+        if self.checkpoint_dir is None:
+            self.checkpoint_dir = self.project_dir / "checkpoints"
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        self.tokenizer_dir.mkdir(parents=True, exist_ok=True)
+        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        if self.device == "auto":
+            import torch
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+# Module-level configuration instance imported elsewhere as `from config import cfg`.
+# Constructing it runs __post_init__, which creates the data/tokenizer/checkpoint
+# directories and resolves device "auto" to a concrete device string.
+cfg = SLMConfig()

model.py ADDED Viewed

	@@ -0,0 +1,223 @@

+#!/usr/bin/env python3
+"""
+model.py — Role SLM Transformer (~1B params) with RoPE + Gradient Checkpointing
+================================================================================
+Targets long contexts (advertised up to 1M tokens); the main architectural features are:
+  * RoPE (no fixed position embedding table)
+  * RMSNorm (more efficient than LayerNorm)
+  * SwiGLU activation (better training dynamics)
+  * Flash Attention via PyTorch scaled_dot_product_attention
+  * Gradient checkpointing for memory-efficient training on 24GB
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint as grad_checkpoint
+from typing import Optional, Tuple
+from config import cfg
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        norm = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
+        return (x.float() * norm).type_as(x) * self.weight
+def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, device=None):
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device).float() / dim))
+    t = torch.arange(max_seq_len, device=device).float()
+    freqs = torch.outer(t, freqs)
+    return freqs.cos(), freqs.sin()
+def apply_rope(x, cos, sin):
+    seq_len = x.shape[2]
+    head_dim = x.shape[3]
+    cos = cos[:seq_len].unsqueeze(0).unsqueeze(0)
+    sin = sin[:seq_len].unsqueeze(0).unsqueeze(0)
+    x1 = x[..., :head_dim // 2]
+    x2 = x[..., head_dim // 2:]
+    return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)
+class CausalSelfAttention(nn.Module):
+    def __init__(self):
+        super().__init__()
+        assert cfg.n_embd % cfg.n_head == 0
+        self.n_head = cfg.n_head
+        self.head_dim = cfg.n_embd // cfg.n_head
+        self.q_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
+        self.k_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
+        self.v_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
+        self.out_proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
+        self.resid_drop = nn.Dropout(cfg.dropout)
+    def forward(self, x, rope_cos, rope_sin):
+        B, T, C = x.shape
+        q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        q = apply_rope(q, rope_cos, rope_sin)
+        k = apply_rope(k, rope_cos, rope_sin)
+        if hasattr(F, 'scaled_dot_product_attention'):
+            y = F.scaled_dot_product_attention(q, k, v,
+                dropout_p=cfg.dropout if self.training else 0.0, is_causal=True)
+        else:
+            scale = 1.0 / math.sqrt(self.head_dim)
+            att = (q @ k.transpose(-2, -1)) * scale
+            mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
+            att = att.masked_fill(mask.unsqueeze(0).unsqueeze(0), float('-inf'))
+            att = F.softmax(att, dim=-1)
+            y = att @ v
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        return self.resid_drop(self.out_proj(y))
+class SwiGLUFFN(nn.Module):
+    def __init__(self):
+        super().__init__()
+        hidden_dim = int(cfg.n_embd * getattr(cfg, 'ffn_multiplier', 2.667))
+        hidden_dim = ((hidden_dim + 63) // 64) * 64
+        self.gate_proj = nn.Linear(cfg.n_embd, hidden_dim, bias=False)
+        self.up_proj = nn.Linear(cfg.n_embd, hidden_dim, bias=False)
+        self.down_proj = nn.Linear(hidden_dim, cfg.n_embd, bias=False)
+        self.dropout = nn.Dropout(cfg.dropout)
+    def forward(self, x):
+        return self.dropout(self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)))
+class TransformerBlock(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.attn_norm = RMSNorm(cfg.n_embd)
+        self.attn = CausalSelfAttention()
+        self.ffn_norm = RMSNorm(cfg.n_embd)
+        self.ffn = SwiGLUFFN()
+    def forward(self, x, rope_cos, rope_sin):
+        x = x + self.attn(self.attn_norm(x), rope_cos, rope_sin)
+        x = x + self.ffn(self.ffn_norm(x))
+        return x
+class RoleSLM(nn.Module):
+    """Role-Based Small Language Model — ~1B params, LLaMA-style with gradient checkpointing."""
+    def __init__(self):
+        super().__init__()
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.n_embd)
+        self.drop = nn.Dropout(cfg.dropout)
+        self.blocks = nn.ModuleList([TransformerBlock() for _ in range(cfg.n_layer)])
+        self.norm = RMSNorm(cfg.n_embd)
+        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
+        self.tok_emb.weight = self.lm_head.weight  # Weight tying
+        self.use_checkpointing = getattr(cfg, 'gradient_checkpointing', True)
+        head_dim = cfg.n_embd // cfg.n_head
+        max_pos = getattr(cfg, 'max_position_embeddings', 1_000_000)
+        rope_theta = getattr(cfg, 'rope_theta', 10000.0)
+        precompute_len = min(max_pos, cfg.block_size * 2)
+        cos, sin = precompute_rope_freqs(head_dim, precompute_len, theta=rope_theta)
+        self.register_buffer("rope_cos", cos, persistent=False)
+        self.register_buffer("rope_sin", sin, persistent=False)
+        self._rope_max_len = precompute_len
+        self._rope_theta = rope_theta
+        self._head_dim = head_dim
+        self.apply(self._init_weights)
+        n_params = sum(p.numel() for p in self.parameters())
+        print(f"{cfg.domain_name}-SLM initialized: {n_params/1e6:.2f}M parameters ({n_params/1e9:.3f}B)")
+        print(f"   Architecture: {cfg.n_layer}L / {cfg.n_head}H / {cfg.n_embd}D")
+        print(f"   Gradient checkpointing: {self.use_checkpointing}")
+        print(f"   Max context: {max_pos:,} tokens (via RoPE)")
+        print(f"   Estimated model size: {n_params * 4 / 1e9:.2f} GB (fp32)")
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+    def _extend_rope(self, seq_len, device):
+        if seq_len > self._rope_max_len:
+            new_len = max(seq_len, self._rope_max_len * 2)
+            cos, sin = precompute_rope_freqs(self._head_dim, new_len,
+                                             theta=self._rope_theta, device=device)
+            self.rope_cos = cos
+            self.rope_sin = sin
+            self._rope_max_len = new_len
+    def _block_forward(self, block, x, rope_cos, rope_sin):
+        """Wrapper for gradient checkpointing."""
+        return block(x, rope_cos, rope_sin)
+    def forward(self, idx, targets=None):
+        B, T = idx.shape
+        device = idx.device
+        self._extend_rope(T, device)
+        x = self.drop(self.tok_emb(idx))
+        rope_cos = self.rope_cos[:T].to(device)
+        rope_sin = self.rope_sin[:T].to(device)
+        for block in self.blocks:
+            if self.use_checkpointing and self.training:
+                x = grad_checkpoint(self._block_forward, block, x, rope_cos, rope_sin,
+                                    use_reentrant=False)
+            else:
+                x = block(x, rope_cos, rope_sin)
+        x = self.norm(x)
+        logits = self.lm_head(x)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        return logits, loss
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=0.8, top_k=50, top_p=0.9):
+        self.use_checkpointing = False  # No checkpointing during generation
+        for _ in range(max_new_tokens):
+            idx_cond = idx if idx.size(1) <= cfg.block_size else idx[:, -cfg.block_size:]
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :]
+            if temperature == 0:
+                idx_next = logits.argmax(dim=-1, keepdim=True)
+            else:
+                logits = logits / temperature
+                if top_k > 0:
+                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = float('-inf')
+                if top_p < 1.0:
+                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                    sorted_indices_to_remove = cumulative_probs > top_p
+                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                    sorted_indices_to_remove[..., 0] = 0
+                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                    logits[indices_to_remove] = float('-inf')
+                probs = F.softmax(logits, dim=-1)
+                idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat([idx, idx_next], dim=1)
+            if idx_next.item() == 3:  # <eos>
+                break
+        self.use_checkpointing = getattr(cfg, 'gradient_checkpointing', True)
+        return idx
+    def count_parameters(self):
+        return sum(p.numel() for p in self.parameters())
+# Smoke test when run as a script: build the model and run a single forward pass on
+# random token ids, reusing the same tensor as targets so that a loss is computed.
+if __name__ == "__main__":
+    model = RoleSLM()
+    x = torch.randint(0, cfg.vocab_size, (1, 32))
+    logits, loss = model(x, x)
+    print(f"Test forward: logits={logits.shape}, loss={loss.item():.4f}")

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a78c17d6e3ca41f28539d99e3b3564b4e5ed70e1da27e18513027006902169ea
+size 3973617464

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fb503902b7724de97928ef2e4a40e64cd6416a03caacd26db0b77092a6f58509
+size 3959746283

software_engineer_tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "bos_token": "<bos>",
+  "eos_token": "<eos>",
+  "unk_token": "<unk>",
+  "pad_token": "<pad>",
+  "model_max_length": 100000000000
+}