Upload folder using huggingface_hub
Browse files- config.py +36 -0
- inference.py +124 -0
- model.py +129 -0
config.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GuppyLM configuration."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
class GuppyConfig:
    """Architecture hyperparameters for GuppyLM."""

    vocab_size: int = 4096
    max_seq_len: int = 128  # maximum context length (learned positional embeddings)
    d_model: int = 384      # embedding / residual-stream width
    n_layers: int = 6       # number of transformer blocks
    n_heads: int = 6        # attention heads; d_model must be divisible by n_heads
    ffn_hidden: int = 768   # feed-forward inner width
    dropout: float = 0.1

    # Special tokens
    pad_id: int = 0
    bos_id: int = 1  # <|im_start|>
    eos_id: int = 2  # <|im_end|>
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class TrainConfig:
    """Training-loop hyperparameters and run settings."""

    batch_size: int = 32
    learning_rate: float = 3e-4  # peak LR
    min_lr: float = 3e-5         # floor for LR decay
    weight_decay: float = 0.1
    warmup_steps: int = 200
    max_steps: int = 10000
    eval_interval: int = 200     # steps between evaluations
    save_interval: int = 500     # steps between checkpoints
    grad_clip: float = 1.0       # gradient-norm clip threshold
    device: str = "auto"         # "auto" presumably resolved elsewhere — confirm in trainer
    seed: int = 42
    data_dir: str = "data"
    output_dir: str = "checkpoints"
|
inference.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GuppyLM inference — simple chat."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import uuid
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from tokenizers import Tokenizer
|
| 9 |
+
|
| 10 |
+
from config import GuppyConfig
|
| 11 |
+
from model import GuppyLM
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class GuppyInference:
    """Chat-completion wrapper around a trained GuppyLM checkpoint.

    Loads the tokenizer and weights, resolving the model config from (in
    priority order): a config.json next to the checkpoint, a config dict
    embedded in the checkpoint, or library defaults.
    """

    def __init__(self, checkpoint_path, tokenizer_path, device="cpu"):
        self.device = torch.device(device)
        self.tokenizer = Tokenizer.from_file(tokenizer_path)

        import os
        # NOTE(review): weights_only=False unpickles arbitrary objects — only
        # load checkpoints from trusted sources.
        ckpt = torch.load(checkpoint_path, map_location=self.device, weights_only=False)

        # Load config.json from same directory as the model file
        config_dir = os.path.dirname(os.path.abspath(checkpoint_path))
        config_path = os.path.join(config_dir, "config.json")

        # Extract state_dict — handle both legacy and standard formats
        if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
            state_dict = ckpt["model_state_dict"]
        else:
            # Legacy format: the checkpoint IS the state_dict.
            state_dict = ckpt

        # Load config — try config.json first, fall back to embedded config
        if os.path.exists(config_path):
            with open(config_path) as f:
                cfg = json.load(f)
            # Support both HF standard keys and our own keys
            self.config = GuppyConfig(
                vocab_size=cfg.get("vocab_size", 4096),
                max_seq_len=cfg.get("max_position_embeddings", cfg.get("max_seq_len", 128)),
                d_model=cfg.get("hidden_size", cfg.get("d_model", 384)),
                n_layers=cfg.get("num_hidden_layers", cfg.get("n_layers", 6)),
                n_heads=cfg.get("num_attention_heads", cfg.get("n_heads", 6)),
                ffn_hidden=cfg.get("intermediate_size", cfg.get("ffn_hidden", 768)),
                dropout=cfg.get("hidden_dropout_prob", cfg.get("dropout", 0.1)),
                pad_id=cfg.get("pad_token_id", cfg.get("pad_id", 0)),
                bos_id=cfg.get("bos_token_id", cfg.get("bos_id", 1)),
                eos_id=cfg.get("eos_token_id", cfg.get("eos_id", 2)),
            )
        elif isinstance(ckpt, dict) and "config" in ckpt:
            # Keep only keys that GuppyConfig actually declares.
            valid_fields = {f.name for f in GuppyConfig.__dataclass_fields__.values()}
            self.config = GuppyConfig(**{k: v for k, v in ckpt["config"].items() if k in valid_fields})
        else:
            print("Warning: No config found, using defaults")
            self.config = GuppyConfig()

        self.model = GuppyLM(self.config).to(self.device)
        # NOTE(review): unexpected checkpoint keys are dropped silently, but
        # keys missing from the checkpoint still raise in load_state_dict
        # (strict mode) — confirm this is the intended failure behavior.
        filtered = {k: v for k, v in state_dict.items() if k in self.model.state_dict()}
        self.model.load_state_dict(filtered)
        self.model.eval()

        total, _ = self.model.param_count()
        print(f"GuppyLM loaded: {total/1e6:.1f}M params")

    def chat_completion(self, messages, temperature=0.7, max_tokens=64,
                        top_k=50, **kwargs):
        """Chat completion — takes messages, returns an OpenAI-style response
        dict: {"choices": [{"message": {"role", "content"}}]}."""
        prompt = self._format_prompt(messages)
        input_ids = self.tokenizer.encode(prompt).ids
        prompt_tokens = len(input_ids)
        input_t = torch.tensor([input_ids], dtype=torch.long, device=self.device)

        output_t, _ = self.model.generate(input_t, max_tokens, temperature, top_k)
        # Decode only the newly generated tokens, not the prompt.
        output_text = self.tokenizer.decode(output_t[0].tolist()[prompt_tokens:])
        # Truncate at first <|im_end|> — don't let the model leak into the next turn
        if "<|im_end|>" in output_text:
            output_text = output_text.split("<|im_end|>")[0]
        # Also strip any <|im_start|> fragments
        if "<|im_start|>" in output_text:
            output_text = output_text.split("<|im_start|>")[0]
        resp_text = output_text.strip()

        return {
            "choices": [{
                "message": {"role": "assistant", "content": resp_text},
            }],
        }

    def _format_prompt(self, messages):
        """Render messages into a ChatML-style prompt ending with an open
        assistant turn for the model to complete."""
        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content") or ""
            if role == "system":
                # System messages are dropped — presumably unseen during
                # training; TODO confirm against the training data format.
                continue
            parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
        parts.append("<|im_start|>assistant\n")
        return "\n".join(parts)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def main():
    """Interactive REPL for chatting with a GuppyLM checkpoint."""
    import argparse

    parser = argparse.ArgumentParser(description="Chat with Guppy")
    parser.add_argument("--checkpoint", default="checkpoints/best_model.pt")
    parser.add_argument("--tokenizer", default="data/tokenizer.json")
    parser.add_argument("--device", default="cpu")
    args = parser.parse_args()

    engine = GuppyInference(args.checkpoint, args.tokenizer, args.device)
    print("\nGuppy Chat (type 'quit' to exit)")

    history = []
    while True:
        user_text = input("\nYou> ").strip()
        if user_text.lower() in ("quit", "exit", "q"):
            break
        history.append({"role": "user", "content": user_text})
        # Send the full conversation so far; keep the assistant reply in history.
        reply = engine.chat_completion(history)["choices"][0]["message"]
        if reply.get("content"):
            print(f"Guppy> {reply['content']}")
        history.append(reply)


if __name__ == "__main__":
    main()
|
model.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GuppyLM — a tiny fish brain.
|
| 3 |
+
|
| 4 |
+
Vanilla transformer: multi-head attention, ReLU FFN, LayerNorm, learned positional embeddings.
|
| 5 |
+
No GQA, no SwiGLU, no parallel residual, no RoPE. As simple as it gets.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import math
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from config import GuppyConfig
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Attention(nn.Module):
    """Multi-head self-attention with a fused QKV projection.

    An optional mask zeroes out attention to positions where mask == 0.
    """

    def __init__(self, config):
        super().__init__()
        self.n_heads = config.n_heads
        self.head_dim = config.d_model // config.n_heads

        self.qkv = nn.Linear(config.d_model, 3 * config.d_model)
        self.out = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x, mask=None):
        batch, seq, width = x.shape

        # One projection, then split into Q/K/V and lay out per head:
        # (batch, n_heads, seq, head_dim).
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        per_head = (batch, seq, self.n_heads, self.head_dim)
        q = q.reshape(per_head).transpose(1, 2)
        k = k.reshape(per_head).transpose(1, 2)
        v = v.reshape(per_head).transpose(1, 2)

        # Scaled dot-product scores; masked positions get -inf before softmax.
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Re-merge heads back into (batch, seq, d_model) and project out.
        context = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, width)
        return self.out(context)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class FFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout."""

    def __init__(self, config):
        super().__init__()
        self.up = nn.Linear(config.d_model, config.ffn_hidden)
        self.down = nn.Linear(config.ffn_hidden, config.d_model)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        hidden = F.relu(self.up(x))
        return self.dropout(self.down(hidden))
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Block(nn.Module):
    """Pre-norm transformer block: attention sublayer, then feed-forward
    sublayer, each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.norm1 = nn.LayerNorm(config.d_model)
        self.attn = Attention(config)
        self.norm2 = nn.LayerNorm(config.d_model)
        self.ffn = FFN(config)

    def forward(self, x, mask=None):
        # LayerNorm is applied before each sublayer (pre-norm residual).
        h = x + self.attn(self.norm1(x), mask)
        return h + self.ffn(self.norm2(h))
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class GuppyLM(nn.Module):
    """Decoder-only transformer language model.

    Learned positional embeddings, causal self-attention blocks, and an
    output head whose weights are tied to the token embedding.
    """

    def __init__(self, config: GuppyConfig):
        super().__init__()
        self.config = config

        self.tok_emb = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_emb = nn.Embedding(config.max_seq_len, config.d_model)
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layers)])
        self.norm = nn.LayerNorm(config.d_model)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.lm_head.weight = self.tok_emb.weight  # tie weights

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """N(0, 0.02) init for Linear/Embedding weights; zeros for biases."""
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model over token ids.

        Args:
            idx: (B, T) long tensor of token ids, T <= config.max_seq_len.
            targets: optional (B, T) long tensor of next-token labels.

        Returns:
            (logits, loss) — logits is (B, T, vocab_size); loss is None
            unless targets are given.
        """
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        x = self.drop(self.tok_emb(idx) + self.pos_emb(pos))
        # Causal mask: position t may only attend to positions <= t.
        mask = torch.tril(torch.ones(T, T, device=idx.device)).unsqueeze(0).unsqueeze(0)

        for block in self.blocks:
            x = block(x, mask)

        logits = self.lm_head(self.norm(x))

        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, self.config.vocab_size),
                targets.view(-1),
                # Fix: ignore padding via the configured pad id rather than a
                # hard-coded 0 (identical for the default config, correct if
                # pad_id is ever remapped).
                ignore_index=self.config.pad_id,
            )

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=64, temperature=0.7, top_k=50, **kwargs):
        """Autoregressively sample up to max_new_tokens, stopping at eos_id.

        Assumes batch size 1 (the .item() EOS check below would fail for
        larger batches) and temperature > 0. Returns (token ids, []) — the
        empty list is kept for interface compatibility with callers.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # Crop the context to the model's positional-embedding window.
            idx_cond = idx[:, -self.config.max_seq_len:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k > 0:
                # Keep only the top-k logits; everything else becomes -inf.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = float("-inf")
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
            if next_id.item() == self.config.eos_id:
                break
        return idx, []

    def param_count(self):
        """Return (total parameter count, 0) — the second element is a
        placeholder kept for interface compatibility."""
        total = sum(p.numel() for p in self.parameters())
        return total, 0

    def param_summary(self):
        """Human-readable one-line parameter summary."""
        total, _ = self.param_count()
        return f"GuppyLM: {total:,} params ({total/1e6:.1f}M)"
|