Upload mathstral-nano-sat

Browse files

Files changed (6) hide show

README.md +88 -0
config.json +16 -0
model.safetensors +3 -0
modeling_mathstral_nano.py +124 -0
training_metadata.json +86 -0
upload_to_hub.py +30 -0

README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+---
+language: en
+license: apache-2.0
+tags:
+  - math
+  - sat
+  - education
+  - numpy
+  - causal-lm
+  - tiny-model
+datasets: []
+---
+# mathstral-nano-sat
+A tiny GPT-style causal language model (236,928 parameters) trained on SAT-level
+math problems. Built in NumPy (plus SciPy for softmax and `safetensors` for weight
+loading) with no PyTorch dependency. Demonstrates the full fine-tuning pipeline:
+tokenization, causal attention, AdamW, and cross-entropy loss on a byte-level vocabulary.
+## Model Details
+| Property | Value |
+|---|---|
+| Architecture | 4-layer causal transformer |
+| Attention heads | 8 |
+| Hidden dim | 64 |
+| FFN dim | 256 |
+| Vocabulary | 256 (byte-level, UTF-8) |
+| Max seq length | 64 |
+| Total parameters | 236,928 |
+| Framework | Pure NumPy + SciPy |
+## Training
+| Property | Value |
+|---|---|
+| Dataset | 20 SAT math Q&A examples |
+| Epochs | 3 |
+| Steps | 300 |
+| Batch size | 8 |
+| Learning rate | 0.0003 (AdamW) |
+| Baseline loss | 5.5158 |
+| Final loss | 2.224 |
+| Loss reduction | **59.7%** |
+## Install
+```bash
+pip install safetensors scipy numpy
+```
+## Usage
+```python
+from modeling_mathstral_nano import MathstralNano
+model = MathstralNano.from_pretrained(".")
+print(model)
+# MathstralNano(4L 8H 64d  params=236,928)
+# Raw generation
+response = model.generate("Problem: If 2x + 5 = 13, find x. Solution:")
+print(response)
+# Convenience wrapper (formats the prompt automatically)
+response = model.solve("If 3x + 7 = 22, find x.")
+print(response)
+```
+## Limitations
+This is a proof-of-concept model. At 236,928 parameters trained on 20 examples,
+it learns structural byte patterns in math text but does not produce coherent
+mathematical reasoning. It is intended as a pipeline validation tool, not a
+production math solver.
+For a capable math model, see the full fine-tuning notebook which applies the
+same pipeline to `mistralai/Mathstral-7B-v0.1` with QLoRA on 50K examples.
+## Files
+| File | Description |
+|---|---|
+| `model.safetensors` | Weights in safetensors format |
+| `config.json` | Architecture config |
+| `modeling_mathstral_nano.py` | Pure-NumPy model class with inference |
+| `training_metadata.json` | Full training run metadata and loss curve |

config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "architectures": [
+    "MathstralNanoForCausalLM"
+  ],
+  "model_type": "mathstral_nano",
+  "n_layer": 4,
+  "n_head": 8,
+  "d_model": 64,
+  "d_ff": 256,
+  "vocab_size": 256,
+  "seq_len": 64,
+  "tokenizer": "byte-level (UTF-8 bytes 0-255, no special tokens)",
+  "total_params": 236928,
+  "torch_dtype": "float32",
+  "transformers_version": "n/a (custom numpy model)"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e0fcbf8171730908bd75cc6d506f70a15397ca8beace41f4351d3cda4fa9754
+size 951520

modeling_mathstral_nano.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""
+MathstralNano: tiny 4-layer GPT-style transformer trained on SAT math problems.
+Load and run inference entirely in NumPy — no PyTorch required.
+Usage:
+    from modeling_mathstral_nano import MathstralNano
+    model = MathstralNano.from_pretrained(".")   # path to repo folder
+    print(model.generate("Problem: If 2x+5=13, find x. Solution:"))
+"""
+import numpy as np, math, json, os
+from scipy.special import softmax as sp_softmax   # pip install scipy
+class MathstralNano:
+    """Tiny causal transformer for SAT-level math problem solving."""
+    SYSTEM = "Problem: {question} Solution:"
+    def __init__(self, params: dict, config: dict):
+        self.P   = params
+        self.cfg = config
+    # ── class method: load from a local directory ─────────────────────────────
+    @classmethod
+    def from_pretrained(cls, model_dir: str) -> "MathstralNano":
+        """Load weights and config from a local folder or HF repo path."""
+        try:
+            from safetensors.numpy import load_file
+            params = load_file(os.path.join(model_dir, "model.safetensors"))
+        except ImportError:
+            raise ImportError("pip install safetensors")
+        with open(os.path.join(model_dir, "config.json")) as f:
+            config = json.load(f)
+        return cls(params, config)
+    # ── internal helpers ──────────────────────────────────────────────────────
+    @staticmethod
+    def _encode(text: str, length: int) -> list:
+        ids = list(text.encode("utf-8", errors="replace"))[:length]
+        ids += [0] * (length - len(ids))
+        return ids
+    @staticmethod
+    def _layer_norm(x, w, b, eps=1e-5):
+        mu  = x.mean(-1, keepdims=True)
+        std = x.std(-1, keepdims=True)
+        return w * (x - mu) / (std + eps) + b
+    @staticmethod
+    def _gelu(x):
+        c = math.sqrt(2 / math.pi)
+        return 0.5 * x * (1 + np.tanh(c * (x + 0.044715 * x ** 3)))
+    def _forward(self, x_ids: np.ndarray):
+        """x_ids (B, T) -> logits (B, T, VOCAB)"""
+        P   = self.P
+        B, T = x_ids.shape
+        D   = self.cfg["d_model"]
+        H   = self.cfg["n_head"]
+        DH  = D // H
+        NL  = self.cfg["n_layer"]
+        SEQ = self.cfg["seq_len"]
+        x      = P["tok_emb"][x_ids] + P["pos_emb"][:T]
+        causal = np.triu(np.full((T, T), -1e9), k=1)
+        for i in range(NL):
+            n    = f"L{i}"
+            x_ln = self._layer_norm(x, P[f"{n}_ln1_w"], P[f"{n}_ln1_b"])
+            qkv  = x_ln @ P[f"{n}_qkv"] + P[f"{n}_qkv_b"]
+            Q_mat, K_mat, Val = np.split(qkv, 3, axis=-1)
+            Q_mat = Q_mat.reshape(B, T, H, DH).transpose(0, 2, 1, 3)
+            K_mat = K_mat.reshape(B, T, H, DH).transpose(0, 2, 1, 3)
+            Val   = Val.reshape(B, T, H, DH).transpose(0, 2, 1, 3)
+            sc    = Q_mat @ K_mat.transpose(0, 1, 3, 2) / math.sqrt(DH) + causal
+            attn  = sp_softmax(sc, axis=-1)
+            ctx   = (attn @ Val).transpose(0, 2, 1, 3).reshape(B, T, D)
+            x     = x + ctx @ P[f"{n}_proj"] + P[f"{n}_proj_b"]
+            x_ln2 = self._layer_norm(x, P[f"{n}_ln2_w"], P[f"{n}_ln2_b"])
+            h1    = self._gelu(x_ln2 @ P[f"{n}_fc1"] + P[f"{n}_fc1_b"])
+            x     = x + h1 @ P[f"{n}_fc2"] + P[f"{n}_fc2_b"]
+        x_out  = self._layer_norm(x, P["ln_f_w"], P["ln_f_b"])
+        logits = x_out @ P["head"]
+        return logits
+    # ── public inference API ──────────────────────────────────────────────────
+    def generate(
+        self,
+        prompt:      str,
+        max_new:     int   = 80,
+        temperature: float = 0.8,
+        seed:        int   = None,
+    ) -> str:
+        """Generate a completion for the given prompt string."""
+        rng = np.random.default_rng(seed)
+        SEQ = self.cfg["seq_len"]
+        ids = self._encode(prompt, SEQ)
+        out = []
+        for _ in range(max_new):
+            logits = self._forward(np.array([ids]))
+            last   = logits[0, -1, :].astype(np.float64)
+            last   = (last - last.max()) / max(temperature, 1e-6)
+            probs  = np.exp(last) / np.exp(last).sum()
+            tok    = int(rng.choice(self.cfg["vocab_size"], p=probs))
+            out.append(tok)
+            ids = ids[1:] + [tok]
+        return bytes(out).decode("utf-8", errors="replace")
+    def solve(self, question: str, **kwargs) -> str:
+        """Convenience wrapper: formats the SAT-style prompt automatically."""
+        prompt = self.SYSTEM.format(question=question.strip())
+        return prompt + self.generate(prompt, **kwargs)
+    def __repr__(self):
+        c = self.cfg
+        return (
+            f"MathstralNano("
+            f"{c['n_layer']}L {c['n_head']}H {c['d_model']}d  "
+            f"params={c['total_params']:,})"
+        )

training_metadata.json ADDED Viewed

	@@ -0,0 +1,86 @@

+{
+  "model": {
+    "n_layer": 4,
+    "n_head": 8,
+    "d_model": 64,
+    "d_ff": 256,
+    "vocab": 256,
+    "seq_len": 64,
+    "total_params": 236928
+  },
+  "training": {
+    "epochs": 3,
+    "steps": 300,
+    "batch": 8,
+    "lr": 0.0003,
+    "baseline_loss": 5.5158,
+    "final_loss": 2.224,
+    "loss_reduction_pct": 59.7
+  },
+  "loss_curve": [
+    {
+      "step": 1,
+      "loss": 5.5221
+    },
+    {
+      "step": 20,
+      "loss": 5.3729
+    },
+    {
+      "step": 40,
+      "loss": 5.0373
+    },
+    {
+      "step": 60,
+      "loss": 4.6481
+    },
+    {
+      "step": 80,
+      "loss": 4.2298
+    },
+    {
+      "step": 100,
+      "loss": 3.8664
+    },
+    {
+      "step": 120,
+      "loss": 3.5706
+    },
+    {
+      "step": 140,
+      "loss": 3.3363
+    },
+    {
+      "step": 160,
+      "loss": 3.1259
+    },
+    {
+      "step": 180,
+      "loss": 2.9435
+    },
+    {
+      "step": 200,
+      "loss": 2.7872
+    },
+    {
+      "step": 220,
+      "loss": 2.6412
+    },
+    {
+      "step": 240,
+      "loss": 2.5028
+    },
+    {
+      "step": 260,
+      "loss": 2.3663
+    },
+    {
+      "step": 280,
+      "loss": 2.2759
+    },
+    {
+      "step": 300,
+      "loss": 2.224
+    }
+  ]
+}

upload_to_hub.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""
+Upload mathstral-nano-sat to HuggingFace Hub.
+Usage:
+    pip install huggingface_hub
+    python upload_to_hub.py --repo your-username/mathstral-nano-sat --token hf_...
+"""
+import argparse, os
+from huggingface_hub import HfApi, login
+parser = argparse.ArgumentParser()
+parser.add_argument("--repo",  required=True, help="HF repo id, e.g. alice/mathstral-nano-sat")
+parser.add_argument("--token", required=True, help="HuggingFace write token")
+parser.add_argument("--private", action="store_true", help="Make repo private")
+args = parser.parse_args()
+login(token=args.token)
+api = HfApi()
+api.create_repo(repo_id=args.repo, exist_ok=True, private=args.private)
+here = os.path.dirname(os.path.abspath(__file__))
+api.upload_folder(
+    folder_path    = here,
+    repo_id        = args.repo,
+    repo_type      = "model",
+    ignore_patterns= ["upload_to_hub.py"],
+    commit_message = "Upload mathstral-nano-sat (NumPy fine-tuned SAT math model)",
+)
+print(f"\nUploaded to: https://huggingface.co/{args.repo}")