jvaldi committed on
Commit
65c3c06
·
verified ·
1 Parent(s): 21b6eae

Upload mathstral-nano-sat

Browse files
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ tags:
5
+ - math
6
+ - sat
7
+ - education
8
+ - numpy
9
+ - causal-lm
10
+ - tiny-model
11
+ datasets: []
12
+ ---
13
+
14
+ # mathstral-nano-sat
15
+
16
+ A tiny GPT-style causal language model (236,928 parameters) trained on SAT-level
17
+ math problems. Built entirely in NumPy (plus SciPy's softmax) with no PyTorch dependency. Demonstrates
18
+ the full fine-tuning pipeline: tokenization, causal attention, AdamW, and
19
+ cross-entropy loss on a byte-level vocabulary.
20
+
21
+ ## Model Details
22
+
23
+ | Property | Value |
24
+ |---|---|
25
+ | Architecture | 4-layer causal transformer |
26
+ | Attention heads | 8 |
27
+ | Hidden dim | 64 |
28
+ | FFN dim | 256 |
29
+ | Vocabulary | 256 (byte-level, UTF-8) |
30
+ | Max seq length | 64 |
31
+ | Total parameters | 236,928 |
32
+ | Framework | Pure NumPy + SciPy |
33
+
34
+ ## Training
35
+
36
+ | Property | Value |
37
+ |---|---|
38
+ | Dataset | 20 SAT math Q&A examples |
39
+ | Epochs | 3 |
40
+ | Steps | 300 |
41
+ | Batch size | 8 |
42
+ | Learning rate | 0.0003 (AdamW) |
43
+ | Baseline loss | 5.5158 |
44
+ | Final loss | 2.224 |
45
+ | Loss reduction | **59.7%** |
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install safetensors scipy numpy
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ ```python
56
+ from modeling_mathstral_nano import MathstralNano
57
+
58
+ model = MathstralNano.from_pretrained(".")
59
+ print(model)
60
+ # MathstralNano(4L 8H 64d params=236,928)
61
+
62
+ # Raw generation
63
+ response = model.generate("Problem: If 2x + 5 = 13, find x. Solution:")
64
+ print(response)
65
+
66
+ # Convenience wrapper (formats the prompt automatically)
67
+ response = model.solve("If 3x + 7 = 22, find x.")
68
+ print(response)
69
+ ```
70
+
71
+ ## Limitations
72
+
73
+ This is a proof-of-concept model. At 236,928 parameters trained on 20 examples,
74
+ it learns structural byte patterns in math text but does not produce coherent
75
+ mathematical reasoning. It is intended as a pipeline validation tool, not a
76
+ production math solver.
77
+
78
+ For a capable math model, see the full fine-tuning notebook which applies the
79
+ same pipeline to `mistralai/Mathstral-7B-v0.1` with QLoRA on 50K examples.
80
+
81
+ ## Files
82
+
83
+ | File | Description |
84
+ |---|---|
85
+ | `model.safetensors` | Weights in safetensors format |
86
+ | `config.json` | Architecture config |
87
+ | `modeling_mathstral_nano.py` | Pure-NumPy model class with inference |
88
+ | `training_metadata.json` | Full training run metadata and loss curve |
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MathstralNanoForCausalLM"
4
+ ],
5
+ "model_type": "mathstral_nano",
6
+ "n_layer": 4,
7
+ "n_head": 8,
8
+ "d_model": 64,
9
+ "d_ff": 256,
10
+ "vocab_size": 256,
11
+ "seq_len": 64,
12
+ "tokenizer": "byte-level (UTF-8 bytes 0-255, no special tokens)",
13
+ "total_params": 236928,
14
+ "torch_dtype": "float32",
15
+ "transformers_version": "n/a (custom numpy model)"
16
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e0fcbf8171730908bd75cc6d506f70a15397ca8beace41f4351d3cda4fa9754
3
+ size 951520
modeling_mathstral_nano.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
MathstralNano: tiny 4-layer GPT-style transformer trained on SAT math problems.
Load and run inference entirely in NumPy — no PyTorch required.

Usage:
    from modeling_mathstral_nano import MathstralNano
    model = MathstralNano.from_pretrained(".")  # path to repo folder
    print(model.generate("Problem: If 2x+5=13, find x. Solution:"))
"""

import json
import math
import os

import numpy as np
from scipy.special import softmax as sp_softmax  # pip install scipy


class MathstralNano:
    """Tiny causal transformer for SAT-level math problem solving.

    Weights live in ``self.P`` as a flat dict of NumPy arrays keyed by
    layer name (see :meth:`_forward`); hyperparameters live in ``self.cfg``.
    """

    # Prompt template applied by solve().
    SYSTEM = "Problem: {question} Solution:"

    def __init__(self, params: dict, config: dict):
        self.P = params    # flat name -> np.ndarray weight dict
        self.cfg = config  # n_layer, n_head, d_model, seq_len, vocab_size, ...

    # ── class method: load from a local directory ─────────────────────────────
    @classmethod
    def from_pretrained(cls, model_dir: str) -> "MathstralNano":
        """Load weights and config from a local folder or HF repo path.

        Raises:
            ImportError: if ``safetensors`` is not installed (original
                import error is chained for debuggability).
        """
        try:
            from safetensors.numpy import load_file
        except ImportError as err:
            # Chain the underlying error so the real failure stays visible.
            raise ImportError("pip install safetensors") from err

        params = load_file(os.path.join(model_dir, "model.safetensors"))
        with open(os.path.join(model_dir, "config.json")) as f:
            config = json.load(f)

        return cls(params, config)

    # ── internal helpers ──────────────────────────────────────────────────────
    @staticmethod
    def _encode(text: str, length: int) -> list:
        """UTF-8 byte-encode *text*, truncated then zero-padded to *length*."""
        ids = list(text.encode("utf-8", errors="replace"))[:length]
        ids += [0] * (length - len(ids))
        return ids

    @staticmethod
    def _layer_norm(x, w, b, eps=1e-5):
        """LayerNorm over the last axis.

        NOTE: eps is added to the *std* (not the variance) — keep as-is,
        since the checkpoint was trained with this exact formulation.
        """
        mu = x.mean(-1, keepdims=True)
        std = x.std(-1, keepdims=True)
        return w * (x - mu) / (std + eps) + b

    @staticmethod
    def _gelu(x):
        """tanh-approximation GELU (GPT-2 style)."""
        c = math.sqrt(2 / math.pi)
        return 0.5 * x * (1 + np.tanh(c * (x + 0.044715 * x ** 3)))

    def _forward(self, x_ids: np.ndarray):
        """x_ids (B, T) -> logits (B, T, vocab_size). Post-LN not used; each
        sublayer reads a LayerNorm'd copy and adds back residually."""
        P = self.P
        B, T = x_ids.shape
        D = self.cfg["d_model"]
        H = self.cfg["n_head"]
        DH = D // H  # per-head dimension

        x = P["tok_emb"][x_ids] + P["pos_emb"][:T]
        # Additive causal mask: -1e9 above the diagonal hides future tokens.
        causal = np.triu(np.full((T, T), -1e9), k=1)

        for i in range(self.cfg["n_layer"]):
            n = f"L{i}"
            # Attention sublayer.
            x_ln = self._layer_norm(x, P[f"{n}_ln1_w"], P[f"{n}_ln1_b"])
            qkv = x_ln @ P[f"{n}_qkv"] + P[f"{n}_qkv_b"]
            q, k, v = np.split(qkv, 3, axis=-1)
            # (B, T, D) -> (B, H, T, DH) for per-head attention.
            q = q.reshape(B, T, H, DH).transpose(0, 2, 1, 3)
            k = k.reshape(B, T, H, DH).transpose(0, 2, 1, 3)
            v = v.reshape(B, T, H, DH).transpose(0, 2, 1, 3)
            sc = q @ k.transpose(0, 1, 3, 2) / math.sqrt(DH) + causal
            attn = sp_softmax(sc, axis=-1)
            ctx = (attn @ v).transpose(0, 2, 1, 3).reshape(B, T, D)
            x = x + ctx @ P[f"{n}_proj"] + P[f"{n}_proj_b"]
            # Feed-forward sublayer.
            x_ln2 = self._layer_norm(x, P[f"{n}_ln2_w"], P[f"{n}_ln2_b"])
            h1 = self._gelu(x_ln2 @ P[f"{n}_fc1"] + P[f"{n}_fc1_b"])
            x = x + h1 @ P[f"{n}_fc2"] + P[f"{n}_fc2_b"]

        x_out = self._layer_norm(x, P["ln_f_w"], P["ln_f_b"])
        return x_out @ P["head"]

    # ── public inference API ──────────────────────────────────────────────────
    def generate(
        self,
        prompt: str,
        max_new: int = 80,
        temperature: float = 0.8,
        seed: "int | None" = None,
    ) -> str:
        """Sample up to *max_new* byte tokens continuing *prompt*.

        The context is a fixed-size sliding window of ``seq_len`` ids;
        short prompts are right-padded with NUL bytes (presumably matching
        training-time padding — confirm against the training script).
        Pass *seed* for reproducible sampling.
        """
        rng = np.random.default_rng(seed)
        ids = self._encode(prompt, self.cfg["seq_len"])
        out = []
        for _ in range(max_new):
            logits = self._forward(np.array([ids]))
            last = logits[0, -1, :].astype(np.float64)
            # Subtract the max for numerical stability; clamp temperature > 0.
            last = (last - last.max()) / max(temperature, 1e-6)
            probs = np.exp(last) / np.exp(last).sum()
            tok = int(rng.choice(self.cfg["vocab_size"], p=probs))
            out.append(tok)
            ids = ids[1:] + [tok]  # slide the window left by one position
        return bytes(out).decode("utf-8", errors="replace")

    def solve(self, question: str, **kwargs) -> str:
        """Convenience wrapper: formats the SAT-style prompt automatically."""
        prompt = self.SYSTEM.format(question=question.strip())
        return prompt + self.generate(prompt, **kwargs)

    def __repr__(self):
        c = self.cfg
        return (
            f"MathstralNano("
            f"{c['n_layer']}L {c['n_head']}H {c['d_model']}d "
            f"params={c['total_params']:,})"
        )
training_metadata.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "n_layer": 4,
4
+ "n_head": 8,
5
+ "d_model": 64,
6
+ "d_ff": 256,
7
+ "vocab": 256,
8
+ "seq_len": 64,
9
+ "total_params": 236928
10
+ },
11
+ "training": {
12
+ "epochs": 3,
13
+ "steps": 300,
14
+ "batch": 8,
15
+ "lr": 0.0003,
16
+ "baseline_loss": 5.5158,
17
+ "final_loss": 2.224,
18
+ "loss_reduction_pct": 59.7
19
+ },
20
+ "loss_curve": [
21
+ {
22
+ "step": 1,
23
+ "loss": 5.5221
24
+ },
25
+ {
26
+ "step": 20,
27
+ "loss": 5.3729
28
+ },
29
+ {
30
+ "step": 40,
31
+ "loss": 5.0373
32
+ },
33
+ {
34
+ "step": 60,
35
+ "loss": 4.6481
36
+ },
37
+ {
38
+ "step": 80,
39
+ "loss": 4.2298
40
+ },
41
+ {
42
+ "step": 100,
43
+ "loss": 3.8664
44
+ },
45
+ {
46
+ "step": 120,
47
+ "loss": 3.5706
48
+ },
49
+ {
50
+ "step": 140,
51
+ "loss": 3.3363
52
+ },
53
+ {
54
+ "step": 160,
55
+ "loss": 3.1259
56
+ },
57
+ {
58
+ "step": 180,
59
+ "loss": 2.9435
60
+ },
61
+ {
62
+ "step": 200,
63
+ "loss": 2.7872
64
+ },
65
+ {
66
+ "step": 220,
67
+ "loss": 2.6412
68
+ },
69
+ {
70
+ "step": 240,
71
+ "loss": 2.5028
72
+ },
73
+ {
74
+ "step": 260,
75
+ "loss": 2.3663
76
+ },
77
+ {
78
+ "step": 280,
79
+ "loss": 2.2759
80
+ },
81
+ {
82
+ "step": 300,
83
+ "loss": 2.224
84
+ }
85
+ ]
86
+ }
upload_to_hub.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Upload mathstral-nano-sat to HuggingFace Hub.

Usage:
    pip install huggingface_hub
    python upload_to_hub.py --repo your-username/mathstral-nano-sat --token hf_...

If --token is omitted, the HF_TOKEN environment variable (or a cached
`huggingface-cli login` session) is used instead.
"""
import argparse
import os

from huggingface_hub import HfApi, login

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--repo", required=True,
                    help="HF repo id, e.g. alice/mathstral-nano-sat")
# NOTE(security): a token on the command line leaks into shell history and
# `ps` output; prefer exporting HF_TOKEN instead.
parser.add_argument("--token", default=os.environ.get("HF_TOKEN"),
                    help="HuggingFace write token (default: $HF_TOKEN)")
parser.add_argument("--private", action="store_true", help="Make repo private")
args = parser.parse_args()

# Only log in explicitly when a token was supplied; otherwise rely on any
# cached credentials from `huggingface-cli login`.
if args.token:
    login(token=args.token)
api = HfApi()

api.create_repo(repo_id=args.repo, exist_ok=True, private=args.private)

# Upload everything in this script's folder, except the uploader itself.
here = os.path.dirname(os.path.abspath(__file__))
api.upload_folder(
    folder_path=here,
    repo_id=args.repo,
    repo_type="model",
    ignore_patterns=["upload_to_hub.py"],
    commit_message="Upload mathstral-nano-sat (NumPy fine-tuned SAT math model)",
)
print(f"\nUploaded to: https://huggingface.co/{args.repo}")