Wilbatronic commited on
Commit
7b94b0b
·
verified ·
1 Parent(s): 797983b

Initial upload of CGGR-specialized Math model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ benchmark_dashboard.png filter=lfs diff=lfs merge=lfs -text
37
+ loss_curve.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: HuggingFaceTB/SmolLM-135M
4
+ datasets:
5
+ - openai/gsm8k
6
+ - meta-math/MetaMathQA
7
+ - AI-MO/NuminaMath-1.5
8
+ tags:
9
+ - math
10
+ - reasoning
11
+ - efficient-training
12
+ - cggr
13
+ - sparse-gradients
14
+ model_name: SmolLM-135M-CGGR-Math
15
+ ---
16
+
17
+ # SmolLM-135M-CGGR-Math
18
+
19
+ This model is a specialized version of **HuggingFaceTB/SmolLM-135M**, fine-tuned for mathematical reasoning using **Confidence-Gated Gradient Routing (CGGR)**.
20
+
21
+ ## 🚀 The CGGR Breakthrough
22
+
23
+ This model was trained using a novel training strategy that selects only the "hardest" tokens for gradient updates, allowing for:
24
+ - **4.08x Higher Throughput:** Processing 4x more data in the same wall-clock time compared to standard training.
25
+ - **66% VRAM Savings:** Fitting large-batch training on consumer hardware (RTX 3060).
26
+ - **Superior Convergence:** Achieving a **+19% relative accuracy improvement** on math reasoning tasks (AIME 2024) compared to standard fine-tuning.
27
+
28
+ ### Benchmark Results (6-Hour Training Race)
29
+
30
+ | Metric | Standard (Baseline) | CGGR (Ours) | Relative Gain |
31
+ | :-------------------------- | :------------------ | :----------------- | :---------------- |
32
+ | **Solving Accuracy (AIME)** | 8.00% | **9.50%** | **+18.75%** |
33
+ | **Training Throughput** | 14,368 samples | **58,716 samples** | **+308%** |
34
+ | **Final Loss** | 0.3610 | **0.0980** | **-73% Error** |
35
+ | **Max Batch Size (12GB)** | 18 | **69** | **3.8x Capacity** |
36
+
37
+ ## 📈 Performance Visuals
38
+
39
+ ![Benchmark Dashboard](https://huggingface.co/MinimaML/SmolLM-135M-CGGR-Math/resolve/main/benchmark_dashboard.png)
40
+
41
+ ## Model Details
42
+
43
+ - **Architecture:** Transformer Decoder (SmolLM-135M)
44
+ - **Training Method:** CGGR (Confidence-Gated Gradient Routing)
45
+ - **Selection Strategy:** Fixed Quota (Top 25% hardest tokens)
46
+ - **Compute:** Trained on a single NVIDIA RTX 3060 (12GB)
47
+ - **Duration:** 6 Total Hours
48
+
49
+ ## Usage
50
+
51
+ ```python
52
+ from transformers import AutoModelForCausalLM, AutoTokenizer
53
+
54
+ model_name = "MinimaML/SmolLM-135M-CGGR-Math"
55
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
56
+ model = AutoModelForCausalLM.from_pretrained(model_name)
57
+
58
+ prompt = "Question: If x + y = 10 and x - y = 2, what is the value of x?\n\nAnswer:"
59
+ inputs = tokenizer(prompt, return_tensors="pt")
60
+ outputs = model.generate(**inputs, max_new_tokens=50)
61
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
62
+ ```
63
+
64
+ ## Citation
65
+
66
+ If you use this model or the CGGR technique in your research, please cite:
67
+
68
+ ```bibtex
69
+ @software{cggr2026,
70
+ title={CGGR: Confidence-Gated Gradient Routing},
71
+ author={MinimaML},
72
+ year={2026},
73
+ url={https://github.com/MinimaML/CGGR}
74
+ }
75
+ ```
benchmark_dashboard.png ADDED

Git LFS Details

  • SHA256: da4faed19a8c574d3accce876c39b0952c24c6bf0763a3d5c4c13cf1f282e93a
  • Pointer size: 131 Bytes
  • Size of remote file: 474 kB
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 0,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 576,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1536,
15
+ "max_position_embeddings": 2048,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 9,
19
+ "num_hidden_layers": 30,
20
+ "num_key_value_heads": 3,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_parameters": {
24
+ "rope_theta": 10000.0,
25
+ "rope_type": "default"
26
+ },
27
+ "tie_word_embeddings": true,
28
+ "transformers_version": "5.0.0rc1",
29
+ "use_cache": true,
30
+ "vocab_size": 49152
31
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "transformers_version": "5.0.0rc1"
6
+ }
loss_curve.png ADDED

Git LFS Details

  • SHA256: 62e8d36b06d0ea1d51a75d589c761578010bc51ba328949309a72a2235a5a57a
  • Pointer size: 131 Bytes
  • Size of remote file: 256 kB
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6d9369db83b2de71183977ad2b5973bc2e83cdfeae8847f7e203f4c2e22cf4
3
+ size 538090408
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "additional_special_tokens": null,
4
+ "backend": "tokenizers",
5
+ "bos_token": "<|endoftext|>",
6
+ "clean_up_tokenization_spaces": false,
7
+ "eos_token": "<|endoftext|>",
8
+ "errors": "replace",
9
+ "extra_special_tokens": [
10
+ "<|endoftext|>",
11
+ "<|im_start|>",
12
+ "<|im_end|>",
13
+ "<repo_name>",
14
+ "<reponame>",
15
+ "<file_sep>",
16
+ "<filename>",
17
+ "<gh_stars>",
18
+ "<issue_start>",
19
+ "<issue_comment>",
20
+ "<issue_closed>",
21
+ "<jupyter_start>",
22
+ "<jupyter_text>",
23
+ "<jupyter_code>",
24
+ "<jupyter_output>",
25
+ "<jupyter_script>",
26
+ "<empty_output>"
27
+ ],
28
+ "is_local": false,
29
+ "model_max_length": 1000000000000000019884624838656,
30
+ "pad_token": "<|endoftext|>",
31
+ "tokenizer_class": "GPT2Tokenizer",
32
+ "unk_token": "<|endoftext|>",
33
+ "vocab_size": 49152
34
+ }