Update to checkpoint 20000 (best quality, loss 0.758)
Browse files
- README.md +6 -10
- pytorch_model.bin +1 -1
- training_metadata.json +25 -0
README.md
CHANGED
|
@@ -48,9 +48,8 @@ A 53-million parameter GPT model trained from scratch on FineWebEdu educational
|
|
| 48 |
- **Framework:** Apple MLX (training), PyTorch (export)
|
| 49 |
- **Dataset:** FineWebEdu - 10M tokens of educational web content
|
| 50 |
- **Training Hardware:** Apple M2 Pro (16GB unified memory)
|
| 51 |
-
- **Checkpoint:**
|
| 52 |
-
- **Training Method:** Base pretraining
|
| 53 |
-
- **Teacher Model:** GPT-OSS-20B (via Groq API)
|
| 54 |
|
| 55 |
### Architecture Highlights
|
| 56 |
|
|
@@ -72,13 +71,10 @@ Pre-LN provides better training stability and is used in modern transformers (GP
|
|
| 72 |
|
| 73 |
- **Dataset:** FineWebEdu (diverse educational web content)
|
| 74 |
- **Training Tokens:** 10M
|
| 75 |
-
- **
|
| 76 |
-
- **Knowledge Distillation:** 15,000 additional iterations with GPT-OSS-20B as teacher
|
| 77 |
-
- **Total Iterations:** 35,000
|
| 78 |
- **Batch Size:** 12
|
| 79 |
-
- **Learning Rate:** 3e-4 with cosine decay
|
| 80 |
-
- **Final Training Loss:**
|
| 81 |
-
- **Distillation Method:** 50% hard loss (ground truth) + 50% soft loss (teacher)
|
| 82 |
|
| 83 |
### Performance Benchmarks
|
| 84 |
|
|
@@ -95,7 +91,7 @@ Measured on Apple M2 Pro (16GB unified memory):
|
|
| 95 |
| **Generation Latency** | ~0.59s per 100 tokens |
|
| 96 |
| **Activation Memory** | 843 MB (batch=4, seq=512) |
|
| 97 |
|
| 98 |
-
> **Note:**
|
| 99 |
|
| 100 |
## Usage
|
| 101 |
|
|
|
|
| 48 |
- **Framework:** Apple MLX (training), PyTorch (export)
|
| 49 |
- **Dataset:** FineWebEdu - 10M tokens of educational web content
|
| 50 |
- **Training Hardware:** Apple M2 Pro (16GB unified memory)
|
| 51 |
+
- **Checkpoint:** 20000 iterations
|
| 52 |
+
- **Training Method:** Base pretraining from scratch
|
|
|
|
| 53 |
|
| 54 |
### Architecture Highlights
|
| 55 |
|
|
|
|
| 71 |
|
| 72 |
- **Dataset:** FineWebEdu (diverse educational web content)
|
| 73 |
- **Training Tokens:** 10M
|
| 74 |
+
- **Total Iterations:** 20,000
|
|
|
|
|
|
|
| 75 |
- **Batch Size:** 12
|
| 76 |
+
- **Learning Rate:** 3e-4 with cosine decay
|
| 77 |
+
- **Final Training Loss:** 0.7583
|
|
|
|
| 78 |
|
| 79 |
### Performance Benchmarks
|
| 80 |
|
|
|
|
| 91 |
| **Generation Latency** | ~0.59s per 100 tokens |
|
| 92 |
| **Activation Memory** | 843 MB (batch=4, seq=512) |
|
| 93 |
|
| 94 |
+
> **Note:** All benchmarks measured at checkpoint 20000 (this release).
|
| 95 |
|
| 96 |
## Usage
|
| 97 |
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 143190611
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46767573997bf47cf4f151837ff4ca7288b44a09332b2c41a0666fddf3c74cd2
|
| 3 |
size 143190611
|
training_metadata.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "nanoGPT-MLX-53M-FineWebEdu",
|
| 3 |
+
"framework": "MLX",
|
| 4 |
+
"architecture": "Pre-LN Transformer (GPT-2 style)",
|
| 5 |
+
"training": {
|
| 6 |
+
"dataset": "FineWebEdu-10M",
|
| 7 |
+
"iterations": 20000,
|
| 8 |
+
"final_loss": 0.7583,
|
| 9 |
+
"optimizer": "AdamW",
|
| 10 |
+
"learning_rate": 0.0006,
|
| 11 |
+
"batch_size": 16,
|
| 12 |
+
"context_length": 512
|
| 13 |
+
},
|
| 14 |
+
"model_config": {
|
| 15 |
+
"vocab_size": 50257,
|
| 16 |
+
"d_model": 384,
|
| 17 |
+
"n_layers": 8,
|
| 18 |
+
"n_heads": 8,
|
| 19 |
+
"d_ff": 1536,
|
| 20 |
+
"dropout": 0.1
|
| 21 |
+
},
|
| 22 |
+
"parameters": "52.99M",
|
| 23 |
+
"converted_from": "MLX checkpoint_20000.npz",
|
| 24 |
+
"conversion_date": "2025-11-14"
|
| 25 |
+
}
|