Mandeep Sidhu committed on
Commit
b4b069f
·
0 Parent(s):

Initial dropout decay research pipeline

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +5 -0
  2. LICENSE +26 -0
  3. README.md +89 -0
  4. archive/legacy_docs/control_results.md +48 -0
  5. archive/legacy_docs/stage_dropout_search.md +82 -0
  6. archive/runs_legacy_20260525/control-smoke/20260522-141107/config.json +57 -0
  7. archive/runs_legacy_20260525/control-smoke/20260522-141107/metrics.jsonl +30 -0
  8. archive/runs_legacy_20260525/control-smoke/20260522-141107/summary.csv +7 -0
  9. archive/runs_legacy_20260525/control-smoke/20260522-141107/summary.json +62 -0
  10. archive/runs_legacy_20260525/control-smoke/20260522-141107/tokenizer.json +1629 -0
  11. archive/runs_legacy_20260525/controls/20260522-141120/config.json +61 -0
  12. archive/runs_legacy_20260525/controls/20260522-141120/metrics.jsonl +50 -0
  13. archive/runs_legacy_20260525/controls/20260522-141120/summary.csv +11 -0
  14. archive/runs_legacy_20260525/controls/20260522-141120/summary.json +102 -0
  15. archive/runs_legacy_20260525/controls/20260522-141120/tokenizer.json +0 -0
  16. archive/runs_legacy_20260525/publishable/20260522-132351/config.json +60 -0
  17. archive/runs_legacy_20260525/publishable/20260522-132351/metrics.jsonl +35 -0
  18. archive/runs_legacy_20260525/publishable/20260522-132351/summary.csv +8 -0
  19. archive/runs_legacy_20260525/publishable/20260522-132351/summary.json +72 -0
  20. archive/runs_legacy_20260525/publishable/20260522-132351/tokenizer.json +0 -0
  21. archive/runs_legacy_20260525/smoke/20260522-132106/config.json +51 -0
  22. archive/runs_legacy_20260525/smoke/20260522-132106/metrics.jsonl +30 -0
  23. archive/runs_legacy_20260525/smoke/20260522-132106/summary.json +62 -0
  24. archive/runs_legacy_20260525/smoke/20260522-132106/tokenizer.json +1629 -0
  25. archive/runs_legacy_20260525/smoke/20260522-132219/config.json +57 -0
  26. archive/runs_legacy_20260525/smoke/20260522-132219/metrics.jsonl +30 -0
  27. archive/runs_legacy_20260525/smoke/20260522-132219/summary.json +62 -0
  28. archive/runs_legacy_20260525/smoke/20260522-132219/tokenizer.json +1629 -0
  29. archive/runs_legacy_20260525/smoke/20260522-132336/config.json +58 -0
  30. archive/runs_legacy_20260525/smoke/20260522-132336/metrics.jsonl +30 -0
  31. archive/runs_legacy_20260525/smoke/20260522-132336/summary.csv +7 -0
  32. archive/runs_legacy_20260525/smoke/20260522-132336/summary.json +62 -0
  33. archive/runs_legacy_20260525/smoke/20260522-132336/tokenizer.json +1629 -0
  34. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/config.json +68 -0
  35. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/metrics.jsonl +20 -0
  36. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/summary.csv +5 -0
  37. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/summary.json +42 -0
  38. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/tokenizer.json +0 -0
  39. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/config.json +68 -0
  40. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/metrics.jsonl +20 -0
  41. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/summary.csv +5 -0
  42. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/summary.json +42 -0
  43. archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/tokenizer.json +0 -0
  44. archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/config.json +64 -0
  45. archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/metrics.jsonl +4 -0
  46. archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/summary.csv +5 -0
  47. archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/summary.json +42 -0
  48. archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/tokenizer.json +0 -0
  49. archive/runs_legacy_20260525/stage-dropout-single/20260523-075636/config.json +64 -0
  50. archive/runs_legacy_20260525/stage-dropout-single/20260523-075636/metrics.jsonl +4 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .DS_Store
2
+ __pycache__/
3
+ *.py[cod]
4
+ .cache/
5
+ *.npy
LICENSE ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Andrej Karpathy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ Additional project attribution:
24
+ This repository contains research code derived from Andrej Karpathy's
25
+ nanochat project (https://github.com/karpathy/nanochat), which is distributed
26
+ under the MIT License above.
README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dropout Decay Streaming Experiments
2
+
3
+ This project tests dropout decay only after first finding a model/data regime
4
+ where static dropout has a real nonzero validation optimum.
5
+
6
+ The implementation is derived from Andrej Karpathy's `nanochat` repository:
7
+ https://github.com/karpathy/nanochat. Only the core tokenizer ideas and
8
+ foundational causal Transformer architecture are retained. Chat interfaces,
9
+ deployment scripts, distributed training code, and inference services are not
10
+ included. The original nanochat MIT copyright and permission notice are retained
11
+ in derived source files and in `LICENSE`.
12
+
13
+ ## Compliance
14
+
15
+ All Torch experiment runs are MPS-only. The runner exits before model creation if
16
+ MPS is unavailable, if PyTorch was not built with MPS, or if
17
+ `PYTORCH_ENABLE_MPS_FALLBACK=1` is set.
18
+
19
+ ## Workflow
20
+
21
+ 1. Screen candidate model sizes with cheap static dropout sweeps.
22
+ 2. Select candidate models whose validation curve has an interior nonzero
23
+ dropout optimum.
24
+ 3. Confirm the winner with a 3-seed static sweep.
25
+ 4. Lock the model and run static-vs-decay streaming comparisons from scratch.
26
+
27
+ Every run writes:
28
+
29
+ - `config.json`: command, model specs, data paths, environment, attribution.
30
+ - `metrics.jsonl`: one row per seed/model/dropout/stage.
31
+ - `trace.jsonl`: optional training and intermediate evaluation trace.
32
+ - `summary.csv` / `summary.json`: mean/std train loss, validation loss, and gap.
33
+ - `model_selection.csv` / `model_selection.json`: static-sweep optimum and
34
+ plateau diagnostics for screen and confirm runs.
35
+
36
+ Old exploratory outputs are archived under `archive/`.
37
+
38
+ ## Step 1: Cheap Static Screen
39
+
40
+ Use one or two seeds. The output tells us, for each model, where the static
41
+ dropout curve bottoms out and which dropout range is within the configured
42
+ plateau delta.
43
+
44
+ ```bash
45
+ PYTHONPATH=src /Users/mandeepsidhu/Desktop/code/nanochat/.venv/bin/python scripts/run_experiments.py \
46
+ --mode screen_static \
47
+ --corpus-glob "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet" \
48
+ --models 8x8x256 12x8x384 16x8x384 \
49
+ --seeds 1 2 \
50
+ --token-limits 5000000 \
51
+ --dropout-rates 0.0 0.02 0.05 0.08 0.10 0.14 0.20 0.30 0.50 \
52
+ --steps 2000 \
53
+ --eval-batches 64
54
+ ```
55
+
56
+ ## Step 2: Confirm Winner
57
+
58
+ After selecting a promising model, rerun the static dropout curve with exactly
59
+ three seeds.
60
+
61
+ ```bash
62
+ PYTHONPATH=src /Users/mandeepsidhu/Desktop/code/nanochat/.venv/bin/python scripts/run_experiments.py \
63
+ --mode confirm_static \
64
+ --corpus-glob "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet" \
65
+ --models winner=12x8x384 \
66
+ --seeds 1 2 3 \
67
+ --token-limits 5000000 \
68
+ --dropout-rates 0.0 0.02 0.05 0.08 0.10 0.14 0.20 0.30 0.50 \
69
+ --steps 2000 \
70
+ --eval-batches 64
71
+ ```
72
+
73
+ ## Step 3: Locked Streaming Comparison
74
+
75
+ Only after the model is locked, compare static dropout and decay schedules from
76
+ fresh initialization.
77
+
78
+ ```bash
79
+ PYTHONPATH=src /Users/mandeepsidhu/Desktop/code/nanochat/.venv/bin/python scripts/run_experiments.py \
80
+ --mode locked_stream \
81
+ --corpus-glob "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet" \
82
+ --models winner=12x8x384 \
83
+ --seeds 1 2 3 \
84
+ --stream-token-caps 5000000 10000000 20000000 40000000 \
85
+ --dropout-rates 0.0 0.10 0.14 0.20 \
86
+ --decays decay_030_to_014:0.30:0.14:cosine decay_020_to_010:0.20:0.10:cosine \
87
+ --stage-steps 1000 \
88
+ --eval-batches 64
89
+ ```
archive/legacy_docs/control_results.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Control Results
2
+
3
+ These controls were run after the initial core experiment to isolate whether
4
+ dropout decay caused the streaming improvement.
5
+
6
+ Both runs used CLIMB-mix parquet data from the local nanochat cache, MPS-only
7
+ Torch execution, five seeds, a 4096-token BPE vocabulary, 500k validation tokens,
8
+ and the same 8.39M-parameter large model.
9
+
10
+ ## Run Artifacts
11
+
12
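+ - Core run: `runs/publishable/20260522-132351` (archived at `archive/runs_legacy_20260525/publishable/20260522-132351`)
13
+ - Control run: `runs/controls/20260522-141120` (archived at `archive/runs_legacy_20260525/controls/20260522-141120`)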
14
+
15
+ ## Matched Streaming Comparison
16
+
17
+ | Condition | 5M | 10M | 20M | 40M |
18
+ | --- | ---: | ---: | ---: | ---: |
19
+ | Dropout decay streaming | 6.9213 +/- 0.0434 | 6.2689 +/- 0.0539 | 5.4262 +/- 0.0722 | 4.9090 +/- 0.0452 |
20
+ | Static 0.1 dropout streaming | 5.6310 +/- 0.0438 | 5.1018 +/- 0.0306 | 4.8497 +/- 0.0338 | 4.6743 +/- 0.0449 |
21
+ | Static 0.8 dropout streaming | 6.9898 +/- 0.0505 | 6.7637 +/- 0.0762 | 6.4835 +/- 0.0408 | 6.2390 +/- 0.0543 |
22
+
23
+ ## Other Controls
24
+
25
+ | Condition | Token limit | Mean eval loss | Std |
26
+ | --- | ---: | ---: | ---: |
27
+ | Large static baseline, fixed initial data | 5M | 5.2036 | 0.0258 |
28
+ | Large dropout decay, fixed initial data | 5M | 6.7389 | 0.0484 |
29
+ | Large high static dropout, fixed initial data | 5M | 6.8355 | 0.0613 |
30
+ | Large static full data from start | 40M | 4.6915 | 0.0430 |
31
+
32
+ ## Conclusion
33
+
34
+ The original strong hypothesis is falsified in this configuration. A high-capacity
35
+ model does absorb the expanding data stream, but dropout decay from 0.8 to 0.1 is
36
+ not the causal reason for the improvement. The matched static 0.1 dropout
37
+ streaming control beats dropout decay at every token cap, and high static dropout
38
+ performs poorly.
39
+
40
+ The narrower supported claim is:
41
+
42
+ > An 8.39M-parameter causal Transformer can continuously improve as the simulated
43
+ > stream grows from 5M to 40M tokens, but for this setup static 0.1 dropout is
44
+ > stronger than a high-initial-dropout decay schedule.
45
+
46
+ Extra tuning could search for a better schedule, such as a lower initial dropout
47
+ or a faster decay horizon, but that would be a new hypothesis rather than support
48
+ for the original one.
archive/legacy_docs/stage_dropout_search.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stage Dropout Search
2
+
3
+ This search tested whether the original failure came from the `0.8` initial
4
+ dropout rather than from the decay idea itself.
5
+
6
+ All runs used the same CLIMB-mix corpus, 8.39M-parameter large model, 5M/10M/20M/
7
+ 40M stream caps, MPS-only execution, and 64 validation batches.
8
+
9
+ ## Single-Seed Probe
10
+
11
+ Seed: 1.
12
+
13
+ | Condition | 5M | 10M | 20M | 40M |
14
+ | --- | ---: | ---: | ---: | ---: |
15
+ | Static 0.1 dropout | 5.5670 | 5.0850 | 4.8676 | 4.7417 |
16
+ | Old 0.8 cosine decay | 6.9460 | 6.2013 | 5.3635 | 4.9677 |
17
+ | Mild: 0.30 / 0.22 / 0.14 / 0.10 | 5.7511 | 5.1962 | 4.9055 | 4.7524 |
18
+ | Medium: 0.50 / 0.32 / 0.16 / 0.10 | 6.1544 | 5.3261 | 4.9566 | 4.8095 |
19
+ | Fast: 0.50 / 0.10 / 0.10 / 0.10 | 6.1544 | 5.1626 | 4.8824 | 4.7583 |
20
+
21
+ The medium schedule was clearly weaker. Mild and fast were close enough to static
22
+ 0.1 at 40M to justify multi-seed runs.
23
+
24
+ ## Multi-Seed Follow-Up
25
+
26
+ Seeds: 1, 2, 3, 4, 5.
27
+
28
+ | Condition | 5M | 10M | 20M | 40M |
29
+ | --- | ---: | ---: | ---: | ---: |
30
+ | Static 0.0 dropout | 5.5694 +/- 0.0465 | 5.0527 +/- 0.0596 | 4.7800 +/- 0.0423 | 4.5998 +/- 0.0391 |
31
+ | Static 0.01 dropout | 5.5807 +/- 0.0503 | 5.0627 +/- 0.0630 | 4.7904 +/- 0.0405 | 4.6152 +/- 0.0525 |
32
+ | Static 0.02 dropout | 5.5873 +/- 0.0436 | 5.0575 +/- 0.0396 | 4.8009 +/- 0.0406 | 4.6192 +/- 0.0455 |
33
+ | Static 0.03 dropout | 5.5920 +/- 0.0436 | 5.0734 +/- 0.0505 | 4.8110 +/- 0.0410 | 4.6281 +/- 0.0448 |
34
+ | Static 0.04 dropout | 5.5964 +/- 0.0435 | 5.0752 +/- 0.0490 | 4.8165 +/- 0.0444 | 4.6416 +/- 0.0475 |
35
+ | Static 0.06 dropout | 5.6075 +/- 0.0476 | 5.0797 +/- 0.0394 | 4.8324 +/- 0.0399 | 4.6499 +/- 0.0430 |
36
+ | Static 0.08 dropout | 5.6146 +/- 0.0425 | 5.0849 +/- 0.0363 | 4.8358 +/- 0.0350 | 4.6541 +/- 0.0387 |
37
+ | Static 0.1 dropout | 5.6310 +/- 0.0438 | 5.1018 +/- 0.0306 | 4.8497 +/- 0.0338 | 4.6743 +/- 0.0449 |
38
+ | Static 0.14 dropout | 5.6544 +/- 0.0361 | 5.1239 +/- 0.0318 | 4.8742 +/- 0.0323 | 4.7003 +/- 0.0365 |
39
+ | Static 0.20 dropout | 5.6960 +/- 0.0453 | 5.1722 +/- 0.0320 | 4.9164 +/- 0.0286 | 4.7492 +/- 0.0379 |
40
+ | Sub-0.1: 0.10 / 0.08 / 0.06 / 0.05 | 5.6310 +/- 0.0438 | 5.0922 +/- 0.0312 | 4.8311 +/- 0.0352 | 4.6496 +/- 0.0499 |
41
+ | Sub-0.08: 0.08 / 0.06 / 0.04 / 0.03 | 5.6146 +/- 0.0425 | 5.0768 +/- 0.0386 | 4.8163 +/- 0.0372 | 4.6322 +/- 0.0405 |
42
+ | Mild: 0.30 / 0.22 / 0.14 / 0.10 | 5.8129 +/- 0.0528 | 5.2047 +/- 0.0351 | 4.8991 +/- 0.0294 | 4.7034 +/- 0.0415 |
43
+ | Fast: 0.50 / 0.10 / 0.10 / 0.10 | 6.2006 +/- 0.0451 | 5.1861 +/- 0.0352 | 4.8875 +/- 0.0350 | 4.7025 +/- 0.0414 |
44
+ | Old 0.8 cosine decay | 6.9213 +/- 0.0434 | 6.2689 +/- 0.0539 | 5.4262 +/- 0.0722 | 4.9090 +/- 0.0452 |
45
+
46
+ ## Conclusion
47
+
48
+ The original initial dropout was too large, and the old schedule kept dropout too
49
+ large for too long. Lower and faster schedules recover most of the lost
50
+ performance, which supports the diagnosis that `0.8` dropout was the main
51
+ problem.
52
+
53
+ However, the mild and fast decay schedules still do not beat static `0.1` dropout.
54
+ The better of the two at 40M was fast decay at `4.7025 +/- 0.0414`, compared
55
+ with static `0.1` at `4.6743 +/- 0.0449`. The gap is small, but not favorable.
56
+
57
+ The sub-0.1 decay schedules improved on static `0.1`, but the
58
+ static sub-0.1 plateau sweep changed the interpretation. Static dropout kept
59
+ improving as dropout was lowered across the tested grid, and the best tested
60
+ condition is now static `0.0`, not a decay schedule.
61
+
62
+ Updated supported claim:
63
+
64
+ > High initial dropout is harmful in this setup. Static dropout above 0.1 gets
65
+ > monotonically worse in the tested sweeps. Sub-0.1 decay improves over static
66
+ > 0.1, but static sub-0.1 controls show that this improvement is likely caused
67
+ > by using lower dropout values, not by decay itself. On the current 8.39M
68
+ > parameter model and CLIMB-mix stream, the static-dropout plateau was not found
69
+ > above zero.
70
+
71
+ Run artifacts:
72
+
73
+ - Single mild: `runs/stage-dropout-single/20260523-075308`
74
+ - Single medium: `runs/stage-dropout-single/20260523-075636`
75
+ - Single fast: `runs/stage-dropout-single/20260523-080008`
76
+ - Multi mild: `runs/stage-dropout-multiseed/20260523-080428`
77
+ - Multi fast: `runs/stage-dropout-multiseed/20260523-082216`
78
+ - Static 0.14: `runs/static-dropout-sweep/20260523-084122`
79
+ - Static 0.20: `runs/static-dropout-sweep/20260523-085817`
80
+ - Static sub-0.1 sweep: `runs/static-sub01-sweep/20260525-074602`
81
+ - Sub-0.1: `runs/sub01-dropout-decay/20260525-070308`
82
+ - Sub-0.08: `runs/sub01-dropout-decay/20260525-071923`
archive/runs_legacy_20260525/control-smoke/20260522-141107/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": "/Users/mandeepsidhu/Desktop/code/transformer/data/tinystories_train.txt",
4
+ "corpus_glob": null,
5
+ "text_column": "text",
6
+ "output_dir": "runs/control-smoke",
7
+ "suite": "controls",
8
+ "seeds": [
9
+ 1,
10
+ 2,
11
+ 3,
12
+ 4,
13
+ 5
14
+ ],
15
+ "initial_tokens": 20000,
16
+ "stream_token_caps": [
17
+ 20000,
18
+ 40000
19
+ ],
20
+ "val_tokens": 500000,
21
+ "allow_short_corpus": true,
22
+ "force_retokenize": false,
23
+ "vocab_size": 512,
24
+ "tokenizer_train_chars": 200000,
25
+ "block_size": 32,
26
+ "batch_size": 4,
27
+ "small_layers": 4,
28
+ "small_heads": 4,
29
+ "small_embd": 128,
30
+ "large_layers": 8,
31
+ "large_heads": 8,
32
+ "large_embd": 256,
33
+ "steps_per_run": 1,
34
+ "stream_steps_per_stage": 1,
35
+ "eval_batches": 1,
36
+ "log_every": 0,
37
+ "lr": 0.0003,
38
+ "weight_decay": 0.1,
39
+ "grad_clip": 1.0,
40
+ "baseline_dropout": 0.1,
41
+ "high_dropout": 0.8,
42
+ "dropout_decay_tokens": null,
43
+ "dropout_schedule": "cosine"
44
+ },
45
+ "device": "mps",
46
+ "torch": "2.9.1",
47
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
48
+ "tokenizer_path": "runs/control-smoke/20260522-141107/tokenizer.json",
49
+ "encoded_path": "runs/control-smoke/20260522-141107/tokens-v512-uint16.npy",
50
+ "train_tokens": 777983,
51
+ "val_tokens": 86442,
52
+ "effective_initial_tokens": 20000,
53
+ "effective_stream_token_caps": [
54
+ 20000,
55
+ 40000
56
+ ]
57
+ }
archive/runs_legacy_20260525/control-smoke/20260522-141107/metrics.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 1, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.2400970458984375, "eval_loss": 6.237194538116455, "elapsed_sec": 0.12676668167114258, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
2
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 1, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.240777015686035, "eval_loss": 6.2333245277404785, "elapsed_sec": 0.0425410270690918, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
3
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 1, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.22697639465332, "eval_loss": 6.224180698394775, "elapsed_sec": 0.03090190887451172, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
4
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 1, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.2400970458984375, "eval_loss": 6.237194538116455, "elapsed_sec": 0.03940081596374512, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
5
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 1, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.8, "train_loss_last": 6.236742973327637, "eval_loss": 6.234339714050293, "elapsed_sec": 0.029629945755004883, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
6
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 1, "stage": null, "token_limit": 40000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.221075057983398, "eval_loss": 6.228199481964111, "elapsed_sec": 0.06530499458312988, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
7
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 2, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.236819267272949, "eval_loss": 6.234230995178223, "elapsed_sec": 0.03899407386779785, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
8
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 2, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.238691329956055, "eval_loss": 6.229793548583984, "elapsed_sec": 0.03891491889953613, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
9
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 2, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.2327423095703125, "eval_loss": 6.218461513519287, "elapsed_sec": 0.029666900634765625, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
10
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 2, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.236819267272949, "eval_loss": 6.234230995178223, "elapsed_sec": 0.03844118118286133, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
11
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 2, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.8, "train_loss_last": 6.236791133880615, "eval_loss": 6.230033874511719, "elapsed_sec": 0.030471086502075195, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
12
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 2, "stage": null, "token_limit": 40000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.2310051918029785, "eval_loss": 6.225197792053223, "elapsed_sec": 0.06515097618103027, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
13
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 3, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.238975524902344, "eval_loss": 6.2339982986450195, "elapsed_sec": 0.03981590270996094, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
14
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 3, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.237975120544434, "eval_loss": 6.229854583740234, "elapsed_sec": 0.04071402549743652, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
15
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 3, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.23427677154541, "eval_loss": 6.21778678894043, "elapsed_sec": 0.03224611282348633, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
16
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 3, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.238975524902344, "eval_loss": 6.2339982986450195, "elapsed_sec": 0.040772199630737305, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
17
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 3, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.8, "train_loss_last": 6.238644599914551, "eval_loss": 6.229787826538086, "elapsed_sec": 0.030426025390625, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
18
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 3, "stage": null, "token_limit": 40000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.2319865226745605, "eval_loss": 6.2150068283081055, "elapsed_sec": 0.06477165222167969, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
19
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 4, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.239019393920898, "eval_loss": 6.237016677856445, "elapsed_sec": 0.03902721405029297, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
20
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 4, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.239620208740234, "eval_loss": 6.232203960418701, "elapsed_sec": 0.03930306434631348, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
21
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 4, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.233333110809326, "eval_loss": 6.221055507659912, "elapsed_sec": 0.030991077423095703, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
22
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 4, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.239019393920898, "eval_loss": 6.237016677856445, "elapsed_sec": 0.0409388542175293, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
23
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 4, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.8, "train_loss_last": 6.235775947570801, "eval_loss": 6.23379373550415, "elapsed_sec": 0.030637741088867188, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
24
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 4, "stage": null, "token_limit": 40000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.2377543449401855, "eval_loss": 6.226580619812012, "elapsed_sec": 0.06628823280334473, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
25
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 5, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.239316463470459, "eval_loss": 6.237186431884766, "elapsed_sec": 0.039865732192993164, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
26
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 5, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.23555850982666, "eval_loss": 6.231681823730469, "elapsed_sec": 0.039572954177856445, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
27
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 5, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.229386329650879, "eval_loss": 6.219687461853027, "elapsed_sec": 0.03072190284729004, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
28
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 5, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.239316463470459, "eval_loss": 6.237186431884766, "elapsed_sec": 0.04007697105407715, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
29
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 5, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.8, "train_loss_last": 6.239014148712158, "eval_loss": 6.233704566955566, "elapsed_sec": 0.031112194061279297, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
30
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 5, "stage": null, "token_limit": 40000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.230358123779297, "eval_loss": 6.227848052978516, "elapsed_sec": 0.06935000419616699, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
archive/runs_legacy_20260525/control-smoke/20260522-141107/summary.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase2_fixed_data_dropout_optimization,large_high_static_fixed_data,,20000,6553600,5,6.235925388336182,0.0016565421564777797
3
+ phase3_streaming_controls,large_high_dropout_streaming,0,20000,6553600,5,6.235925388336182,0.0016565421564777797
4
+ phase3_streaming_controls,large_high_dropout_streaming,1,40000,6553600,5,6.232331943511963,0.0022251812166423465
5
+ phase3_streaming_controls,large_static_dropout_streaming,0,20000,6553600,5,6.231371688842773,0.0015325284625100665
6
+ phase3_streaming_controls,large_static_dropout_streaming,1,40000,6553600,5,6.220234394073486,0.0025337993467635157
7
+ phase4_full_data_controls,large_static_full_data_from_start,,40000,6553600,5,6.224566555023193,0.005472880389192584
archive/runs_legacy_20260525/control-smoke/20260522-141107/summary.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase2_fixed_data_dropout_optimization",
4
+ "condition": "large_high_static_fixed_data",
5
+ "stage": null,
6
+ "token_limit": 20000,
7
+ "parameters": 6553600,
8
+ "n": 5,
9
+ "mean_eval_loss": 6.235925388336182,
10
+ "std_eval_loss": 0.0016565421564777797
11
+ },
12
+ {
13
+ "phase": "phase3_streaming_controls",
14
+ "condition": "large_high_dropout_streaming",
15
+ "stage": 0,
16
+ "token_limit": 20000,
17
+ "parameters": 6553600,
18
+ "n": 5,
19
+ "mean_eval_loss": 6.235925388336182,
20
+ "std_eval_loss": 0.0016565421564777797
21
+ },
22
+ {
23
+ "phase": "phase3_streaming_controls",
24
+ "condition": "large_high_dropout_streaming",
25
+ "stage": 1,
26
+ "token_limit": 40000,
27
+ "parameters": 6553600,
28
+ "n": 5,
29
+ "mean_eval_loss": 6.232331943511963,
30
+ "std_eval_loss": 0.0022251812166423465
31
+ },
32
+ {
33
+ "phase": "phase3_streaming_controls",
34
+ "condition": "large_static_dropout_streaming",
35
+ "stage": 0,
36
+ "token_limit": 20000,
37
+ "parameters": 6553600,
38
+ "n": 5,
39
+ "mean_eval_loss": 6.231371688842773,
40
+ "std_eval_loss": 0.0015325284625100665
41
+ },
42
+ {
43
+ "phase": "phase3_streaming_controls",
44
+ "condition": "large_static_dropout_streaming",
45
+ "stage": 1,
46
+ "token_limit": 40000,
47
+ "parameters": 6553600,
48
+ "n": 5,
49
+ "mean_eval_loss": 6.220234394073486,
50
+ "std_eval_loss": 0.0025337993467635157
51
+ },
52
+ {
53
+ "phase": "phase4_full_data_controls",
54
+ "condition": "large_static_full_data_from_start",
55
+ "stage": null,
56
+ "token_limit": 40000,
57
+ "parameters": 6553600,
58
+ "n": 5,
59
+ "mean_eval_loss": 6.224566555023193,
60
+ "std_eval_loss": 0.005472880389192584
61
+ }
62
+ ]
archive/runs_legacy_20260525/control-smoke/20260522-141107/tokenizer.json ADDED
@@ -0,0 +1,1629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|bos|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<|user_start|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|user_end|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<|assistant_start|>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<|assistant_end|>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<|python_start|>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<|python_end|>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<|output_start|>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<|output_end|>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ }
87
+ ],
88
+ "normalizer": null,
89
+ "pre_tokenizer": {
90
+ "type": "Sequence",
91
+ "pretokenizers": [
92
+ {
93
+ "type": "Split",
94
+ "pattern": {
95
+ "Regex": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,2}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
96
+ },
97
+ "behavior": "Isolated",
98
+ "invert": false
99
+ },
100
+ {
101
+ "type": "ByteLevel",
102
+ "add_prefix_space": false,
103
+ "trim_offsets": true,
104
+ "use_regex": false
105
+ }
106
+ ]
107
+ },
108
+ "post_processor": null,
109
+ "decoder": {
110
+ "type": "ByteLevel",
111
+ "add_prefix_space": true,
112
+ "trim_offsets": true,
113
+ "use_regex": true
114
+ },
115
+ "model": {
116
+ "type": "BPE",
117
+ "dropout": null,
118
+ "unk_token": null,
119
+ "continuing_subword_prefix": null,
120
+ "end_of_word_suffix": null,
121
+ "fuse_unk": false,
122
+ "byte_fallback": true,
123
+ "ignore_merges": false,
124
+ "vocab": {
125
+ "<|bos|>": 0,
126
+ "<|user_start|>": 1,
127
+ "<|user_end|>": 2,
128
+ "<|assistant_start|>": 3,
129
+ "<|assistant_end|>": 4,
130
+ "<|python_start|>": 5,
131
+ "<|python_end|>": 6,
132
+ "<|output_start|>": 7,
133
+ "<|output_end|>": 8,
134
+ "!": 9,
135
+ "\"": 10,
136
+ "#": 11,
137
+ "$": 12,
138
+ "%": 13,
139
+ "&": 14,
140
+ "'": 15,
141
+ "(": 16,
142
+ ")": 17,
143
+ "*": 18,
144
+ "+": 19,
145
+ ",": 20,
146
+ "-": 21,
147
+ ".": 22,
148
+ "/": 23,
149
+ "0": 24,
150
+ "1": 25,
151
+ "2": 26,
152
+ "3": 27,
153
+ "4": 28,
154
+ "5": 29,
155
+ "6": 30,
156
+ "7": 31,
157
+ "8": 32,
158
+ "9": 33,
159
+ ":": 34,
160
+ ";": 35,
161
+ "<": 36,
162
+ "=": 37,
163
+ ">": 38,
164
+ "?": 39,
165
+ "@": 40,
166
+ "A": 41,
167
+ "B": 42,
168
+ "C": 43,
169
+ "D": 44,
170
+ "E": 45,
171
+ "F": 46,
172
+ "G": 47,
173
+ "H": 48,
174
+ "I": 49,
175
+ "J": 50,
176
+ "K": 51,
177
+ "L": 52,
178
+ "M": 53,
179
+ "N": 54,
180
+ "O": 55,
181
+ "P": 56,
182
+ "Q": 57,
183
+ "R": 58,
184
+ "S": 59,
185
+ "T": 60,
186
+ "U": 61,
187
+ "V": 62,
188
+ "W": 63,
189
+ "X": 64,
190
+ "Y": 65,
191
+ "Z": 66,
192
+ "[": 67,
193
+ "\\": 68,
194
+ "]": 69,
195
+ "^": 70,
196
+ "_": 71,
197
+ "`": 72,
198
+ "a": 73,
199
+ "b": 74,
200
+ "c": 75,
201
+ "d": 76,
202
+ "e": 77,
203
+ "f": 78,
204
+ "g": 79,
205
+ "h": 80,
206
+ "i": 81,
207
+ "j": 82,
208
+ "k": 83,
209
+ "l": 84,
210
+ "m": 85,
211
+ "n": 86,
212
+ "o": 87,
213
+ "p": 88,
214
+ "q": 89,
215
+ "r": 90,
216
+ "s": 91,
217
+ "t": 92,
218
+ "u": 93,
219
+ "v": 94,
220
+ "w": 95,
221
+ "x": 96,
222
+ "y": 97,
223
+ "z": 98,
224
+ "{": 99,
225
+ "|": 100,
226
+ "}": 101,
227
+ "~": 102,
228
+ "¡": 103,
229
+ "¢": 104,
230
+ "£": 105,
231
+ "¤": 106,
232
+ "¥": 107,
233
+ "¦": 108,
234
+ "§": 109,
235
+ "¨": 110,
236
+ "©": 111,
237
+ "ª": 112,
238
+ "«": 113,
239
+ "¬": 114,
240
+ "®": 115,
241
+ "¯": 116,
242
+ "°": 117,
243
+ "±": 118,
244
+ "²": 119,
245
+ "³": 120,
246
+ "´": 121,
247
+ "µ": 122,
248
+ "¶": 123,
249
+ "·": 124,
250
+ "¸": 125,
251
+ "¹": 126,
252
+ "º": 127,
253
+ "»": 128,
254
+ "¼": 129,
255
+ "½": 130,
256
+ "¾": 131,
257
+ "¿": 132,
258
+ "À": 133,
259
+ "Á": 134,
260
+ "Â": 135,
261
+ "Ã": 136,
262
+ "Ä": 137,
263
+ "Å": 138,
264
+ "Æ": 139,
265
+ "Ç": 140,
266
+ "È": 141,
267
+ "É": 142,
268
+ "Ê": 143,
269
+ "Ë": 144,
270
+ "Ì": 145,
271
+ "Í": 146,
272
+ "Î": 147,
273
+ "Ï": 148,
274
+ "Ð": 149,
275
+ "Ñ": 150,
276
+ "Ò": 151,
277
+ "Ó": 152,
278
+ "Ô": 153,
279
+ "Õ": 154,
280
+ "Ö": 155,
281
+ "×": 156,
282
+ "Ø": 157,
283
+ "Ù": 158,
284
+ "Ú": 159,
285
+ "Û": 160,
286
+ "Ü": 161,
287
+ "Ý": 162,
288
+ "Þ": 163,
289
+ "ß": 164,
290
+ "à": 165,
291
+ "á": 166,
292
+ "â": 167,
293
+ "ã": 168,
294
+ "ä": 169,
295
+ "å": 170,
296
+ "æ": 171,
297
+ "ç": 172,
298
+ "è": 173,
299
+ "é": 174,
300
+ "ê": 175,
301
+ "ë": 176,
302
+ "ì": 177,
303
+ "í": 178,
304
+ "î": 179,
305
+ "ï": 180,
306
+ "ð": 181,
307
+ "ñ": 182,
308
+ "ò": 183,
309
+ "ó": 184,
310
+ "ô": 185,
311
+ "õ": 186,
312
+ "ö": 187,
313
+ "÷": 188,
314
+ "ø": 189,
315
+ "ù": 190,
316
+ "ú": 191,
317
+ "û": 192,
318
+ "ü": 193,
319
+ "ý": 194,
320
+ "þ": 195,
321
+ "ÿ": 196,
322
+ "Ā": 197,
323
+ "ā": 198,
324
+ "Ă": 199,
325
+ "ă": 200,
326
+ "Ą": 201,
327
+ "ą": 202,
328
+ "Ć": 203,
329
+ "ć": 204,
330
+ "Ĉ": 205,
331
+ "ĉ": 206,
332
+ "Ċ": 207,
333
+ "ċ": 208,
334
+ "Č": 209,
335
+ "č": 210,
336
+ "Ď": 211,
337
+ "ď": 212,
338
+ "Đ": 213,
339
+ "đ": 214,
340
+ "Ē": 215,
341
+ "ē": 216,
342
+ "Ĕ": 217,
343
+ "ĕ": 218,
344
+ "Ė": 219,
345
+ "ė": 220,
346
+ "Ę": 221,
347
+ "ę": 222,
348
+ "Ě": 223,
349
+ "ě": 224,
350
+ "Ĝ": 225,
351
+ "ĝ": 226,
352
+ "Ğ": 227,
353
+ "ğ": 228,
354
+ "Ġ": 229,
355
+ "ġ": 230,
356
+ "Ģ": 231,
357
+ "ģ": 232,
358
+ "Ĥ": 233,
359
+ "ĥ": 234,
360
+ "Ħ": 235,
361
+ "ħ": 236,
362
+ "Ĩ": 237,
363
+ "ĩ": 238,
364
+ "Ī": 239,
365
+ "ī": 240,
366
+ "Ĭ": 241,
367
+ "ĭ": 242,
368
+ "Į": 243,
369
+ "į": 244,
370
+ "İ": 245,
371
+ "ı": 246,
372
+ "IJ": 247,
373
+ "ij": 248,
374
+ "Ĵ": 249,
375
+ "ĵ": 250,
376
+ "Ķ": 251,
377
+ "ķ": 252,
378
+ "ĸ": 253,
379
+ "Ĺ": 254,
380
+ "ĺ": 255,
381
+ "Ļ": 256,
382
+ "ļ": 257,
383
+ "Ľ": 258,
384
+ "ľ": 259,
385
+ "Ŀ": 260,
386
+ "ŀ": 261,
387
+ "Ł": 262,
388
+ "ł": 263,
389
+ "Ń": 264,
390
+ "he": 265,
391
+ "Ġt": 266,
392
+ "Ġa": 267,
393
+ "Ġs": 268,
394
+ "Ġw": 269,
395
+ "Ġthe": 270,
396
+ "nd": 271,
397
+ "ed": 272,
398
+ "in": 273,
399
+ "Ġand": 274,
400
+ "Ġwa": 275,
401
+ "Ġb": 276,
402
+ "Ġto": 277,
403
+ "re": 278,
404
+ "Ġh": 279,
405
+ "ou": 280,
406
+ "it": 281,
407
+ "Ġf": 282,
408
+ "er": 283,
409
+ "ĊĊ": 284,
410
+ "Ġwas": 285,
411
+ "Ġl": 286,
412
+ "Ġc": 287,
413
+ "Ġhe": 288,
414
+ "Ġp": 289,
415
+ "ing": 290,
416
+ "Ġd": 291,
417
+ "Ġm": 292,
418
+ "Ġo": 293,
419
+ "Ġg": 294,
420
+ "ar": 295,
421
+ "is": 296,
422
+ "id": 297,
423
+ "ay": 298,
424
+ "om": 299,
425
+ "at": 300,
426
+ "ll": 301,
427
+ "en": 302,
428
+ "Ġsa": 303,
429
+ "ne": 304,
430
+ "The": 305,
431
+ ".ĊĊ": 306,
432
+ "le": 307,
433
+ "Ġth": 308,
434
+ "im": 309,
435
+ "an": 310,
436
+ "Ġha": 311,
437
+ "or": 312,
438
+ "Ġit": 313,
439
+ "et": 314,
440
+ "ver": 315,
441
+ "ld": 316,
442
+ "Ġin": 317,
443
+ "ĠS": 318,
444
+ "on": 319,
445
+ "Ġe": 320,
446
+ "ce": 321,
447
+ "Ġbe": 322,
448
+ "Ġher": 323,
449
+ "ir": 324,
450
+ "Ġ\"": 325,
451
+ "ĠH": 326,
452
+ "Ġu": 327,
453
+ "Ġsaid": 328,
454
+ "Ġn": 329,
455
+ "ck": 330,
456
+ "ow": 331,
457
+ "ri": 332,
458
+ "ĠThe": 333,
459
+ "Ġshe": 334,
460
+ "Ġso": 335,
461
+ "st": 336,
462
+ "Ġy": 337,
463
+ "ot": 338,
464
+ "ĠHe": 339,
465
+ "Ġof": 340,
466
+ "il": 341,
467
+ "Ġst": 342,
468
+ "ut": 343,
469
+ "ke": 344,
470
+ "am": 345,
471
+ "ked": 346,
472
+ "oo": 347,
473
+ "pp": 348,
474
+ "Ġr": 349,
475
+ "ĠShe": 350,
476
+ "very": 351,
477
+ "ĠI": 352,
478
+ "ve": 353,
479
+ "Ġthat": 354,
480
+ "ig": 355,
481
+ "ith": 356,
482
+ "Ġhis": 357,
483
+ "Ġup": 358,
484
+ "ĠĊĊ": 359,
485
+ "Ġday": 360,
486
+ "Ġwith": 361,
487
+ "Ġpl": 362,
488
+ "Ġyou": 363,
489
+ "itt": 364,
490
+ "ould": 365,
491
+ "el": 366,
492
+ "ted": 367,
493
+ "ent": 368,
494
+ "ad": 369,
495
+ "Ġhad": 370,
496
+ "ound": 371,
497
+ "al": 372,
498
+ "ĠJ": 373,
499
+ "Ġwe": 374,
500
+ "her": 375,
501
+ "ittle": 376,
502
+ "'s": 377,
503
+ "Ġsm": 378,
504
+ "Ġplay": 379,
505
+ "end": 380,
506
+ "Ġthey": 381,
507
+ "ack": 382,
508
+ "Ġthere": 383,
509
+ "ime": 384,
510
+ "ly": 385,
511
+ "Ġsh": 386,
512
+ "Ġlittle": 387,
513
+ "Ġre": 388,
514
+ "Ġne": 389,
515
+ "Ġtime": 390,
516
+ "out": 391,
517
+ "Ġfor": 392,
518
+ "un": 393,
519
+ "ch": 394,
520
+ "se": 395,
521
+ "Ġhapp": 396,
522
+ "Ġwh": 397,
523
+ "my": 398,
524
+ "ome": 399,
525
+ "ht": 400,
526
+ "um": 401,
527
+ "Ġfri": 402,
528
+ "Ġas": 403,
529
+ "Ġfriend": 404,
530
+ "Ġvery": 405,
531
+ "all": 406,
532
+ "ter": 407,
533
+ "â": 408,
534
+ "âĤ": 409,
535
+ "âĤ¬": 410,
536
+ "On": 411,
537
+ "Ġk": 412,
538
+ "ved": 413,
539
+ "ĠT": 414,
540
+ "Ġon": 415,
541
+ "irl": 416,
542
+ "Once": 417,
543
+ "ug": 418,
544
+ "\"ĊĊ": 419,
545
+ "ill": 420,
546
+ "Ġgirl": 421,
547
+ "Ġan": 422,
548
+ "es": 423,
549
+ "Ġex": 424,
550
+ "'t": 425,
551
+ "ec": 426,
552
+ "Ġbut": 427,
553
+ "Ġloo": 428,
554
+ "Ġli": 429,
555
+ "Ġbo": 430,
556
+ "Ġwere": 431,
557
+ "One": 432,
558
+ "Ġwan": 433,
559
+ "Ġhappy": 434,
560
+ "ake": 435,
561
+ "ore": 436,
562
+ "Ġbig": 437,
563
+ "fu": 438,
564
+ "Ġsp": 439,
565
+ "ide": 440,
566
+ "Ġsaw": 441,
567
+ "ĠB": 442,
568
+ "hing": 443,
569
+ "Ġupon": 444,
570
+ "ard": 445,
571
+ "Ġcould": 446,
572
+ "ic": 447,
573
+ "Ġout": 448,
574
+ "iled": 449,
575
+ "one": 450,
576
+ "round": 451,
577
+ "ra": 452,
578
+ "ry": 453,
579
+ "Ġsmiled": 454,
580
+ "Ġhim": 455,
581
+ "ĠA": 456,
582
+ "Ġmom": 457,
583
+ "hen": 458,
584
+ "way": 459,
585
+ "ur": 460,
586
+ "ĠIt": 461,
587
+ "ful": 462,
588
+ "ain": 463,
589
+ "Ġwent": 464,
590
+ "Ġhel": 465,
591
+ "Ġnot": 466,
592
+ "ĠThey": 467,
593
+ "Ġwanted": 468,
594
+ "ind": 469,
595
+ "are": 470,
596
+ "ear": 471,
597
+ "ĠM": 472,
598
+ "Ġall": 473,
599
+ "Ġfriends": 474,
600
+ "Ġtoo": 475,
601
+ "Ġgo": 476,
602
+ "ily": 477,
603
+ "ame": 478,
604
+ "ĠTim": 479,
605
+ "Ġhelp": 480,
606
+ "omet": 481,
607
+ "ĠL": 482,
608
+ "Ġlo": 483,
609
+ "ght": 484,
610
+ "Ġsomet": 485,
611
+ "Ġat": 486,
612
+ "Ġdo": 487,
613
+ "Ġasked": 488,
614
+ "!\"": 489,
615
+ "Ġaround": 490,
616
+ "Ġj": 491,
617
+ "ree": 492,
618
+ "Ġlooked": 493,
619
+ "Ġsomething": 494,
620
+ "Ġse": 495,
621
+ "Ġwor": 496,
622
+ "dd": 497,
623
+ "hed": 498,
624
+ "ood": 499,
625
+ "Ġcl": 500,
626
+ "amed": 501,
627
+ "ro": 502,
628
+ "Ġcan": 503,
629
+ "ark": 504,
630
+ "king": 505,
631
+ "ĠE": 506,
632
+ "rom": 507,
633
+ "Ġback": 508,
634
+ "Ġexc": 509,
635
+ "ab": 510,
636
+ "ick": 511
637
+ },
638
+ "merges": [
639
+ [
640
+ "h",
641
+ "e"
642
+ ],
643
+ [
644
+ "Ġ",
645
+ "t"
646
+ ],
647
+ [
648
+ "Ġ",
649
+ "a"
650
+ ],
651
+ [
652
+ "Ġ",
653
+ "s"
654
+ ],
655
+ [
656
+ "Ġ",
657
+ "w"
658
+ ],
659
+ [
660
+ "Ġt",
661
+ "he"
662
+ ],
663
+ [
664
+ "n",
665
+ "d"
666
+ ],
667
+ [
668
+ "e",
669
+ "d"
670
+ ],
671
+ [
672
+ "i",
673
+ "n"
674
+ ],
675
+ [
676
+ "Ġa",
677
+ "nd"
678
+ ],
679
+ [
680
+ "Ġw",
681
+ "a"
682
+ ],
683
+ [
684
+ "Ġ",
685
+ "b"
686
+ ],
687
+ [
688
+ "Ġt",
689
+ "o"
690
+ ],
691
+ [
692
+ "r",
693
+ "e"
694
+ ],
695
+ [
696
+ "Ġ",
697
+ "h"
698
+ ],
699
+ [
700
+ "o",
701
+ "u"
702
+ ],
703
+ [
704
+ "i",
705
+ "t"
706
+ ],
707
+ [
708
+ "Ġ",
709
+ "f"
710
+ ],
711
+ [
712
+ "e",
713
+ "r"
714
+ ],
715
+ [
716
+ "Ċ",
717
+ "Ċ"
718
+ ],
719
+ [
720
+ "Ġwa",
721
+ "s"
722
+ ],
723
+ [
724
+ "Ġ",
725
+ "l"
726
+ ],
727
+ [
728
+ "Ġ",
729
+ "c"
730
+ ],
731
+ [
732
+ "Ġ",
733
+ "he"
734
+ ],
735
+ [
736
+ "Ġ",
737
+ "p"
738
+ ],
739
+ [
740
+ "in",
741
+ "g"
742
+ ],
743
+ [
744
+ "Ġ",
745
+ "d"
746
+ ],
747
+ [
748
+ "Ġ",
749
+ "m"
750
+ ],
751
+ [
752
+ "Ġ",
753
+ "o"
754
+ ],
755
+ [
756
+ "Ġ",
757
+ "g"
758
+ ],
759
+ [
760
+ "a",
761
+ "r"
762
+ ],
763
+ [
764
+ "i",
765
+ "s"
766
+ ],
767
+ [
768
+ "i",
769
+ "d"
770
+ ],
771
+ [
772
+ "a",
773
+ "y"
774
+ ],
775
+ [
776
+ "o",
777
+ "m"
778
+ ],
779
+ [
780
+ "a",
781
+ "t"
782
+ ],
783
+ [
784
+ "l",
785
+ "l"
786
+ ],
787
+ [
788
+ "e",
789
+ "n"
790
+ ],
791
+ [
792
+ "Ġs",
793
+ "a"
794
+ ],
795
+ [
796
+ "n",
797
+ "e"
798
+ ],
799
+ [
800
+ "T",
801
+ "he"
802
+ ],
803
+ [
804
+ ".",
805
+ "ĊĊ"
806
+ ],
807
+ [
808
+ "l",
809
+ "e"
810
+ ],
811
+ [
812
+ "Ġt",
813
+ "h"
814
+ ],
815
+ [
816
+ "i",
817
+ "m"
818
+ ],
819
+ [
820
+ "a",
821
+ "n"
822
+ ],
823
+ [
824
+ "Ġh",
825
+ "a"
826
+ ],
827
+ [
828
+ "o",
829
+ "r"
830
+ ],
831
+ [
832
+ "Ġ",
833
+ "it"
834
+ ],
835
+ [
836
+ "e",
837
+ "t"
838
+ ],
839
+ [
840
+ "v",
841
+ "er"
842
+ ],
843
+ [
844
+ "l",
845
+ "d"
846
+ ],
847
+ [
848
+ "Ġ",
849
+ "in"
850
+ ],
851
+ [
852
+ "Ġ",
853
+ "S"
854
+ ],
855
+ [
856
+ "o",
857
+ "n"
858
+ ],
859
+ [
860
+ "Ġ",
861
+ "e"
862
+ ],
863
+ [
864
+ "c",
865
+ "e"
866
+ ],
867
+ [
868
+ "Ġb",
869
+ "e"
870
+ ],
871
+ [
872
+ "Ġhe",
873
+ "r"
874
+ ],
875
+ [
876
+ "i",
877
+ "r"
878
+ ],
879
+ [
880
+ "Ġ",
881
+ "\""
882
+ ],
883
+ [
884
+ "Ġ",
885
+ "H"
886
+ ],
887
+ [
888
+ "Ġ",
889
+ "u"
890
+ ],
891
+ [
892
+ "Ġsa",
893
+ "id"
894
+ ],
895
+ [
896
+ "Ġ",
897
+ "n"
898
+ ],
899
+ [
900
+ "c",
901
+ "k"
902
+ ],
903
+ [
904
+ "o",
905
+ "w"
906
+ ],
907
+ [
908
+ "r",
909
+ "i"
910
+ ],
911
+ [
912
+ "Ġ",
913
+ "The"
914
+ ],
915
+ [
916
+ "Ġs",
917
+ "he"
918
+ ],
919
+ [
920
+ "Ġs",
921
+ "o"
922
+ ],
923
+ [
924
+ "s",
925
+ "t"
926
+ ],
927
+ [
928
+ "Ġ",
929
+ "y"
930
+ ],
931
+ [
932
+ "o",
933
+ "t"
934
+ ],
935
+ [
936
+ "ĠH",
937
+ "e"
938
+ ],
939
+ [
940
+ "Ġo",
941
+ "f"
942
+ ],
943
+ [
944
+ "i",
945
+ "l"
946
+ ],
947
+ [
948
+ "Ġs",
949
+ "t"
950
+ ],
951
+ [
952
+ "u",
953
+ "t"
954
+ ],
955
+ [
956
+ "k",
957
+ "e"
958
+ ],
959
+ [
960
+ "a",
961
+ "m"
962
+ ],
963
+ [
964
+ "k",
965
+ "ed"
966
+ ],
967
+ [
968
+ "o",
969
+ "o"
970
+ ],
971
+ [
972
+ "p",
973
+ "p"
974
+ ],
975
+ [
976
+ "Ġ",
977
+ "r"
978
+ ],
979
+ [
980
+ "ĠS",
981
+ "he"
982
+ ],
983
+ [
984
+ "ver",
985
+ "y"
986
+ ],
987
+ [
988
+ "Ġ",
989
+ "I"
990
+ ],
991
+ [
992
+ "v",
993
+ "e"
994
+ ],
995
+ [
996
+ "Ġth",
997
+ "at"
998
+ ],
999
+ [
1000
+ "i",
1001
+ "g"
1002
+ ],
1003
+ [
1004
+ "it",
1005
+ "h"
1006
+ ],
1007
+ [
1008
+ "Ġh",
1009
+ "is"
1010
+ ],
1011
+ [
1012
+ "Ġu",
1013
+ "p"
1014
+ ],
1015
+ [
1016
+ "Ġ",
1017
+ "ĊĊ"
1018
+ ],
1019
+ [
1020
+ "Ġd",
1021
+ "ay"
1022
+ ],
1023
+ [
1024
+ "Ġw",
1025
+ "ith"
1026
+ ],
1027
+ [
1028
+ "Ġp",
1029
+ "l"
1030
+ ],
1031
+ [
1032
+ "Ġy",
1033
+ "ou"
1034
+ ],
1035
+ [
1036
+ "it",
1037
+ "t"
1038
+ ],
1039
+ [
1040
+ "ou",
1041
+ "ld"
1042
+ ],
1043
+ [
1044
+ "e",
1045
+ "l"
1046
+ ],
1047
+ [
1048
+ "t",
1049
+ "ed"
1050
+ ],
1051
+ [
1052
+ "en",
1053
+ "t"
1054
+ ],
1055
+ [
1056
+ "a",
1057
+ "d"
1058
+ ],
1059
+ [
1060
+ "Ġha",
1061
+ "d"
1062
+ ],
1063
+ [
1064
+ "ou",
1065
+ "nd"
1066
+ ],
1067
+ [
1068
+ "a",
1069
+ "l"
1070
+ ],
1071
+ [
1072
+ "Ġ",
1073
+ "J"
1074
+ ],
1075
+ [
1076
+ "Ġw",
1077
+ "e"
1078
+ ],
1079
+ [
1080
+ "he",
1081
+ "r"
1082
+ ],
1083
+ [
1084
+ "itt",
1085
+ "le"
1086
+ ],
1087
+ [
1088
+ "'",
1089
+ "s"
1090
+ ],
1091
+ [
1092
+ "Ġs",
1093
+ "m"
1094
+ ],
1095
+ [
1096
+ "Ġpl",
1097
+ "ay"
1098
+ ],
1099
+ [
1100
+ "e",
1101
+ "nd"
1102
+ ],
1103
+ [
1104
+ "Ġthe",
1105
+ "y"
1106
+ ],
1107
+ [
1108
+ "a",
1109
+ "ck"
1110
+ ],
1111
+ [
1112
+ "Ġthe",
1113
+ "re"
1114
+ ],
1115
+ [
1116
+ "im",
1117
+ "e"
1118
+ ],
1119
+ [
1120
+ "l",
1121
+ "y"
1122
+ ],
1123
+ [
1124
+ "Ġs",
1125
+ "h"
1126
+ ],
1127
+ [
1128
+ "Ġl",
1129
+ "ittle"
1130
+ ],
1131
+ [
1132
+ "Ġ",
1133
+ "re"
1134
+ ],
1135
+ [
1136
+ "Ġ",
1137
+ "ne"
1138
+ ],
1139
+ [
1140
+ "Ġt",
1141
+ "ime"
1142
+ ],
1143
+ [
1144
+ "ou",
1145
+ "t"
1146
+ ],
1147
+ [
1148
+ "Ġf",
1149
+ "or"
1150
+ ],
1151
+ [
1152
+ "u",
1153
+ "n"
1154
+ ],
1155
+ [
1156
+ "c",
1157
+ "h"
1158
+ ],
1159
+ [
1160
+ "s",
1161
+ "e"
1162
+ ],
1163
+ [
1164
+ "Ġha",
1165
+ "pp"
1166
+ ],
1167
+ [
1168
+ "Ġw",
1169
+ "h"
1170
+ ],
1171
+ [
1172
+ "m",
1173
+ "y"
1174
+ ],
1175
+ [
1176
+ "om",
1177
+ "e"
1178
+ ],
1179
+ [
1180
+ "h",
1181
+ "t"
1182
+ ],
1183
+ [
1184
+ "u",
1185
+ "m"
1186
+ ],
1187
+ [
1188
+ "Ġf",
1189
+ "ri"
1190
+ ],
1191
+ [
1192
+ "Ġa",
1193
+ "s"
1194
+ ],
1195
+ [
1196
+ "Ġfri",
1197
+ "end"
1198
+ ],
1199
+ [
1200
+ "Ġ",
1201
+ "very"
1202
+ ],
1203
+ [
1204
+ "a",
1205
+ "ll"
1206
+ ],
1207
+ [
1208
+ "t",
1209
+ "er"
1210
+ ],
1211
+ [
1212
+ "Ã",
1213
+ "¢"
1214
+ ],
1215
+ [
1216
+ "â",
1217
+ "Ĥ"
1218
+ ],
1219
+ [
1220
+ "âĤ",
1221
+ "¬"
1222
+ ],
1223
+ [
1224
+ "O",
1225
+ "n"
1226
+ ],
1227
+ [
1228
+ "Ġ",
1229
+ "k"
1230
+ ],
1231
+ [
1232
+ "v",
1233
+ "ed"
1234
+ ],
1235
+ [
1236
+ "Ġ",
1237
+ "T"
1238
+ ],
1239
+ [
1240
+ "Ġo",
1241
+ "n"
1242
+ ],
1243
+ [
1244
+ "ir",
1245
+ "l"
1246
+ ],
1247
+ [
1248
+ "On",
1249
+ "ce"
1250
+ ],
1251
+ [
1252
+ "u",
1253
+ "g"
1254
+ ],
1255
+ [
1256
+ "\"",
1257
+ "ĊĊ"
1258
+ ],
1259
+ [
1260
+ "i",
1261
+ "ll"
1262
+ ],
1263
+ [
1264
+ "Ġg",
1265
+ "irl"
1266
+ ],
1267
+ [
1268
+ "Ġa",
1269
+ "n"
1270
+ ],
1271
+ [
1272
+ "e",
1273
+ "s"
1274
+ ],
1275
+ [
1276
+ "Ġe",
1277
+ "x"
1278
+ ],
1279
+ [
1280
+ "'",
1281
+ "t"
1282
+ ],
1283
+ [
1284
+ "e",
1285
+ "c"
1286
+ ],
1287
+ [
1288
+ "Ġb",
1289
+ "ut"
1290
+ ],
1291
+ [
1292
+ "Ġl",
1293
+ "oo"
1294
+ ],
1295
+ [
1296
+ "Ġl",
1297
+ "i"
1298
+ ],
1299
+ [
1300
+ "Ġb",
1301
+ "o"
1302
+ ],
1303
+ [
1304
+ "Ġwe",
1305
+ "re"
1306
+ ],
1307
+ [
1308
+ "O",
1309
+ "ne"
1310
+ ],
1311
+ [
1312
+ "Ġwa",
1313
+ "n"
1314
+ ],
1315
+ [
1316
+ "Ġhapp",
1317
+ "y"
1318
+ ],
1319
+ [
1320
+ "a",
1321
+ "ke"
1322
+ ],
1323
+ [
1324
+ "o",
1325
+ "re"
1326
+ ],
1327
+ [
1328
+ "Ġb",
1329
+ "ig"
1330
+ ],
1331
+ [
1332
+ "f",
1333
+ "u"
1334
+ ],
1335
+ [
1336
+ "Ġs",
1337
+ "p"
1338
+ ],
1339
+ [
1340
+ "id",
1341
+ "e"
1342
+ ],
1343
+ [
1344
+ "Ġsa",
1345
+ "w"
1346
+ ],
1347
+ [
1348
+ "Ġ",
1349
+ "B"
1350
+ ],
1351
+ [
1352
+ "h",
1353
+ "ing"
1354
+ ],
1355
+ [
1356
+ "Ġup",
1357
+ "on"
1358
+ ],
1359
+ [
1360
+ "ar",
1361
+ "d"
1362
+ ],
1363
+ [
1364
+ "Ġc",
1365
+ "ould"
1366
+ ],
1367
+ [
1368
+ "i",
1369
+ "c"
1370
+ ],
1371
+ [
1372
+ "Ġ",
1373
+ "out"
1374
+ ],
1375
+ [
1376
+ "il",
1377
+ "ed"
1378
+ ],
1379
+ [
1380
+ "o",
1381
+ "ne"
1382
+ ],
1383
+ [
1384
+ "r",
1385
+ "ound"
1386
+ ],
1387
+ [
1388
+ "r",
1389
+ "a"
1390
+ ],
1391
+ [
1392
+ "r",
1393
+ "y"
1394
+ ],
1395
+ [
1396
+ "Ġsm",
1397
+ "iled"
1398
+ ],
1399
+ [
1400
+ "Ġh",
1401
+ "im"
1402
+ ],
1403
+ [
1404
+ "Ġ",
1405
+ "A"
1406
+ ],
1407
+ [
1408
+ "Ġm",
1409
+ "om"
1410
+ ],
1411
+ [
1412
+ "he",
1413
+ "n"
1414
+ ],
1415
+ [
1416
+ "w",
1417
+ "ay"
1418
+ ],
1419
+ [
1420
+ "u",
1421
+ "r"
1422
+ ],
1423
+ [
1424
+ "ĠI",
1425
+ "t"
1426
+ ],
1427
+ [
1428
+ "fu",
1429
+ "l"
1430
+ ],
1431
+ [
1432
+ "a",
1433
+ "in"
1434
+ ],
1435
+ [
1436
+ "Ġw",
1437
+ "ent"
1438
+ ],
1439
+ [
1440
+ "Ġhe",
1441
+ "l"
1442
+ ],
1443
+ [
1444
+ "Ġn",
1445
+ "ot"
1446
+ ],
1447
+ [
1448
+ "ĠThe",
1449
+ "y"
1450
+ ],
1451
+ [
1452
+ "Ġwan",
1453
+ "ted"
1454
+ ],
1455
+ [
1456
+ "i",
1457
+ "nd"
1458
+ ],
1459
+ [
1460
+ "a",
1461
+ "re"
1462
+ ],
1463
+ [
1464
+ "e",
1465
+ "ar"
1466
+ ],
1467
+ [
1468
+ "Ġ",
1469
+ "M"
1470
+ ],
1471
+ [
1472
+ "Ġa",
1473
+ "ll"
1474
+ ],
1475
+ [
1476
+ "Ġfriend",
1477
+ "s"
1478
+ ],
1479
+ [
1480
+ "Ġto",
1481
+ "o"
1482
+ ],
1483
+ [
1484
+ "Ġg",
1485
+ "o"
1486
+ ],
1487
+ [
1488
+ "il",
1489
+ "y"
1490
+ ],
1491
+ [
1492
+ "am",
1493
+ "e"
1494
+ ],
1495
+ [
1496
+ "ĠT",
1497
+ "im"
1498
+ ],
1499
+ [
1500
+ "Ġhel",
1501
+ "p"
1502
+ ],
1503
+ [
1504
+ "om",
1505
+ "et"
1506
+ ],
1507
+ [
1508
+ "Ġ",
1509
+ "L"
1510
+ ],
1511
+ [
1512
+ "Ġl",
1513
+ "o"
1514
+ ],
1515
+ [
1516
+ "g",
1517
+ "ht"
1518
+ ],
1519
+ [
1520
+ "Ġs",
1521
+ "omet"
1522
+ ],
1523
+ [
1524
+ "Ġa",
1525
+ "t"
1526
+ ],
1527
+ [
1528
+ "Ġd",
1529
+ "o"
1530
+ ],
1531
+ [
1532
+ "Ġas",
1533
+ "ked"
1534
+ ],
1535
+ [
1536
+ "!",
1537
+ "\""
1538
+ ],
1539
+ [
1540
+ "Ġa",
1541
+ "round"
1542
+ ],
1543
+ [
1544
+ "Ġ",
1545
+ "j"
1546
+ ],
1547
+ [
1548
+ "re",
1549
+ "e"
1550
+ ],
1551
+ [
1552
+ "Ġloo",
1553
+ "ked"
1554
+ ],
1555
+ [
1556
+ "Ġsomet",
1557
+ "hing"
1558
+ ],
1559
+ [
1560
+ "Ġs",
1561
+ "e"
1562
+ ],
1563
+ [
1564
+ "Ġw",
1565
+ "or"
1566
+ ],
1567
+ [
1568
+ "d",
1569
+ "d"
1570
+ ],
1571
+ [
1572
+ "he",
1573
+ "d"
1574
+ ],
1575
+ [
1576
+ "oo",
1577
+ "d"
1578
+ ],
1579
+ [
1580
+ "Ġc",
1581
+ "l"
1582
+ ],
1583
+ [
1584
+ "am",
1585
+ "ed"
1586
+ ],
1587
+ [
1588
+ "r",
1589
+ "o"
1590
+ ],
1591
+ [
1592
+ "Ġc",
1593
+ "an"
1594
+ ],
1595
+ [
1596
+ "ar",
1597
+ "k"
1598
+ ],
1599
+ [
1600
+ "k",
1601
+ "ing"
1602
+ ],
1603
+ [
1604
+ "Ġ",
1605
+ "E"
1606
+ ],
1607
+ [
1608
+ "r",
1609
+ "om"
1610
+ ],
1611
+ [
1612
+ "Ġb",
1613
+ "ack"
1614
+ ],
1615
+ [
1616
+ "Ġex",
1617
+ "c"
1618
+ ],
1619
+ [
1620
+ "a",
1621
+ "b"
1622
+ ],
1623
+ [
1624
+ "i",
1625
+ "ck"
1626
+ ]
1627
+ ]
1628
+ }
1629
+ }
archive/runs_legacy_20260525/controls/20260522-141120/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": null,
4
+ "corpus_glob": "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet",
5
+ "text_column": "text",
6
+ "output_dir": "runs/controls",
7
+ "suite": "controls",
8
+ "seeds": [
9
+ 1,
10
+ 2,
11
+ 3,
12
+ 4,
13
+ 5
14
+ ],
15
+ "initial_tokens": 5000000,
16
+ "stream_token_caps": [
17
+ 5000000,
18
+ 10000000,
19
+ 20000000,
20
+ 40000000
21
+ ],
22
+ "val_tokens": 500000,
23
+ "allow_short_corpus": false,
24
+ "force_retokenize": false,
25
+ "vocab_size": 4096,
26
+ "tokenizer_train_chars": 10000000,
27
+ "block_size": 128,
28
+ "batch_size": 16,
29
+ "small_layers": 4,
30
+ "small_heads": 4,
31
+ "small_embd": 128,
32
+ "large_layers": 8,
33
+ "large_heads": 8,
34
+ "large_embd": 256,
35
+ "steps_per_run": 2000,
36
+ "stream_steps_per_stage": 1000,
37
+ "eval_batches": 64,
38
+ "log_every": 500,
39
+ "lr": 0.0003,
40
+ "weight_decay": 0.1,
41
+ "grad_clip": 1.0,
42
+ "baseline_dropout": 0.1,
43
+ "high_dropout": 0.8,
44
+ "dropout_decay_tokens": null,
45
+ "dropout_schedule": "cosine"
46
+ },
47
+ "device": "mps",
48
+ "torch": "2.9.1",
49
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
50
+ "tokenizer_path": "runs/controls/20260522-141120/tokenizer.json",
51
+ "encoded_path": "runs/controls/20260522-141120/tokens-v4096-uint16.npy",
52
+ "train_tokens": 40000160,
53
+ "val_tokens": 500000,
54
+ "effective_initial_tokens": 5000000,
55
+ "effective_stream_token_caps": [
56
+ 5000000,
57
+ 10000000,
58
+ 20000000,
59
+ 40000000
60
+ ]
61
+ }
archive/runs_legacy_20260525/controls/20260522-141120/metrics.jsonl ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 1, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.082937240600586, "eval_loss": 6.814276188611984, "elapsed_sec": 75.75013089179993, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
2
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.1, "train_loss_last": 5.391843318939209, "eval_loss": 5.567012831568718, "elapsed_sec": 38.50700092315674, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
3
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.914791584014893, "eval_loss": 5.084997855126858, "elapsed_sec": 38.48884105682373, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
4
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.900063514709473, "eval_loss": 4.867573603987694, "elapsed_sec": 38.46290588378906, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
5
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.553297519683838, "eval_loss": 4.741723917424679, "elapsed_sec": 38.33492302894592, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
6
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.8, "train_loss_last": 6.440576076507568, "eval_loss": 6.986421458423138, "elapsed_sec": 38.296988010406494, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
7
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.248085975646973, "eval_loss": 6.673301495611668, "elapsed_sec": 39.15381717681885, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
8
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.8, "train_loss_last": 6.112507343292236, "eval_loss": 6.5272732228040695, "elapsed_sec": 40.44030404090881, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
9
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.8, "train_loss_last": 5.754788398742676, "eval_loss": 6.267793655395508, "elapsed_sec": 41.09470295906067, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
10
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 1, "stage": null, "token_limit": 40000000, "steps": 4000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.643326759338379, "eval_loss": 4.662323743104935, "elapsed_sec": 166.3614981174469, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
11
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 2, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.124882698059082, "eval_loss": 6.84135477244854, "elapsed_sec": 83.92461490631104, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
12
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 2, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.1, "train_loss_last": 5.3705644607543945, "eval_loss": 5.689733423292637, "elapsed_sec": 42.08622622489929, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
13
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 2, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.033831596374512, "eval_loss": 5.1538038700819016, "elapsed_sec": 42.04193305969238, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
14
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 2, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.8757171630859375, "eval_loss": 4.890640087425709, "elapsed_sec": 42.02943682670593, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
15
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 2, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.631913185119629, "eval_loss": 4.67994287610054, "elapsed_sec": 42.03091287612915, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
16
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 2, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.8, "train_loss_last": 6.397376537322998, "eval_loss": 7.076408430933952, "elapsed_sec": 42.1185507774353, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
17
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 2, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.197352409362793, "eval_loss": 6.774318270385265, "elapsed_sec": 42.08896017074585, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
18
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 2, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.8, "train_loss_last": 6.076539993286133, "eval_loss": 6.475470535457134, "elapsed_sec": 42.058411836624146, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
19
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 2, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.8, "train_loss_last": 5.836711883544922, "eval_loss": 6.320266917347908, "elapsed_sec": 42.04223084449768, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
20
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 2, "stage": null, "token_limit": 40000000, "steps": 4000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.5983476638793945, "eval_loss": 4.709539167582989, "elapsed_sec": 166.29632186889648, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
21
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 3, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.094411849975586, "eval_loss": 6.762092016637325, "elapsed_sec": 83.63494491577148, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
22
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 3, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.1, "train_loss_last": 5.442709922790527, "eval_loss": 5.6250176429748535, "elapsed_sec": 41.97037887573242, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
23
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 3, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.9339494705200195, "eval_loss": 5.091545574367046, "elapsed_sec": 41.93742275238037, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
24
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 3, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.820074081420898, "eval_loss": 4.85930198431015, "elapsed_sec": 42.00108504295349, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
25
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 3, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.612112522125244, "eval_loss": 4.680890738964081, "elapsed_sec": 42.07970404624939, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
26
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 3, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.8, "train_loss_last": 6.521695613861084, "eval_loss": 6.976697988808155, "elapsed_sec": 42.128258228302, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
27
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 3, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.109576225280762, "eval_loss": 6.698994763195515, "elapsed_sec": 42.083462953567505, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
28
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 3, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.8, "train_loss_last": 6.1710710525512695, "eval_loss": 6.4792612716555595, "elapsed_sec": 42.04126191139221, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
29
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 3, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.8, "train_loss_last": 5.799557685852051, "eval_loss": 6.188394881784916, "elapsed_sec": 41.99322700500488, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
30
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 3, "stage": null, "token_limit": 40000000, "steps": 4000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.611385345458984, "eval_loss": 4.758806340396404, "elapsed_sec": 165.85685300827026, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
31
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 4, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.14980411529541, "eval_loss": 6.930917248129845, "elapsed_sec": 83.3329451084137, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
32
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 4, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.1, "train_loss_last": 5.4042534828186035, "eval_loss": 5.632189579308033, "elapsed_sec": 41.89421987533569, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
33
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 4, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.005560874938965, "eval_loss": 5.1024143025279045, "elapsed_sec": 41.94953107833862, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
34
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 4, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.75225830078125, "eval_loss": 4.807056985795498, "elapsed_sec": 41.98645496368408, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
35
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 4, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.604494094848633, "eval_loss": 4.6223960518836975, "elapsed_sec": 42.013452768325806, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
36
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 4, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.8, "train_loss_last": 6.446765899658203, "eval_loss": 6.958507925271988, "elapsed_sec": 42.097583055496216, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
37
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 4, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.149364471435547, "eval_loss": 6.848812915384769, "elapsed_sec": 42.05741786956787, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
38
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 4, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.8, "train_loss_last": 6.284854888916016, "eval_loss": 6.4219686314463615, "elapsed_sec": 42.04438519477844, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
39
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 4, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.8, "train_loss_last": 5.919014930725098, "eval_loss": 6.207054376602173, "elapsed_sec": 41.94203591346741, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
40
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 4, "stage": null, "token_limit": 40000000, "steps": 4000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.628224849700928, "eval_loss": 4.670411393046379, "elapsed_sec": 166.0933392047882, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
41
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_high_static_fixed_data", "seed": 5, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.167638778686523, "eval_loss": 6.828722849488258, "elapsed_sec": 83.42469191551208, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
42
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 5, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.1, "train_loss_last": 5.504278182983398, "eval_loss": 5.640969134867191, "elapsed_sec": 41.92028093338013, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
43
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 5, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.050647735595703, "eval_loss": 5.076318271458149, "elapsed_sec": 41.88388991355896, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
44
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 5, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.690304756164551, "eval_loss": 4.823811210691929, "elapsed_sec": 41.971457719802856, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
45
+ {"phase": "phase3_streaming_controls", "condition": "large_static_dropout_streaming", "seed": 5, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.762080669403076, "eval_loss": 4.646689280867577, "elapsed_sec": 41.99556493759155, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
46
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 5, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.8, "train_loss_last": 6.486720085144043, "eval_loss": 6.950839214026928, "elapsed_sec": 42.06647801399231, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
47
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 5, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.8, "train_loss_last": 6.209081649780273, "eval_loss": 6.823136441409588, "elapsed_sec": 42.13153791427612, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
48
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 5, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.8, "train_loss_last": 5.975912094116211, "eval_loss": 6.513336889445782, "elapsed_sec": 42.0465452671051, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
49
+ {"phase": "phase3_streaming_controls", "condition": "large_high_dropout_streaming", "seed": 5, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.8, "train_loss_last": 5.925265789031982, "eval_loss": 6.211368426680565, "elapsed_sec": 41.90367317199707, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
50
+ {"phase": "phase4_full_data_controls", "condition": "large_static_full_data_from_start", "seed": 5, "stage": null, "token_limit": 40000000, "steps": 4000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.792789459228516, "eval_loss": 4.656416833400726, "elapsed_sec": 166.22296500205994, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
archive/runs_legacy_20260525/controls/20260522-141120/summary.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase2_fixed_data_dropout_optimization,large_high_static_fixed_data,,5000000,8388608,5,6.835472615063191,0.06128588298231963
3
+ phase3_streaming_controls,large_high_dropout_streaming,0,5000000,8388608,5,6.9897750034928325,0.0504510436205119
4
+ phase3_streaming_controls,large_high_dropout_streaming,1,10000000,8388608,5,6.763712777197361,0.07623697109669446
5
+ phase3_streaming_controls,large_high_dropout_streaming,2,20000000,8388608,5,6.483462110161781,0.040849957965590676
6
+ phase3_streaming_controls,large_high_dropout_streaming,3,40000000,8388608,5,6.238975651562214,0.054263911050486884
7
+ phase3_streaming_controls,large_static_dropout_streaming,0,5000000,8388608,5,5.6309845224022865,0.0438193989069106
8
+ phase3_streaming_controls,large_static_dropout_streaming,1,10000000,8388608,5,5.1018159747123715,0.0305841560371311
9
+ phase3_streaming_controls,large_static_dropout_streaming,2,20000000,8388608,5,4.849676774442196,0.03382457670014637
10
+ phase3_streaming_controls,large_static_dropout_streaming,3,40000000,8388608,5,4.674328573048115,0.0449378239644584
11
+ phase4_full_data_controls,large_static_full_data_from_start,,40000000,8388608,5,4.691499495506287,0.04296035064933353
archive/runs_legacy_20260525/controls/20260522-141120/summary.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase2_fixed_data_dropout_optimization",
4
+ "condition": "large_high_static_fixed_data",
5
+ "stage": null,
6
+ "token_limit": 5000000,
7
+ "parameters": 8388608,
8
+ "n": 5,
9
+ "mean_eval_loss": 6.835472615063191,
10
+ "std_eval_loss": 0.06128588298231963
11
+ },
12
+ {
13
+ "phase": "phase3_streaming_controls",
14
+ "condition": "large_high_dropout_streaming",
15
+ "stage": 0,
16
+ "token_limit": 5000000,
17
+ "parameters": 8388608,
18
+ "n": 5,
19
+ "mean_eval_loss": 6.9897750034928325,
20
+ "std_eval_loss": 0.0504510436205119
21
+ },
22
+ {
23
+ "phase": "phase3_streaming_controls",
24
+ "condition": "large_high_dropout_streaming",
25
+ "stage": 1,
26
+ "token_limit": 10000000,
27
+ "parameters": 8388608,
28
+ "n": 5,
29
+ "mean_eval_loss": 6.763712777197361,
30
+ "std_eval_loss": 0.07623697109669446
31
+ },
32
+ {
33
+ "phase": "phase3_streaming_controls",
34
+ "condition": "large_high_dropout_streaming",
35
+ "stage": 2,
36
+ "token_limit": 20000000,
37
+ "parameters": 8388608,
38
+ "n": 5,
39
+ "mean_eval_loss": 6.483462110161781,
40
+ "std_eval_loss": 0.040849957965590676
41
+ },
42
+ {
43
+ "phase": "phase3_streaming_controls",
44
+ "condition": "large_high_dropout_streaming",
45
+ "stage": 3,
46
+ "token_limit": 40000000,
47
+ "parameters": 8388608,
48
+ "n": 5,
49
+ "mean_eval_loss": 6.238975651562214,
50
+ "std_eval_loss": 0.054263911050486884
51
+ },
52
+ {
53
+ "phase": "phase3_streaming_controls",
54
+ "condition": "large_static_dropout_streaming",
55
+ "stage": 0,
56
+ "token_limit": 5000000,
57
+ "parameters": 8388608,
58
+ "n": 5,
59
+ "mean_eval_loss": 5.6309845224022865,
60
+ "std_eval_loss": 0.0438193989069106
61
+ },
62
+ {
63
+ "phase": "phase3_streaming_controls",
64
+ "condition": "large_static_dropout_streaming",
65
+ "stage": 1,
66
+ "token_limit": 10000000,
67
+ "parameters": 8388608,
68
+ "n": 5,
69
+ "mean_eval_loss": 5.1018159747123715,
70
+ "std_eval_loss": 0.0305841560371311
71
+ },
72
+ {
73
+ "phase": "phase3_streaming_controls",
74
+ "condition": "large_static_dropout_streaming",
75
+ "stage": 2,
76
+ "token_limit": 20000000,
77
+ "parameters": 8388608,
78
+ "n": 5,
79
+ "mean_eval_loss": 4.849676774442196,
80
+ "std_eval_loss": 0.03382457670014637
81
+ },
82
+ {
83
+ "phase": "phase3_streaming_controls",
84
+ "condition": "large_static_dropout_streaming",
85
+ "stage": 3,
86
+ "token_limit": 40000000,
87
+ "parameters": 8388608,
88
+ "n": 5,
89
+ "mean_eval_loss": 4.674328573048115,
90
+ "std_eval_loss": 0.0449378239644584
91
+ },
92
+ {
93
+ "phase": "phase4_full_data_controls",
94
+ "condition": "large_static_full_data_from_start",
95
+ "stage": null,
96
+ "token_limit": 40000000,
97
+ "parameters": 8388608,
98
+ "n": 5,
99
+ "mean_eval_loss": 4.691499495506287,
100
+ "std_eval_loss": 0.04296035064933353
101
+ }
102
+ ]
archive/runs_legacy_20260525/controls/20260522-141120/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
archive/runs_legacy_20260525/publishable/20260522-132351/config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": null,
4
+ "corpus_glob": "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet",
5
+ "text_column": "text",
6
+ "output_dir": "runs/publishable",
7
+ "seeds": [
8
+ 1,
9
+ 2,
10
+ 3,
11
+ 4,
12
+ 5
13
+ ],
14
+ "initial_tokens": 5000000,
15
+ "stream_token_caps": [
16
+ 5000000,
17
+ 10000000,
18
+ 20000000,
19
+ 40000000
20
+ ],
21
+ "val_tokens": 500000,
22
+ "allow_short_corpus": false,
23
+ "force_retokenize": false,
24
+ "vocab_size": 4096,
25
+ "tokenizer_train_chars": 10000000,
26
+ "block_size": 128,
27
+ "batch_size": 16,
28
+ "small_layers": 4,
29
+ "small_heads": 4,
30
+ "small_embd": 128,
31
+ "large_layers": 8,
32
+ "large_heads": 8,
33
+ "large_embd": 256,
34
+ "steps_per_run": 2000,
35
+ "stream_steps_per_stage": 1000,
36
+ "eval_batches": 64,
37
+ "log_every": 500,
38
+ "lr": 0.0003,
39
+ "weight_decay": 0.1,
40
+ "grad_clip": 1.0,
41
+ "baseline_dropout": 0.1,
42
+ "high_dropout": 0.8,
43
+ "dropout_decay_tokens": null,
44
+ "dropout_schedule": "cosine"
45
+ },
46
+ "device": "mps",
47
+ "torch": "2.9.1",
48
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
49
+ "tokenizer_path": "runs/publishable/20260522-132351/tokenizer.json",
50
+ "encoded_path": "runs/publishable/20260522-132351/tokens-v4096-uint16.npy",
51
+ "train_tokens": 40000160,
52
+ "val_tokens": 500000,
53
+ "effective_initial_tokens": 5000000,
54
+ "effective_stream_token_caps": [
55
+ 5000000,
56
+ 10000000,
57
+ 20000000,
58
+ 40000000
59
+ ]
60
+ }
archive/runs_legacy_20260525/publishable/20260522-132351/metrics.jsonl ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 1, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.450620174407959, "eval_loss": 5.772196859121323, "elapsed_sec": 31.631267070770264, "parameters": 1835008, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
2
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 1, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.913602828979492, "eval_loss": 5.207479499280453, "elapsed_sec": 80.3967981338501, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
3
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 1, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.7820626459322383, "train_loss_last": 6.046256065368652, "eval_loss": 6.741213291883469, "elapsed_sec": 85.33950591087341, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
4
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.7733578363789504, "train_loss_last": 6.342434883117676, "eval_loss": 6.946014620363712, "elapsed_sec": 38.47059369087219, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
5
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.6974873734152917, "train_loss_last": 5.969331741333008, "eval_loss": 6.201295383274555, "elapsed_sec": 39.12462592124939, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
6
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.45000000000000007, "train_loss_last": 5.437297821044922, "eval_loss": 5.363524094223976, "elapsed_sec": 40.467276096343994, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
7
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.748871803283691, "eval_loss": 4.967737302184105, "elapsed_sec": 41.435073137283325, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
8
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 2, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.485232830047607, "eval_loss": 5.843787379562855, "elapsed_sec": 30.04755687713623, "parameters": 1835008, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
9
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 2, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.922041416168213, "eval_loss": 5.218974679708481, "elapsed_sec": 84.12733793258667, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
10
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 2, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.7820626459322383, "train_loss_last": 6.0871381759643555, "eval_loss": 6.744259864091873, "elapsed_sec": 84.6592960357666, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
11
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.7733578363789504, "train_loss_last": 6.306346893310547, "eval_loss": 6.984690964221954, "elapsed_sec": 42.66448211669922, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
12
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.6974873734152917, "train_loss_last": 5.959338188171387, "eval_loss": 6.249792195856571, "elapsed_sec": 42.512439250946045, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
13
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.45000000000000007, "train_loss_last": 5.383289813995361, "eval_loss": 5.421286106109619, "elapsed_sec": 42.43638801574707, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
14
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.842014312744141, "eval_loss": 4.901044279336929, "elapsed_sec": 42.34708309173584, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
15
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 3, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.422863960266113, "eval_loss": 5.829333141446114, "elapsed_sec": 29.95318865776062, "parameters": 1835008, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
16
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 3, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.909119606018066, "eval_loss": 5.191086798906326, "elapsed_sec": 81.83940720558167, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
17
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 3, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.7820626459322383, "train_loss_last": 6.057084083557129, "eval_loss": 6.660528361797333, "elapsed_sec": 81.67780780792236, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
18
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.7733578363789504, "train_loss_last": 6.432514190673828, "eval_loss": 6.905187286436558, "elapsed_sec": 41.10620903968811, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
19
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.6974873734152917, "train_loss_last": 5.846698760986328, "eval_loss": 6.245104171335697, "elapsed_sec": 41.87897992134094, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
20
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.45000000000000007, "train_loss_last": 5.401530742645264, "eval_loss": 5.512500010430813, "elapsed_sec": 42.506834983825684, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
21
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.839996337890625, "eval_loss": 4.933420121669769, "elapsed_sec": 42.479960203170776, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
22
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 4, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.519951820373535, "eval_loss": 5.800006277859211, "elapsed_sec": 29.89086604118347, "parameters": 1835008, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
23
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 4, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.959898471832275, "eval_loss": 5.166869513690472, "elapsed_sec": 84.1205141544342, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
24
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 4, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.7820626459322383, "train_loss_last": 6.105772018432617, "eval_loss": 6.792966462671757, "elapsed_sec": 84.61292290687561, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
25
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.7733578363789504, "train_loss_last": 6.376546859741211, "eval_loss": 6.8808704018592834, "elapsed_sec": 42.68510293960571, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
26
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.6974873734152917, "train_loss_last": 5.919679641723633, "eval_loss": 6.322697900235653, "elapsed_sec": 42.60045385360718, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
27
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.45000000000000007, "train_loss_last": 5.4210944175720215, "eval_loss": 5.348750911653042, "elapsed_sec": 42.2938597202301, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
28
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.828921318054199, "eval_loss": 4.846531391143799, "elapsed_sec": 42.36263298988342, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
29
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 5, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.451421737670898, "eval_loss": 5.83368007838726, "elapsed_sec": 29.718762159347534, "parameters": 1835008, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
30
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 5, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 4.846748352050781, "eval_loss": 5.233553200960159, "elapsed_sec": 83.74996209144592, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
31
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 5, "stage": null, "token_limit": 5000000, "steps": 2000, "tokens_seen": 4096000, "dropout": 0.7820626459322383, "train_loss_last": 6.124170780181885, "eval_loss": 6.755315415561199, "elapsed_sec": 84.20570206642151, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
32
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.7733578363789504, "train_loss_last": 6.408639907836914, "eval_loss": 6.889496520161629, "elapsed_sec": 42.334360122680664, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
33
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.6974873734152917, "train_loss_last": 5.972659111022949, "eval_loss": 6.325718097388744, "elapsed_sec": 42.13765597343445, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
34
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.45000000000000007, "train_loss_last": 5.278019428253174, "eval_loss": 5.48484568297863, "elapsed_sec": 42.261926889419556, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
35
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.95810079574585, "eval_loss": 4.896161742508411, "elapsed_sec": 42.27780485153198, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
archive/runs_legacy_20260525/publishable/20260522-132351/summary.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase1_baseline_comparison,large_static,,5000000,8388608,5,5.203592738509178,0.02576011500878837
3
+ phase1_baseline_comparison,small_static,,5000000,1835008,5,5.815800747275352,0.029312165077338317
4
+ phase2_fixed_data_dropout_optimization,large_dropout_decay_fixed_data,,5000000,8388608,5,6.738856679201126,0.04839636351890629
5
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,0,5000000,8388608,5,6.921251958608627,0.0434017526687061
6
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,1,10000000,8388608,5,6.268921549618244,0.05390788852828186
7
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,2,20000000,8388608,5,5.426181361079216,0.0721764902844538
8
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,3,40000000,8388608,5,4.908978967368602,0.045212974458371656
archive/runs_legacy_20260525/publishable/20260522-132351/summary.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase1_baseline_comparison",
4
+ "condition": "large_static",
5
+ "stage": null,
6
+ "token_limit": 5000000,
7
+ "parameters": 8388608,
8
+ "n": 5,
9
+ "mean_eval_loss": 5.203592738509178,
10
+ "std_eval_loss": 0.02576011500878837
11
+ },
12
+ {
13
+ "phase": "phase1_baseline_comparison",
14
+ "condition": "small_static",
15
+ "stage": null,
16
+ "token_limit": 5000000,
17
+ "parameters": 1835008,
18
+ "n": 5,
19
+ "mean_eval_loss": 5.815800747275352,
20
+ "std_eval_loss": 0.029312165077338317
21
+ },
22
+ {
23
+ "phase": "phase2_fixed_data_dropout_optimization",
24
+ "condition": "large_dropout_decay_fixed_data",
25
+ "stage": null,
26
+ "token_limit": 5000000,
27
+ "parameters": 8388608,
28
+ "n": 5,
29
+ "mean_eval_loss": 6.738856679201126,
30
+ "std_eval_loss": 0.04839636351890629
31
+ },
32
+ {
33
+ "phase": "phase3_simulated_streaming_scale_up",
34
+ "condition": "large_dropout_decay_streaming",
35
+ "stage": 0,
36
+ "token_limit": 5000000,
37
+ "parameters": 8388608,
38
+ "n": 5,
39
+ "mean_eval_loss": 6.921251958608627,
40
+ "std_eval_loss": 0.0434017526687061
41
+ },
42
+ {
43
+ "phase": "phase3_simulated_streaming_scale_up",
44
+ "condition": "large_dropout_decay_streaming",
45
+ "stage": 1,
46
+ "token_limit": 10000000,
47
+ "parameters": 8388608,
48
+ "n": 5,
49
+ "mean_eval_loss": 6.268921549618244,
50
+ "std_eval_loss": 0.05390788852828186
51
+ },
52
+ {
53
+ "phase": "phase3_simulated_streaming_scale_up",
54
+ "condition": "large_dropout_decay_streaming",
55
+ "stage": 2,
56
+ "token_limit": 20000000,
57
+ "parameters": 8388608,
58
+ "n": 5,
59
+ "mean_eval_loss": 5.426181361079216,
60
+ "std_eval_loss": 0.0721764902844538
61
+ },
62
+ {
63
+ "phase": "phase3_simulated_streaming_scale_up",
64
+ "condition": "large_dropout_decay_streaming",
65
+ "stage": 3,
66
+ "token_limit": 40000000,
67
+ "parameters": 8388608,
68
+ "n": 5,
69
+ "mean_eval_loss": 4.908978967368602,
70
+ "std_eval_loss": 0.045212974458371656
71
+ }
72
+ ]
archive/runs_legacy_20260525/publishable/20260522-132351/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
archive/runs_legacy_20260525/smoke/20260522-132106/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": "/Users/mandeepsidhu/Desktop/code/transformer/data/tinystories_train.txt",
4
+ "corpus_glob": null,
5
+ "text_column": "text",
6
+ "output_dir": "runs/smoke",
7
+ "seeds": [
8
+ 1,
9
+ 2,
10
+ 3,
11
+ 4,
12
+ 5
13
+ ],
14
+ "initial_tokens": 20000,
15
+ "stream_token_caps": [
16
+ 20000,
17
+ 40000,
18
+ 80000
19
+ ],
20
+ "val_tokens": 500000,
21
+ "allow_short_corpus": true,
22
+ "force_retokenize": false,
23
+ "vocab_size": 512,
24
+ "tokenizer_train_chars": 200000,
25
+ "block_size": 32,
26
+ "batch_size": 4,
27
+ "steps_per_run": 2,
28
+ "stream_steps_per_stage": 1,
29
+ "eval_batches": 1,
30
+ "lr": 0.0003,
31
+ "weight_decay": 0.1,
32
+ "grad_clip": 1.0,
33
+ "baseline_dropout": 0.1,
34
+ "high_dropout": 0.8,
35
+ "dropout_decay_tokens": null,
36
+ "dropout_schedule": "cosine"
37
+ },
38
+ "device": "mps",
39
+ "torch": "2.9.1",
40
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
41
+ "tokenizer_path": "runs/smoke/20260522-132106/tokenizer.json",
42
+ "encoded_path": "runs/smoke/20260522-132106/tokens-v512-uint16.npy",
43
+ "train_tokens": 777983,
44
+ "val_tokens": 86442,
45
+ "effective_initial_tokens": 20000,
46
+ "effective_stream_token_caps": [
47
+ 20000,
48
+ 40000,
49
+ 80000
50
+ ]
51
+ }
archive/runs_legacy_20260525/smoke/20260522-132106/metrics.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 1, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.231201171875, "eval_loss": 6.233233451843262, "elapsed_sec": 2.1697847843170166, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
2
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 1, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.222594738006592, "eval_loss": 6.22459602355957, "elapsed_sec": 0.2784230709075928, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
3
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 1, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.238912582397461, "eval_loss": 6.234484672546387, "elapsed_sec": 0.0704340934753418, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
4
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.240694999694824, "eval_loss": 6.236194610595703, "elapsed_sec": 0.04119396209716797, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
5
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.233834266662598, "eval_loss": 6.2317962646484375, "elapsed_sec": 0.032363176345825195, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
6
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231166839599609, "eval_loss": 6.221035957336426, "elapsed_sec": 0.030178070068359375, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
7
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 2, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.234216690063477, "eval_loss": 6.230881214141846, "elapsed_sec": 0.03483176231384277, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
8
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 2, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.228965759277344, "eval_loss": 6.218391418457031, "elapsed_sec": 0.06615877151489258, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
9
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 2, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.237312316894531, "eval_loss": 6.230140209197998, "elapsed_sec": 0.0667262077331543, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
10
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.23726749420166, "eval_loss": 6.232943534851074, "elapsed_sec": 0.03887295722961426, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
11
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.2372612953186035, "eval_loss": 6.226807594299316, "elapsed_sec": 0.02962803840637207, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
12
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.228798866271973, "eval_loss": 6.214606285095215, "elapsed_sec": 0.0299530029296875, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
13
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 3, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.236220359802246, "eval_loss": 6.231494903564453, "elapsed_sec": 0.032794952392578125, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
14
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 3, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.231418132781982, "eval_loss": 6.218476295471191, "elapsed_sec": 0.06430721282958984, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
15
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 3, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.236932754516602, "eval_loss": 6.230288505554199, "elapsed_sec": 0.06323909759521484, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
16
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.238128662109375, "eval_loss": 6.2329301834106445, "elapsed_sec": 0.03883004188537598, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
17
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.236502170562744, "eval_loss": 6.225698471069336, "elapsed_sec": 0.03026604652404785, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
18
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231315612792969, "eval_loss": 6.213197708129883, "elapsed_sec": 0.030527830123901367, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
19
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 4, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.235054016113281, "eval_loss": 6.234989166259766, "elapsed_sec": 0.03532004356384277, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
20
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 4, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.231119632720947, "eval_loss": 6.223240852355957, "elapsed_sec": 0.065032958984375, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
21
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 4, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.237499237060547, "eval_loss": 6.234406471252441, "elapsed_sec": 0.06367802619934082, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
22
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.239429950714111, "eval_loss": 6.235345363616943, "elapsed_sec": 0.03851008415222168, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
23
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.23697566986084, "eval_loss": 6.229433536529541, "elapsed_sec": 0.030126094818115234, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
24
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231019020080566, "eval_loss": 6.218218803405762, "elapsed_sec": 0.02992105484008789, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
25
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 5, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.236649513244629, "eval_loss": 6.2315874099731445, "elapsed_sec": 0.034050703048706055, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
26
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 5, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.230384826660156, "eval_loss": 6.221922874450684, "elapsed_sec": 0.06528592109680176, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
27
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 5, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.236981391906738, "eval_loss": 6.234584331512451, "elapsed_sec": 0.06582903861999512, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
28
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.239423751831055, "eval_loss": 6.236317157745361, "elapsed_sec": 0.04046988487243652, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
29
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.234818458557129, "eval_loss": 6.230718612670898, "elapsed_sec": 0.0317072868347168, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
30
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.232948303222656, "eval_loss": 6.2211151123046875, "elapsed_sec": 0.030757665634155273, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
archive/runs_legacy_20260525/smoke/20260522-132106/summary.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase1_baseline_comparison",
4
+ "condition": "large_static",
5
+ "stage": null,
6
+ "token_limit": 20000,
7
+ "parameters": 6553600,
8
+ "n": 5,
9
+ "mean_eval_loss": 6.221325492858886,
10
+ "std_eval_loss": 0.0028039506140008267
11
+ },
12
+ {
13
+ "phase": "phase1_baseline_comparison",
14
+ "condition": "small_static",
15
+ "stage": null,
16
+ "token_limit": 20000,
17
+ "parameters": 917504,
18
+ "n": 5,
19
+ "mean_eval_loss": 6.232437229156494,
20
+ "std_eval_loss": 0.001671653854539933
21
+ },
22
+ {
23
+ "phase": "phase2_fixed_data_dropout_optimization",
24
+ "condition": "large_dropout_decay_fixed_data",
25
+ "stage": null,
26
+ "token_limit": 20000,
27
+ "parameters": 6553600,
28
+ "n": 5,
29
+ "mean_eval_loss": 6.232780838012696,
30
+ "std_eval_loss": 0.0023442997873944736
31
+ },
32
+ {
33
+ "phase": "phase3_simulated_streaming_scale_up",
34
+ "condition": "large_dropout_decay_streaming",
35
+ "stage": 0,
36
+ "token_limit": 20000,
37
+ "parameters": 6553600,
38
+ "n": 5,
39
+ "mean_eval_loss": 6.2347461700439455,
40
+ "std_eval_loss": 0.0016935404053936482
41
+ },
42
+ {
43
+ "phase": "phase3_simulated_streaming_scale_up",
44
+ "condition": "large_dropout_decay_streaming",
45
+ "stage": 1,
46
+ "token_limit": 40000,
47
+ "parameters": 6553600,
48
+ "n": 5,
49
+ "mean_eval_loss": 6.228890895843506,
50
+ "std_eval_loss": 0.0025791421476287733
51
+ },
52
+ {
53
+ "phase": "phase3_simulated_streaming_scale_up",
54
+ "condition": "large_dropout_decay_streaming",
55
+ "stage": 2,
56
+ "token_limit": 80000,
57
+ "parameters": 6553600,
58
+ "n": 5,
59
+ "mean_eval_loss": 6.217634773254394,
60
+ "std_eval_loss": 0.0036359727629622267
61
+ }
62
+ ]
archive/runs_legacy_20260525/smoke/20260522-132106/tokenizer.json ADDED
@@ -0,0 +1,1629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|bos|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<|user_start|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|user_end|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<|assistant_start|>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<|assistant_end|>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<|python_start|>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<|python_end|>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<|output_start|>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<|output_end|>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ }
87
+ ],
88
+ "normalizer": null,
89
+ "pre_tokenizer": {
90
+ "type": "Sequence",
91
+ "pretokenizers": [
92
+ {
93
+ "type": "Split",
94
+ "pattern": {
95
+ "Regex": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,2}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
96
+ },
97
+ "behavior": "Isolated",
98
+ "invert": false
99
+ },
100
+ {
101
+ "type": "ByteLevel",
102
+ "add_prefix_space": false,
103
+ "trim_offsets": true,
104
+ "use_regex": false
105
+ }
106
+ ]
107
+ },
108
+ "post_processor": null,
109
+ "decoder": {
110
+ "type": "ByteLevel",
111
+ "add_prefix_space": true,
112
+ "trim_offsets": true,
113
+ "use_regex": true
114
+ },
115
+ "model": {
116
+ "type": "BPE",
117
+ "dropout": null,
118
+ "unk_token": null,
119
+ "continuing_subword_prefix": null,
120
+ "end_of_word_suffix": null,
121
+ "fuse_unk": false,
122
+ "byte_fallback": true,
123
+ "ignore_merges": false,
124
+ "vocab": {
125
+ "<|bos|>": 0,
126
+ "<|user_start|>": 1,
127
+ "<|user_end|>": 2,
128
+ "<|assistant_start|>": 3,
129
+ "<|assistant_end|>": 4,
130
+ "<|python_start|>": 5,
131
+ "<|python_end|>": 6,
132
+ "<|output_start|>": 7,
133
+ "<|output_end|>": 8,
134
+ "!": 9,
135
+ "\"": 10,
136
+ "#": 11,
137
+ "$": 12,
138
+ "%": 13,
139
+ "&": 14,
140
+ "'": 15,
141
+ "(": 16,
142
+ ")": 17,
143
+ "*": 18,
144
+ "+": 19,
145
+ ",": 20,
146
+ "-": 21,
147
+ ".": 22,
148
+ "/": 23,
149
+ "0": 24,
150
+ "1": 25,
151
+ "2": 26,
152
+ "3": 27,
153
+ "4": 28,
154
+ "5": 29,
155
+ "6": 30,
156
+ "7": 31,
157
+ "8": 32,
158
+ "9": 33,
159
+ ":": 34,
160
+ ";": 35,
161
+ "<": 36,
162
+ "=": 37,
163
+ ">": 38,
164
+ "?": 39,
165
+ "@": 40,
166
+ "A": 41,
167
+ "B": 42,
168
+ "C": 43,
169
+ "D": 44,
170
+ "E": 45,
171
+ "F": 46,
172
+ "G": 47,
173
+ "H": 48,
174
+ "I": 49,
175
+ "J": 50,
176
+ "K": 51,
177
+ "L": 52,
178
+ "M": 53,
179
+ "N": 54,
180
+ "O": 55,
181
+ "P": 56,
182
+ "Q": 57,
183
+ "R": 58,
184
+ "S": 59,
185
+ "T": 60,
186
+ "U": 61,
187
+ "V": 62,
188
+ "W": 63,
189
+ "X": 64,
190
+ "Y": 65,
191
+ "Z": 66,
192
+ "[": 67,
193
+ "\\": 68,
194
+ "]": 69,
195
+ "^": 70,
196
+ "_": 71,
197
+ "`": 72,
198
+ "a": 73,
199
+ "b": 74,
200
+ "c": 75,
201
+ "d": 76,
202
+ "e": 77,
203
+ "f": 78,
204
+ "g": 79,
205
+ "h": 80,
206
+ "i": 81,
207
+ "j": 82,
208
+ "k": 83,
209
+ "l": 84,
210
+ "m": 85,
211
+ "n": 86,
212
+ "o": 87,
213
+ "p": 88,
214
+ "q": 89,
215
+ "r": 90,
216
+ "s": 91,
217
+ "t": 92,
218
+ "u": 93,
219
+ "v": 94,
220
+ "w": 95,
221
+ "x": 96,
222
+ "y": 97,
223
+ "z": 98,
224
+ "{": 99,
225
+ "|": 100,
226
+ "}": 101,
227
+ "~": 102,
228
+ "¡": 103,
229
+ "¢": 104,
230
+ "£": 105,
231
+ "¤": 106,
232
+ "¥": 107,
233
+ "¦": 108,
234
+ "§": 109,
235
+ "¨": 110,
236
+ "©": 111,
237
+ "ª": 112,
238
+ "«": 113,
239
+ "¬": 114,
240
+ "®": 115,
241
+ "¯": 116,
242
+ "°": 117,
243
+ "±": 118,
244
+ "²": 119,
245
+ "³": 120,
246
+ "´": 121,
247
+ "µ": 122,
248
+ "¶": 123,
249
+ "·": 124,
250
+ "¸": 125,
251
+ "¹": 126,
252
+ "º": 127,
253
+ "»": 128,
254
+ "¼": 129,
255
+ "½": 130,
256
+ "¾": 131,
257
+ "¿": 132,
258
+ "À": 133,
259
+ "Á": 134,
260
+ "Â": 135,
261
+ "Ã": 136,
262
+ "Ä": 137,
263
+ "Å": 138,
264
+ "Æ": 139,
265
+ "Ç": 140,
266
+ "È": 141,
267
+ "É": 142,
268
+ "Ê": 143,
269
+ "Ë": 144,
270
+ "Ì": 145,
271
+ "Í": 146,
272
+ "Î": 147,
273
+ "Ï": 148,
274
+ "Ð": 149,
275
+ "Ñ": 150,
276
+ "Ò": 151,
277
+ "Ó": 152,
278
+ "Ô": 153,
279
+ "Õ": 154,
280
+ "Ö": 155,
281
+ "×": 156,
282
+ "Ø": 157,
283
+ "Ù": 158,
284
+ "Ú": 159,
285
+ "Û": 160,
286
+ "Ü": 161,
287
+ "Ý": 162,
288
+ "Þ": 163,
289
+ "ß": 164,
290
+ "à": 165,
291
+ "á": 166,
292
+ "â": 167,
293
+ "ã": 168,
294
+ "ä": 169,
295
+ "å": 170,
296
+ "æ": 171,
297
+ "ç": 172,
298
+ "è": 173,
299
+ "é": 174,
300
+ "ê": 175,
301
+ "ë": 176,
302
+ "ì": 177,
303
+ "í": 178,
304
+ "î": 179,
305
+ "ï": 180,
306
+ "ð": 181,
307
+ "ñ": 182,
308
+ "ò": 183,
309
+ "ó": 184,
310
+ "ô": 185,
311
+ "õ": 186,
312
+ "ö": 187,
313
+ "÷": 188,
314
+ "ø": 189,
315
+ "ù": 190,
316
+ "ú": 191,
317
+ "û": 192,
318
+ "ü": 193,
319
+ "ý": 194,
320
+ "þ": 195,
321
+ "ÿ": 196,
322
+ "Ā": 197,
323
+ "ā": 198,
324
+ "Ă": 199,
325
+ "ă": 200,
326
+ "Ą": 201,
327
+ "ą": 202,
328
+ "Ć": 203,
329
+ "ć": 204,
330
+ "Ĉ": 205,
331
+ "ĉ": 206,
332
+ "Ċ": 207,
333
+ "ċ": 208,
334
+ "Č": 209,
335
+ "č": 210,
336
+ "Ď": 211,
337
+ "ď": 212,
338
+ "Đ": 213,
339
+ "đ": 214,
340
+ "Ē": 215,
341
+ "ē": 216,
342
+ "Ĕ": 217,
343
+ "ĕ": 218,
344
+ "Ė": 219,
345
+ "ė": 220,
346
+ "Ę": 221,
347
+ "ę": 222,
348
+ "Ě": 223,
349
+ "ě": 224,
350
+ "Ĝ": 225,
351
+ "ĝ": 226,
352
+ "Ğ": 227,
353
+ "ğ": 228,
354
+ "Ġ": 229,
355
+ "ġ": 230,
356
+ "Ģ": 231,
357
+ "ģ": 232,
358
+ "Ĥ": 233,
359
+ "ĥ": 234,
360
+ "Ħ": 235,
361
+ "ħ": 236,
362
+ "Ĩ": 237,
363
+ "ĩ": 238,
364
+ "Ī": 239,
365
+ "ī": 240,
366
+ "Ĭ": 241,
367
+ "ĭ": 242,
368
+ "Į": 243,
369
+ "į": 244,
370
+ "İ": 245,
371
+ "ı": 246,
372
+ "IJ": 247,
373
+ "ij": 248,
374
+ "Ĵ": 249,
375
+ "ĵ": 250,
376
+ "Ķ": 251,
377
+ "ķ": 252,
378
+ "ĸ": 253,
379
+ "Ĺ": 254,
380
+ "ĺ": 255,
381
+ "Ļ": 256,
382
+ "ļ": 257,
383
+ "Ľ": 258,
384
+ "ľ": 259,
385
+ "Ŀ": 260,
386
+ "ŀ": 261,
387
+ "Ł": 262,
388
+ "ł": 263,
389
+ "Ń": 264,
390
+ "he": 265,
391
+ "Ġt": 266,
392
+ "Ġa": 267,
393
+ "Ġs": 268,
394
+ "Ġw": 269,
395
+ "Ġthe": 270,
396
+ "nd": 271,
397
+ "ed": 272,
398
+ "in": 273,
399
+ "Ġand": 274,
400
+ "Ġwa": 275,
401
+ "Ġb": 276,
402
+ "Ġto": 277,
403
+ "re": 278,
404
+ "Ġh": 279,
405
+ "ou": 280,
406
+ "it": 281,
407
+ "Ġf": 282,
408
+ "er": 283,
409
+ "ĊĊ": 284,
410
+ "Ġwas": 285,
411
+ "Ġl": 286,
412
+ "Ġc": 287,
413
+ "Ġhe": 288,
414
+ "Ġp": 289,
415
+ "ing": 290,
416
+ "Ġd": 291,
417
+ "Ġm": 292,
418
+ "Ġo": 293,
419
+ "Ġg": 294,
420
+ "ar": 295,
421
+ "is": 296,
422
+ "id": 297,
423
+ "ay": 298,
424
+ "om": 299,
425
+ "at": 300,
426
+ "ll": 301,
427
+ "en": 302,
428
+ "Ġsa": 303,
429
+ "ne": 304,
430
+ "The": 305,
431
+ ".ĊĊ": 306,
432
+ "le": 307,
433
+ "Ġth": 308,
434
+ "im": 309,
435
+ "an": 310,
436
+ "Ġha": 311,
437
+ "or": 312,
438
+ "Ġit": 313,
439
+ "et": 314,
440
+ "ver": 315,
441
+ "ld": 316,
442
+ "Ġin": 317,
443
+ "ĠS": 318,
444
+ "on": 319,
445
+ "Ġe": 320,
446
+ "ce": 321,
447
+ "Ġbe": 322,
448
+ "Ġher": 323,
449
+ "ir": 324,
450
+ "Ġ\"": 325,
451
+ "ĠH": 326,
452
+ "Ġu": 327,
453
+ "Ġsaid": 328,
454
+ "Ġn": 329,
455
+ "ck": 330,
456
+ "ow": 331,
457
+ "ri": 332,
458
+ "ĠThe": 333,
459
+ "Ġshe": 334,
460
+ "Ġso": 335,
461
+ "st": 336,
462
+ "Ġy": 337,
463
+ "ot": 338,
464
+ "ĠHe": 339,
465
+ "Ġof": 340,
466
+ "il": 341,
467
+ "Ġst": 342,
468
+ "ut": 343,
469
+ "ke": 344,
470
+ "am": 345,
471
+ "ked": 346,
472
+ "oo": 347,
473
+ "pp": 348,
474
+ "Ġr": 349,
475
+ "ĠShe": 350,
476
+ "very": 351,
477
+ "ĠI": 352,
478
+ "ve": 353,
479
+ "Ġthat": 354,
480
+ "ig": 355,
481
+ "ith": 356,
482
+ "Ġhis": 357,
483
+ "Ġup": 358,
484
+ "ĠĊĊ": 359,
485
+ "Ġday": 360,
486
+ "Ġwith": 361,
487
+ "Ġpl": 362,
488
+ "Ġyou": 363,
489
+ "itt": 364,
490
+ "ould": 365,
491
+ "el": 366,
492
+ "ted": 367,
493
+ "ent": 368,
494
+ "ad": 369,
495
+ "Ġhad": 370,
496
+ "ound": 371,
497
+ "al": 372,
498
+ "ĠJ": 373,
499
+ "Ġwe": 374,
500
+ "her": 375,
501
+ "ittle": 376,
502
+ "'s": 377,
503
+ "Ġsm": 378,
504
+ "Ġplay": 379,
505
+ "end": 380,
506
+ "Ġthey": 381,
507
+ "ack": 382,
508
+ "Ġthere": 383,
509
+ "ime": 384,
510
+ "ly": 385,
511
+ "Ġsh": 386,
512
+ "Ġlittle": 387,
513
+ "Ġre": 388,
514
+ "Ġne": 389,
515
+ "Ġtime": 390,
516
+ "out": 391,
517
+ "Ġfor": 392,
518
+ "un": 393,
519
+ "ch": 394,
520
+ "se": 395,
521
+ "Ġhapp": 396,
522
+ "Ġwh": 397,
523
+ "my": 398,
524
+ "ome": 399,
525
+ "ht": 400,
526
+ "um": 401,
527
+ "Ġfri": 402,
528
+ "Ġas": 403,
529
+ "Ġfriend": 404,
530
+ "Ġvery": 405,
531
+ "all": 406,
532
+ "ter": 407,
533
+ "â": 408,
534
+ "âĤ": 409,
535
+ "âĤ¬": 410,
536
+ "On": 411,
537
+ "Ġk": 412,
538
+ "ved": 413,
539
+ "ĠT": 414,
540
+ "Ġon": 415,
541
+ "irl": 416,
542
+ "Once": 417,
543
+ "ug": 418,
544
+ "\"ĊĊ": 419,
545
+ "ill": 420,
546
+ "Ġgirl": 421,
547
+ "Ġan": 422,
548
+ "es": 423,
549
+ "Ġex": 424,
550
+ "'t": 425,
551
+ "ec": 426,
552
+ "Ġbut": 427,
553
+ "Ġloo": 428,
554
+ "Ġli": 429,
555
+ "Ġbo": 430,
556
+ "Ġwere": 431,
557
+ "One": 432,
558
+ "Ġwan": 433,
559
+ "Ġhappy": 434,
560
+ "ake": 435,
561
+ "ore": 436,
562
+ "Ġbig": 437,
563
+ "fu": 438,
564
+ "Ġsp": 439,
565
+ "ide": 440,
566
+ "Ġsaw": 441,
567
+ "ĠB": 442,
568
+ "hing": 443,
569
+ "Ġupon": 444,
570
+ "ard": 445,
571
+ "Ġcould": 446,
572
+ "ic": 447,
573
+ "Ġout": 448,
574
+ "iled": 449,
575
+ "one": 450,
576
+ "round": 451,
577
+ "ra": 452,
578
+ "ry": 453,
579
+ "Ġsmiled": 454,
580
+ "Ġhim": 455,
581
+ "ĠA": 456,
582
+ "Ġmom": 457,
583
+ "hen": 458,
584
+ "way": 459,
585
+ "ur": 460,
586
+ "ĠIt": 461,
587
+ "ful": 462,
588
+ "ain": 463,
589
+ "Ġwent": 464,
590
+ "Ġhel": 465,
591
+ "Ġnot": 466,
592
+ "ĠThey": 467,
593
+ "Ġwanted": 468,
594
+ "ind": 469,
595
+ "are": 470,
596
+ "ear": 471,
597
+ "ĠM": 472,
598
+ "Ġall": 473,
599
+ "Ġfriends": 474,
600
+ "Ġtoo": 475,
601
+ "Ġgo": 476,
602
+ "ily": 477,
603
+ "ame": 478,
604
+ "ĠTim": 479,
605
+ "Ġhelp": 480,
606
+ "omet": 481,
607
+ "ĠL": 482,
608
+ "Ġlo": 483,
609
+ "ght": 484,
610
+ "Ġsomet": 485,
611
+ "Ġat": 486,
612
+ "Ġdo": 487,
613
+ "Ġasked": 488,
614
+ "!\"": 489,
615
+ "Ġaround": 490,
616
+ "Ġj": 491,
617
+ "ree": 492,
618
+ "Ġlooked": 493,
619
+ "Ġsomething": 494,
620
+ "Ġse": 495,
621
+ "Ġwor": 496,
622
+ "dd": 497,
623
+ "hed": 498,
624
+ "ood": 499,
625
+ "Ġcl": 500,
626
+ "amed": 501,
627
+ "ro": 502,
628
+ "Ġcan": 503,
629
+ "ark": 504,
630
+ "king": 505,
631
+ "ĠE": 506,
632
+ "rom": 507,
633
+ "Ġback": 508,
634
+ "Ġexc": 509,
635
+ "ab": 510,
636
+ "ick": 511
637
+ },
638
+ "merges": [
639
+ [
640
+ "h",
641
+ "e"
642
+ ],
643
+ [
644
+ "Ġ",
645
+ "t"
646
+ ],
647
+ [
648
+ "Ġ",
649
+ "a"
650
+ ],
651
+ [
652
+ "Ġ",
653
+ "s"
654
+ ],
655
+ [
656
+ "Ġ",
657
+ "w"
658
+ ],
659
+ [
660
+ "Ġt",
661
+ "he"
662
+ ],
663
+ [
664
+ "n",
665
+ "d"
666
+ ],
667
+ [
668
+ "e",
669
+ "d"
670
+ ],
671
+ [
672
+ "i",
673
+ "n"
674
+ ],
675
+ [
676
+ "Ġa",
677
+ "nd"
678
+ ],
679
+ [
680
+ "Ġw",
681
+ "a"
682
+ ],
683
+ [
684
+ "Ġ",
685
+ "b"
686
+ ],
687
+ [
688
+ "Ġt",
689
+ "o"
690
+ ],
691
+ [
692
+ "r",
693
+ "e"
694
+ ],
695
+ [
696
+ "Ġ",
697
+ "h"
698
+ ],
699
+ [
700
+ "o",
701
+ "u"
702
+ ],
703
+ [
704
+ "i",
705
+ "t"
706
+ ],
707
+ [
708
+ "Ġ",
709
+ "f"
710
+ ],
711
+ [
712
+ "e",
713
+ "r"
714
+ ],
715
+ [
716
+ "Ċ",
717
+ "Ċ"
718
+ ],
719
+ [
720
+ "Ġwa",
721
+ "s"
722
+ ],
723
+ [
724
+ "Ġ",
725
+ "l"
726
+ ],
727
+ [
728
+ "Ġ",
729
+ "c"
730
+ ],
731
+ [
732
+ "Ġ",
733
+ "he"
734
+ ],
735
+ [
736
+ "Ġ",
737
+ "p"
738
+ ],
739
+ [
740
+ "in",
741
+ "g"
742
+ ],
743
+ [
744
+ "Ġ",
745
+ "d"
746
+ ],
747
+ [
748
+ "Ġ",
749
+ "m"
750
+ ],
751
+ [
752
+ "Ġ",
753
+ "o"
754
+ ],
755
+ [
756
+ "Ġ",
757
+ "g"
758
+ ],
759
+ [
760
+ "a",
761
+ "r"
762
+ ],
763
+ [
764
+ "i",
765
+ "s"
766
+ ],
767
+ [
768
+ "i",
769
+ "d"
770
+ ],
771
+ [
772
+ "a",
773
+ "y"
774
+ ],
775
+ [
776
+ "o",
777
+ "m"
778
+ ],
779
+ [
780
+ "a",
781
+ "t"
782
+ ],
783
+ [
784
+ "l",
785
+ "l"
786
+ ],
787
+ [
788
+ "e",
789
+ "n"
790
+ ],
791
+ [
792
+ "Ġs",
793
+ "a"
794
+ ],
795
+ [
796
+ "n",
797
+ "e"
798
+ ],
799
+ [
800
+ "T",
801
+ "he"
802
+ ],
803
+ [
804
+ ".",
805
+ "ĊĊ"
806
+ ],
807
+ [
808
+ "l",
809
+ "e"
810
+ ],
811
+ [
812
+ "Ġt",
813
+ "h"
814
+ ],
815
+ [
816
+ "i",
817
+ "m"
818
+ ],
819
+ [
820
+ "a",
821
+ "n"
822
+ ],
823
+ [
824
+ "Ġh",
825
+ "a"
826
+ ],
827
+ [
828
+ "o",
829
+ "r"
830
+ ],
831
+ [
832
+ "Ġ",
833
+ "it"
834
+ ],
835
+ [
836
+ "e",
837
+ "t"
838
+ ],
839
+ [
840
+ "v",
841
+ "er"
842
+ ],
843
+ [
844
+ "l",
845
+ "d"
846
+ ],
847
+ [
848
+ "Ġ",
849
+ "in"
850
+ ],
851
+ [
852
+ "Ġ",
853
+ "S"
854
+ ],
855
+ [
856
+ "o",
857
+ "n"
858
+ ],
859
+ [
860
+ "Ġ",
861
+ "e"
862
+ ],
863
+ [
864
+ "c",
865
+ "e"
866
+ ],
867
+ [
868
+ "Ġb",
869
+ "e"
870
+ ],
871
+ [
872
+ "Ġhe",
873
+ "r"
874
+ ],
875
+ [
876
+ "i",
877
+ "r"
878
+ ],
879
+ [
880
+ "Ġ",
881
+ "\""
882
+ ],
883
+ [
884
+ "Ġ",
885
+ "H"
886
+ ],
887
+ [
888
+ "Ġ",
889
+ "u"
890
+ ],
891
+ [
892
+ "Ġsa",
893
+ "id"
894
+ ],
895
+ [
896
+ "Ġ",
897
+ "n"
898
+ ],
899
+ [
900
+ "c",
901
+ "k"
902
+ ],
903
+ [
904
+ "o",
905
+ "w"
906
+ ],
907
+ [
908
+ "r",
909
+ "i"
910
+ ],
911
+ [
912
+ "Ġ",
913
+ "The"
914
+ ],
915
+ [
916
+ "Ġs",
917
+ "he"
918
+ ],
919
+ [
920
+ "Ġs",
921
+ "o"
922
+ ],
923
+ [
924
+ "s",
925
+ "t"
926
+ ],
927
+ [
928
+ "Ġ",
929
+ "y"
930
+ ],
931
+ [
932
+ "o",
933
+ "t"
934
+ ],
935
+ [
936
+ "ĠH",
937
+ "e"
938
+ ],
939
+ [
940
+ "Ġo",
941
+ "f"
942
+ ],
943
+ [
944
+ "i",
945
+ "l"
946
+ ],
947
+ [
948
+ "Ġs",
949
+ "t"
950
+ ],
951
+ [
952
+ "u",
953
+ "t"
954
+ ],
955
+ [
956
+ "k",
957
+ "e"
958
+ ],
959
+ [
960
+ "a",
961
+ "m"
962
+ ],
963
+ [
964
+ "k",
965
+ "ed"
966
+ ],
967
+ [
968
+ "o",
969
+ "o"
970
+ ],
971
+ [
972
+ "p",
973
+ "p"
974
+ ],
975
+ [
976
+ "Ġ",
977
+ "r"
978
+ ],
979
+ [
980
+ "ĠS",
981
+ "he"
982
+ ],
983
+ [
984
+ "ver",
985
+ "y"
986
+ ],
987
+ [
988
+ "Ġ",
989
+ "I"
990
+ ],
991
+ [
992
+ "v",
993
+ "e"
994
+ ],
995
+ [
996
+ "Ġth",
997
+ "at"
998
+ ],
999
+ [
1000
+ "i",
1001
+ "g"
1002
+ ],
1003
+ [
1004
+ "it",
1005
+ "h"
1006
+ ],
1007
+ [
1008
+ "Ġh",
1009
+ "is"
1010
+ ],
1011
+ [
1012
+ "Ġu",
1013
+ "p"
1014
+ ],
1015
+ [
1016
+ "Ġ",
1017
+ "ĊĊ"
1018
+ ],
1019
+ [
1020
+ "Ġd",
1021
+ "ay"
1022
+ ],
1023
+ [
1024
+ "Ġw",
1025
+ "ith"
1026
+ ],
1027
+ [
1028
+ "Ġp",
1029
+ "l"
1030
+ ],
1031
+ [
1032
+ "Ġy",
1033
+ "ou"
1034
+ ],
1035
+ [
1036
+ "it",
1037
+ "t"
1038
+ ],
1039
+ [
1040
+ "ou",
1041
+ "ld"
1042
+ ],
1043
+ [
1044
+ "e",
1045
+ "l"
1046
+ ],
1047
+ [
1048
+ "t",
1049
+ "ed"
1050
+ ],
1051
+ [
1052
+ "en",
1053
+ "t"
1054
+ ],
1055
+ [
1056
+ "a",
1057
+ "d"
1058
+ ],
1059
+ [
1060
+ "Ġha",
1061
+ "d"
1062
+ ],
1063
+ [
1064
+ "ou",
1065
+ "nd"
1066
+ ],
1067
+ [
1068
+ "a",
1069
+ "l"
1070
+ ],
1071
+ [
1072
+ "Ġ",
1073
+ "J"
1074
+ ],
1075
+ [
1076
+ "Ġw",
1077
+ "e"
1078
+ ],
1079
+ [
1080
+ "he",
1081
+ "r"
1082
+ ],
1083
+ [
1084
+ "itt",
1085
+ "le"
1086
+ ],
1087
+ [
1088
+ "'",
1089
+ "s"
1090
+ ],
1091
+ [
1092
+ "Ġs",
1093
+ "m"
1094
+ ],
1095
+ [
1096
+ "Ġpl",
1097
+ "ay"
1098
+ ],
1099
+ [
1100
+ "e",
1101
+ "nd"
1102
+ ],
1103
+ [
1104
+ "Ġthe",
1105
+ "y"
1106
+ ],
1107
+ [
1108
+ "a",
1109
+ "ck"
1110
+ ],
1111
+ [
1112
+ "Ġthe",
1113
+ "re"
1114
+ ],
1115
+ [
1116
+ "im",
1117
+ "e"
1118
+ ],
1119
+ [
1120
+ "l",
1121
+ "y"
1122
+ ],
1123
+ [
1124
+ "Ġs",
1125
+ "h"
1126
+ ],
1127
+ [
1128
+ "Ġl",
1129
+ "ittle"
1130
+ ],
1131
+ [
1132
+ "Ġ",
1133
+ "re"
1134
+ ],
1135
+ [
1136
+ "Ġ",
1137
+ "ne"
1138
+ ],
1139
+ [
1140
+ "Ġt",
1141
+ "ime"
1142
+ ],
1143
+ [
1144
+ "ou",
1145
+ "t"
1146
+ ],
1147
+ [
1148
+ "Ġf",
1149
+ "or"
1150
+ ],
1151
+ [
1152
+ "u",
1153
+ "n"
1154
+ ],
1155
+ [
1156
+ "c",
1157
+ "h"
1158
+ ],
1159
+ [
1160
+ "s",
1161
+ "e"
1162
+ ],
1163
+ [
1164
+ "Ġha",
1165
+ "pp"
1166
+ ],
1167
+ [
1168
+ "Ġw",
1169
+ "h"
1170
+ ],
1171
+ [
1172
+ "m",
1173
+ "y"
1174
+ ],
1175
+ [
1176
+ "om",
1177
+ "e"
1178
+ ],
1179
+ [
1180
+ "h",
1181
+ "t"
1182
+ ],
1183
+ [
1184
+ "u",
1185
+ "m"
1186
+ ],
1187
+ [
1188
+ "Ġf",
1189
+ "ri"
1190
+ ],
1191
+ [
1192
+ "Ġa",
1193
+ "s"
1194
+ ],
1195
+ [
1196
+ "Ġfri",
1197
+ "end"
1198
+ ],
1199
+ [
1200
+ "Ġ",
1201
+ "very"
1202
+ ],
1203
+ [
1204
+ "a",
1205
+ "ll"
1206
+ ],
1207
+ [
1208
+ "t",
1209
+ "er"
1210
+ ],
1211
+ [
1212
+ "Ã",
1213
+ "¢"
1214
+ ],
1215
+ [
1216
+ "â",
1217
+ "Ĥ"
1218
+ ],
1219
+ [
1220
+ "âĤ",
1221
+ "¬"
1222
+ ],
1223
+ [
1224
+ "O",
1225
+ "n"
1226
+ ],
1227
+ [
1228
+ "Ġ",
1229
+ "k"
1230
+ ],
1231
+ [
1232
+ "v",
1233
+ "ed"
1234
+ ],
1235
+ [
1236
+ "Ġ",
1237
+ "T"
1238
+ ],
1239
+ [
1240
+ "Ġo",
1241
+ "n"
1242
+ ],
1243
+ [
1244
+ "ir",
1245
+ "l"
1246
+ ],
1247
+ [
1248
+ "On",
1249
+ "ce"
1250
+ ],
1251
+ [
1252
+ "u",
1253
+ "g"
1254
+ ],
1255
+ [
1256
+ "\"",
1257
+ "ĊĊ"
1258
+ ],
1259
+ [
1260
+ "i",
1261
+ "ll"
1262
+ ],
1263
+ [
1264
+ "Ġg",
1265
+ "irl"
1266
+ ],
1267
+ [
1268
+ "Ġa",
1269
+ "n"
1270
+ ],
1271
+ [
1272
+ "e",
1273
+ "s"
1274
+ ],
1275
+ [
1276
+ "Ġe",
1277
+ "x"
1278
+ ],
1279
+ [
1280
+ "'",
1281
+ "t"
1282
+ ],
1283
+ [
1284
+ "e",
1285
+ "c"
1286
+ ],
1287
+ [
1288
+ "Ġb",
1289
+ "ut"
1290
+ ],
1291
+ [
1292
+ "Ġl",
1293
+ "oo"
1294
+ ],
1295
+ [
1296
+ "Ġl",
1297
+ "i"
1298
+ ],
1299
+ [
1300
+ "Ġb",
1301
+ "o"
1302
+ ],
1303
+ [
1304
+ "Ġwe",
1305
+ "re"
1306
+ ],
1307
+ [
1308
+ "O",
1309
+ "ne"
1310
+ ],
1311
+ [
1312
+ "Ġwa",
1313
+ "n"
1314
+ ],
1315
+ [
1316
+ "Ġhapp",
1317
+ "y"
1318
+ ],
1319
+ [
1320
+ "a",
1321
+ "ke"
1322
+ ],
1323
+ [
1324
+ "o",
1325
+ "re"
1326
+ ],
1327
+ [
1328
+ "Ġb",
1329
+ "ig"
1330
+ ],
1331
+ [
1332
+ "f",
1333
+ "u"
1334
+ ],
1335
+ [
1336
+ "Ġs",
1337
+ "p"
1338
+ ],
1339
+ [
1340
+ "id",
1341
+ "e"
1342
+ ],
1343
+ [
1344
+ "Ġsa",
1345
+ "w"
1346
+ ],
1347
+ [
1348
+ "Ġ",
1349
+ "B"
1350
+ ],
1351
+ [
1352
+ "h",
1353
+ "ing"
1354
+ ],
1355
+ [
1356
+ "Ġup",
1357
+ "on"
1358
+ ],
1359
+ [
1360
+ "ar",
1361
+ "d"
1362
+ ],
1363
+ [
1364
+ "Ġc",
1365
+ "ould"
1366
+ ],
1367
+ [
1368
+ "i",
1369
+ "c"
1370
+ ],
1371
+ [
1372
+ "Ġ",
1373
+ "out"
1374
+ ],
1375
+ [
1376
+ "il",
1377
+ "ed"
1378
+ ],
1379
+ [
1380
+ "o",
1381
+ "ne"
1382
+ ],
1383
+ [
1384
+ "r",
1385
+ "ound"
1386
+ ],
1387
+ [
1388
+ "r",
1389
+ "a"
1390
+ ],
1391
+ [
1392
+ "r",
1393
+ "y"
1394
+ ],
1395
+ [
1396
+ "Ġsm",
1397
+ "iled"
1398
+ ],
1399
+ [
1400
+ "Ġh",
1401
+ "im"
1402
+ ],
1403
+ [
1404
+ "Ġ",
1405
+ "A"
1406
+ ],
1407
+ [
1408
+ "Ġm",
1409
+ "om"
1410
+ ],
1411
+ [
1412
+ "he",
1413
+ "n"
1414
+ ],
1415
+ [
1416
+ "w",
1417
+ "ay"
1418
+ ],
1419
+ [
1420
+ "u",
1421
+ "r"
1422
+ ],
1423
+ [
1424
+ "ĠI",
1425
+ "t"
1426
+ ],
1427
+ [
1428
+ "fu",
1429
+ "l"
1430
+ ],
1431
+ [
1432
+ "a",
1433
+ "in"
1434
+ ],
1435
+ [
1436
+ "Ġw",
1437
+ "ent"
1438
+ ],
1439
+ [
1440
+ "Ġhe",
1441
+ "l"
1442
+ ],
1443
+ [
1444
+ "Ġn",
1445
+ "ot"
1446
+ ],
1447
+ [
1448
+ "ĠThe",
1449
+ "y"
1450
+ ],
1451
+ [
1452
+ "Ġwan",
1453
+ "ted"
1454
+ ],
1455
+ [
1456
+ "i",
1457
+ "nd"
1458
+ ],
1459
+ [
1460
+ "a",
1461
+ "re"
1462
+ ],
1463
+ [
1464
+ "e",
1465
+ "ar"
1466
+ ],
1467
+ [
1468
+ "Ġ",
1469
+ "M"
1470
+ ],
1471
+ [
1472
+ "Ġa",
1473
+ "ll"
1474
+ ],
1475
+ [
1476
+ "Ġfriend",
1477
+ "s"
1478
+ ],
1479
+ [
1480
+ "Ġto",
1481
+ "o"
1482
+ ],
1483
+ [
1484
+ "Ġg",
1485
+ "o"
1486
+ ],
1487
+ [
1488
+ "il",
1489
+ "y"
1490
+ ],
1491
+ [
1492
+ "am",
1493
+ "e"
1494
+ ],
1495
+ [
1496
+ "ĠT",
1497
+ "im"
1498
+ ],
1499
+ [
1500
+ "Ġhel",
1501
+ "p"
1502
+ ],
1503
+ [
1504
+ "om",
1505
+ "et"
1506
+ ],
1507
+ [
1508
+ "Ġ",
1509
+ "L"
1510
+ ],
1511
+ [
1512
+ "Ġl",
1513
+ "o"
1514
+ ],
1515
+ [
1516
+ "g",
1517
+ "ht"
1518
+ ],
1519
+ [
1520
+ "Ġs",
1521
+ "omet"
1522
+ ],
1523
+ [
1524
+ "Ġa",
1525
+ "t"
1526
+ ],
1527
+ [
1528
+ "Ġd",
1529
+ "o"
1530
+ ],
1531
+ [
1532
+ "Ġas",
1533
+ "ked"
1534
+ ],
1535
+ [
1536
+ "!",
1537
+ "\""
1538
+ ],
1539
+ [
1540
+ "Ġa",
1541
+ "round"
1542
+ ],
1543
+ [
1544
+ "Ġ",
1545
+ "j"
1546
+ ],
1547
+ [
1548
+ "re",
1549
+ "e"
1550
+ ],
1551
+ [
1552
+ "Ġloo",
1553
+ "ked"
1554
+ ],
1555
+ [
1556
+ "Ġsomet",
1557
+ "hing"
1558
+ ],
1559
+ [
1560
+ "Ġs",
1561
+ "e"
1562
+ ],
1563
+ [
1564
+ "Ġw",
1565
+ "or"
1566
+ ],
1567
+ [
1568
+ "d",
1569
+ "d"
1570
+ ],
1571
+ [
1572
+ "he",
1573
+ "d"
1574
+ ],
1575
+ [
1576
+ "oo",
1577
+ "d"
1578
+ ],
1579
+ [
1580
+ "Ġc",
1581
+ "l"
1582
+ ],
1583
+ [
1584
+ "am",
1585
+ "ed"
1586
+ ],
1587
+ [
1588
+ "r",
1589
+ "o"
1590
+ ],
1591
+ [
1592
+ "Ġc",
1593
+ "an"
1594
+ ],
1595
+ [
1596
+ "ar",
1597
+ "k"
1598
+ ],
1599
+ [
1600
+ "k",
1601
+ "ing"
1602
+ ],
1603
+ [
1604
+ "Ġ",
1605
+ "E"
1606
+ ],
1607
+ [
1608
+ "r",
1609
+ "om"
1610
+ ],
1611
+ [
1612
+ "Ġb",
1613
+ "ack"
1614
+ ],
1615
+ [
1616
+ "Ġex",
1617
+ "c"
1618
+ ],
1619
+ [
1620
+ "a",
1621
+ "b"
1622
+ ],
1623
+ [
1624
+ "i",
1625
+ "ck"
1626
+ ]
1627
+ ]
1628
+ }
1629
+ }
archive/runs_legacy_20260525/smoke/20260522-132219/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": "/Users/mandeepsidhu/Desktop/code/transformer/data/tinystories_train.txt",
4
+ "corpus_glob": null,
5
+ "text_column": "text",
6
+ "output_dir": "runs/smoke",
7
+ "seeds": [
8
+ 1,
9
+ 2,
10
+ 3,
11
+ 4,
12
+ 5
13
+ ],
14
+ "initial_tokens": 20000,
15
+ "stream_token_caps": [
16
+ 20000,
17
+ 40000,
18
+ 80000
19
+ ],
20
+ "val_tokens": 500000,
21
+ "allow_short_corpus": true,
22
+ "force_retokenize": false,
23
+ "vocab_size": 512,
24
+ "tokenizer_train_chars": 200000,
25
+ "block_size": 32,
26
+ "batch_size": 4,
27
+ "small_layers": 4,
28
+ "small_heads": 4,
29
+ "small_embd": 128,
30
+ "large_layers": 8,
31
+ "large_heads": 8,
32
+ "large_embd": 256,
33
+ "steps_per_run": 2,
34
+ "stream_steps_per_stage": 1,
35
+ "eval_batches": 1,
36
+ "lr": 0.0003,
37
+ "weight_decay": 0.1,
38
+ "grad_clip": 1.0,
39
+ "baseline_dropout": 0.1,
40
+ "high_dropout": 0.8,
41
+ "dropout_decay_tokens": null,
42
+ "dropout_schedule": "cosine"
43
+ },
44
+ "device": "mps",
45
+ "torch": "2.9.1",
46
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
47
+ "tokenizer_path": "runs/smoke/20260522-132219/tokenizer.json",
48
+ "encoded_path": "runs/smoke/20260522-132219/tokens-v512-uint16.npy",
49
+ "train_tokens": 777983,
50
+ "val_tokens": 86442,
51
+ "effective_initial_tokens": 20000,
52
+ "effective_stream_token_caps": [
53
+ 20000,
54
+ 40000,
55
+ 80000
56
+ ]
57
+ }
archive/runs_legacy_20260525/smoke/20260522-132219/metrics.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 1, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.231201171875, "eval_loss": 6.233233451843262, "elapsed_sec": 0.16947412490844727, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
2
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 1, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.222594738006592, "eval_loss": 6.22459602355957, "elapsed_sec": 0.14142203330993652, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
3
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 1, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.238912582397461, "eval_loss": 6.234484672546387, "elapsed_sec": 0.07789421081542969, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
4
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.240694999694824, "eval_loss": 6.236194610595703, "elapsed_sec": 0.04669904708862305, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
5
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.233834266662598, "eval_loss": 6.2317962646484375, "elapsed_sec": 0.03601408004760742, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
6
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231166839599609, "eval_loss": 6.221035957336426, "elapsed_sec": 0.03204011917114258, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
7
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 2, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.234216690063477, "eval_loss": 6.230881214141846, "elapsed_sec": 0.03834128379821777, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
8
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 2, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.228965759277344, "eval_loss": 6.218391418457031, "elapsed_sec": 0.07362222671508789, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
9
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 2, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.237312316894531, "eval_loss": 6.230140209197998, "elapsed_sec": 0.07616114616394043, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
10
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.23726749420166, "eval_loss": 6.232943534851074, "elapsed_sec": 0.04496598243713379, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
11
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.2372612953186035, "eval_loss": 6.226807594299316, "elapsed_sec": 0.03384113311767578, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
12
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.228798866271973, "eval_loss": 6.214606285095215, "elapsed_sec": 0.03626108169555664, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
13
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 3, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.236220359802246, "eval_loss": 6.231494903564453, "elapsed_sec": 0.04115915298461914, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
14
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 3, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.231418132781982, "eval_loss": 6.218476295471191, "elapsed_sec": 0.07224202156066895, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
15
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 3, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.236932754516602, "eval_loss": 6.230288505554199, "elapsed_sec": 0.06969714164733887, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
16
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.238128662109375, "eval_loss": 6.2329301834106445, "elapsed_sec": 0.04061102867126465, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
17
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.236502170562744, "eval_loss": 6.225698471069336, "elapsed_sec": 0.0318758487701416, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
18
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231315612792969, "eval_loss": 6.213197708129883, "elapsed_sec": 0.032286882400512695, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
19
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 4, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.235054016113281, "eval_loss": 6.234989166259766, "elapsed_sec": 0.0366358757019043, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
20
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 4, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.231119632720947, "eval_loss": 6.223240852355957, "elapsed_sec": 0.06865692138671875, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
21
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 4, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.237499237060547, "eval_loss": 6.234406471252441, "elapsed_sec": 0.07039403915405273, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
22
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.239429950714111, "eval_loss": 6.235345363616943, "elapsed_sec": 0.04240703582763672, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
23
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.23697566986084, "eval_loss": 6.229433536529541, "elapsed_sec": 0.0330810546875, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
24
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231019020080566, "eval_loss": 6.218218803405762, "elapsed_sec": 0.033483028411865234, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
25
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 5, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.236649513244629, "eval_loss": 6.2315874099731445, "elapsed_sec": 0.03715205192565918, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
26
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 5, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.1, "train_loss_last": 6.230384826660156, "eval_loss": 6.221922874450684, "elapsed_sec": 0.06780505180358887, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
27
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 5, "stage": null, "token_limit": 20000, "steps": 2, "tokens_seen": 256, "dropout": 0.799995578426538, "train_loss_last": 6.236981391906738, "eval_loss": 6.234584331512451, "elapsed_sec": 0.06941580772399902, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
28
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.239423751831055, "eval_loss": 6.236317157745361, "elapsed_sec": 0.04212617874145508, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
29
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.234818458557129, "eval_loss": 6.230718612670898, "elapsed_sec": 0.03323984146118164, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
30
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.232948303222656, "eval_loss": 6.2211151123046875, "elapsed_sec": 0.033397674560546875, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
archive/runs_legacy_20260525/smoke/20260522-132219/summary.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase1_baseline_comparison",
4
+ "condition": "large_static",
5
+ "stage": null,
6
+ "token_limit": 20000,
7
+ "parameters": 6553600,
8
+ "n": 5,
9
+ "mean_eval_loss": 6.221325492858886,
10
+ "std_eval_loss": 0.0028039506140008267
11
+ },
12
+ {
13
+ "phase": "phase1_baseline_comparison",
14
+ "condition": "small_static",
15
+ "stage": null,
16
+ "token_limit": 20000,
17
+ "parameters": 917504,
18
+ "n": 5,
19
+ "mean_eval_loss": 6.232437229156494,
20
+ "std_eval_loss": 0.001671653854539933
21
+ },
22
+ {
23
+ "phase": "phase2_fixed_data_dropout_optimization",
24
+ "condition": "large_dropout_decay_fixed_data",
25
+ "stage": null,
26
+ "token_limit": 20000,
27
+ "parameters": 6553600,
28
+ "n": 5,
29
+ "mean_eval_loss": 6.232780838012696,
30
+ "std_eval_loss": 0.0023442997873944736
31
+ },
32
+ {
33
+ "phase": "phase3_simulated_streaming_scale_up",
34
+ "condition": "large_dropout_decay_streaming",
35
+ "stage": 0,
36
+ "token_limit": 20000,
37
+ "parameters": 6553600,
38
+ "n": 5,
39
+ "mean_eval_loss": 6.2347461700439455,
40
+ "std_eval_loss": 0.0016935404053936482
41
+ },
42
+ {
43
+ "phase": "phase3_simulated_streaming_scale_up",
44
+ "condition": "large_dropout_decay_streaming",
45
+ "stage": 1,
46
+ "token_limit": 40000,
47
+ "parameters": 6553600,
48
+ "n": 5,
49
+ "mean_eval_loss": 6.228890895843506,
50
+ "std_eval_loss": 0.0025791421476287733
51
+ },
52
+ {
53
+ "phase": "phase3_simulated_streaming_scale_up",
54
+ "condition": "large_dropout_decay_streaming",
55
+ "stage": 2,
56
+ "token_limit": 80000,
57
+ "parameters": 6553600,
58
+ "n": 5,
59
+ "mean_eval_loss": 6.217634773254394,
60
+ "std_eval_loss": 0.0036359727629622267
61
+ }
62
+ ]
archive/runs_legacy_20260525/smoke/20260522-132219/tokenizer.json ADDED
@@ -0,0 +1,1629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|bos|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<|user_start|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|user_end|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<|assistant_start|>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<|assistant_end|>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<|python_start|>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<|python_end|>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<|output_start|>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<|output_end|>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ }
87
+ ],
88
+ "normalizer": null,
89
+ "pre_tokenizer": {
90
+ "type": "Sequence",
91
+ "pretokenizers": [
92
+ {
93
+ "type": "Split",
94
+ "pattern": {
95
+ "Regex": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,2}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
96
+ },
97
+ "behavior": "Isolated",
98
+ "invert": false
99
+ },
100
+ {
101
+ "type": "ByteLevel",
102
+ "add_prefix_space": false,
103
+ "trim_offsets": true,
104
+ "use_regex": false
105
+ }
106
+ ]
107
+ },
108
+ "post_processor": null,
109
+ "decoder": {
110
+ "type": "ByteLevel",
111
+ "add_prefix_space": true,
112
+ "trim_offsets": true,
113
+ "use_regex": true
114
+ },
115
+ "model": {
116
+ "type": "BPE",
117
+ "dropout": null,
118
+ "unk_token": null,
119
+ "continuing_subword_prefix": null,
120
+ "end_of_word_suffix": null,
121
+ "fuse_unk": false,
122
+ "byte_fallback": true,
123
+ "ignore_merges": false,
124
+ "vocab": {
125
+ "<|bos|>": 0,
126
+ "<|user_start|>": 1,
127
+ "<|user_end|>": 2,
128
+ "<|assistant_start|>": 3,
129
+ "<|assistant_end|>": 4,
130
+ "<|python_start|>": 5,
131
+ "<|python_end|>": 6,
132
+ "<|output_start|>": 7,
133
+ "<|output_end|>": 8,
134
+ "!": 9,
135
+ "\"": 10,
136
+ "#": 11,
137
+ "$": 12,
138
+ "%": 13,
139
+ "&": 14,
140
+ "'": 15,
141
+ "(": 16,
142
+ ")": 17,
143
+ "*": 18,
144
+ "+": 19,
145
+ ",": 20,
146
+ "-": 21,
147
+ ".": 22,
148
+ "/": 23,
149
+ "0": 24,
150
+ "1": 25,
151
+ "2": 26,
152
+ "3": 27,
153
+ "4": 28,
154
+ "5": 29,
155
+ "6": 30,
156
+ "7": 31,
157
+ "8": 32,
158
+ "9": 33,
159
+ ":": 34,
160
+ ";": 35,
161
+ "<": 36,
162
+ "=": 37,
163
+ ">": 38,
164
+ "?": 39,
165
+ "@": 40,
166
+ "A": 41,
167
+ "B": 42,
168
+ "C": 43,
169
+ "D": 44,
170
+ "E": 45,
171
+ "F": 46,
172
+ "G": 47,
173
+ "H": 48,
174
+ "I": 49,
175
+ "J": 50,
176
+ "K": 51,
177
+ "L": 52,
178
+ "M": 53,
179
+ "N": 54,
180
+ "O": 55,
181
+ "P": 56,
182
+ "Q": 57,
183
+ "R": 58,
184
+ "S": 59,
185
+ "T": 60,
186
+ "U": 61,
187
+ "V": 62,
188
+ "W": 63,
189
+ "X": 64,
190
+ "Y": 65,
191
+ "Z": 66,
192
+ "[": 67,
193
+ "\\": 68,
194
+ "]": 69,
195
+ "^": 70,
196
+ "_": 71,
197
+ "`": 72,
198
+ "a": 73,
199
+ "b": 74,
200
+ "c": 75,
201
+ "d": 76,
202
+ "e": 77,
203
+ "f": 78,
204
+ "g": 79,
205
+ "h": 80,
206
+ "i": 81,
207
+ "j": 82,
208
+ "k": 83,
209
+ "l": 84,
210
+ "m": 85,
211
+ "n": 86,
212
+ "o": 87,
213
+ "p": 88,
214
+ "q": 89,
215
+ "r": 90,
216
+ "s": 91,
217
+ "t": 92,
218
+ "u": 93,
219
+ "v": 94,
220
+ "w": 95,
221
+ "x": 96,
222
+ "y": 97,
223
+ "z": 98,
224
+ "{": 99,
225
+ "|": 100,
226
+ "}": 101,
227
+ "~": 102,
228
+ "¡": 103,
229
+ "¢": 104,
230
+ "£": 105,
231
+ "¤": 106,
232
+ "¥": 107,
233
+ "¦": 108,
234
+ "§": 109,
235
+ "¨": 110,
236
+ "©": 111,
237
+ "ª": 112,
238
+ "«": 113,
239
+ "¬": 114,
240
+ "®": 115,
241
+ "¯": 116,
242
+ "°": 117,
243
+ "±": 118,
244
+ "²": 119,
245
+ "³": 120,
246
+ "´": 121,
247
+ "µ": 122,
248
+ "¶": 123,
249
+ "·": 124,
250
+ "¸": 125,
251
+ "¹": 126,
252
+ "º": 127,
253
+ "»": 128,
254
+ "¼": 129,
255
+ "½": 130,
256
+ "¾": 131,
257
+ "¿": 132,
258
+ "À": 133,
259
+ "Á": 134,
260
+ "Â": 135,
261
+ "Ã": 136,
262
+ "Ä": 137,
263
+ "Å": 138,
264
+ "Æ": 139,
265
+ "Ç": 140,
266
+ "È": 141,
267
+ "É": 142,
268
+ "Ê": 143,
269
+ "Ë": 144,
270
+ "Ì": 145,
271
+ "Í": 146,
272
+ "Î": 147,
273
+ "Ï": 148,
274
+ "Ð": 149,
275
+ "Ñ": 150,
276
+ "Ò": 151,
277
+ "Ó": 152,
278
+ "Ô": 153,
279
+ "Õ": 154,
280
+ "Ö": 155,
281
+ "×": 156,
282
+ "Ø": 157,
283
+ "Ù": 158,
284
+ "Ú": 159,
285
+ "Û": 160,
286
+ "Ü": 161,
287
+ "Ý": 162,
288
+ "Þ": 163,
289
+ "ß": 164,
290
+ "à": 165,
291
+ "á": 166,
292
+ "â": 167,
293
+ "ã": 168,
294
+ "ä": 169,
295
+ "å": 170,
296
+ "æ": 171,
297
+ "ç": 172,
298
+ "è": 173,
299
+ "é": 174,
300
+ "ê": 175,
301
+ "ë": 176,
302
+ "ì": 177,
303
+ "í": 178,
304
+ "î": 179,
305
+ "ï": 180,
306
+ "ð": 181,
307
+ "ñ": 182,
308
+ "ò": 183,
309
+ "ó": 184,
310
+ "ô": 185,
311
+ "õ": 186,
312
+ "ö": 187,
313
+ "÷": 188,
314
+ "ø": 189,
315
+ "ù": 190,
316
+ "ú": 191,
317
+ "û": 192,
318
+ "ü": 193,
319
+ "ý": 194,
320
+ "þ": 195,
321
+ "ÿ": 196,
322
+ "Ā": 197,
323
+ "ā": 198,
324
+ "Ă": 199,
325
+ "ă": 200,
326
+ "Ą": 201,
327
+ "ą": 202,
328
+ "Ć": 203,
329
+ "ć": 204,
330
+ "Ĉ": 205,
331
+ "ĉ": 206,
332
+ "Ċ": 207,
333
+ "ċ": 208,
334
+ "Č": 209,
335
+ "č": 210,
336
+ "Ď": 211,
337
+ "ď": 212,
338
+ "Đ": 213,
339
+ "đ": 214,
340
+ "Ē": 215,
341
+ "ē": 216,
342
+ "Ĕ": 217,
343
+ "ĕ": 218,
344
+ "Ė": 219,
345
+ "ė": 220,
346
+ "Ę": 221,
347
+ "ę": 222,
348
+ "Ě": 223,
349
+ "ě": 224,
350
+ "Ĝ": 225,
351
+ "ĝ": 226,
352
+ "Ğ": 227,
353
+ "ğ": 228,
354
+ "Ġ": 229,
355
+ "ġ": 230,
356
+ "Ģ": 231,
357
+ "ģ": 232,
358
+ "Ĥ": 233,
359
+ "ĥ": 234,
360
+ "Ħ": 235,
361
+ "ħ": 236,
362
+ "Ĩ": 237,
363
+ "ĩ": 238,
364
+ "Ī": 239,
365
+ "ī": 240,
366
+ "Ĭ": 241,
367
+ "ĭ": 242,
368
+ "Į": 243,
369
+ "į": 244,
370
+ "İ": 245,
371
+ "ı": 246,
372
+ "IJ": 247,
373
+ "ij": 248,
374
+ "Ĵ": 249,
375
+ "ĵ": 250,
376
+ "Ķ": 251,
377
+ "ķ": 252,
378
+ "ĸ": 253,
379
+ "Ĺ": 254,
380
+ "ĺ": 255,
381
+ "Ļ": 256,
382
+ "ļ": 257,
383
+ "Ľ": 258,
384
+ "ľ": 259,
385
+ "Ŀ": 260,
386
+ "ŀ": 261,
387
+ "Ł": 262,
388
+ "ł": 263,
389
+ "Ń": 264,
390
+ "he": 265,
391
+ "Ġt": 266,
392
+ "Ġa": 267,
393
+ "Ġs": 268,
394
+ "Ġw": 269,
395
+ "Ġthe": 270,
396
+ "nd": 271,
397
+ "ed": 272,
398
+ "in": 273,
399
+ "Ġand": 274,
400
+ "Ġwa": 275,
401
+ "Ġb": 276,
402
+ "Ġto": 277,
403
+ "re": 278,
404
+ "Ġh": 279,
405
+ "ou": 280,
406
+ "it": 281,
407
+ "Ġf": 282,
408
+ "er": 283,
409
+ "ĊĊ": 284,
410
+ "Ġwas": 285,
411
+ "Ġl": 286,
412
+ "Ġc": 287,
413
+ "Ġhe": 288,
414
+ "Ġp": 289,
415
+ "ing": 290,
416
+ "Ġd": 291,
417
+ "Ġm": 292,
418
+ "Ġo": 293,
419
+ "Ġg": 294,
420
+ "ar": 295,
421
+ "is": 296,
422
+ "id": 297,
423
+ "ay": 298,
424
+ "om": 299,
425
+ "at": 300,
426
+ "ll": 301,
427
+ "en": 302,
428
+ "Ġsa": 303,
429
+ "ne": 304,
430
+ "The": 305,
431
+ ".ĊĊ": 306,
432
+ "le": 307,
433
+ "Ġth": 308,
434
+ "im": 309,
435
+ "an": 310,
436
+ "Ġha": 311,
437
+ "or": 312,
438
+ "Ġit": 313,
439
+ "et": 314,
440
+ "ver": 315,
441
+ "ld": 316,
442
+ "Ġin": 317,
443
+ "ĠS": 318,
444
+ "on": 319,
445
+ "Ġe": 320,
446
+ "ce": 321,
447
+ "Ġbe": 322,
448
+ "Ġher": 323,
449
+ "ir": 324,
450
+ "Ġ\"": 325,
451
+ "ĠH": 326,
452
+ "Ġu": 327,
453
+ "Ġsaid": 328,
454
+ "Ġn": 329,
455
+ "ck": 330,
456
+ "ow": 331,
457
+ "ri": 332,
458
+ "ĠThe": 333,
459
+ "Ġshe": 334,
460
+ "Ġso": 335,
461
+ "st": 336,
462
+ "Ġy": 337,
463
+ "ot": 338,
464
+ "ĠHe": 339,
465
+ "Ġof": 340,
466
+ "il": 341,
467
+ "Ġst": 342,
468
+ "ut": 343,
469
+ "ke": 344,
470
+ "am": 345,
471
+ "ked": 346,
472
+ "oo": 347,
473
+ "pp": 348,
474
+ "Ġr": 349,
475
+ "ĠShe": 350,
476
+ "very": 351,
477
+ "ĠI": 352,
478
+ "ve": 353,
479
+ "Ġthat": 354,
480
+ "ig": 355,
481
+ "ith": 356,
482
+ "Ġhis": 357,
483
+ "Ġup": 358,
484
+ "ĠĊĊ": 359,
485
+ "Ġday": 360,
486
+ "Ġwith": 361,
487
+ "Ġpl": 362,
488
+ "Ġyou": 363,
489
+ "itt": 364,
490
+ "ould": 365,
491
+ "el": 366,
492
+ "ted": 367,
493
+ "ent": 368,
494
+ "ad": 369,
495
+ "Ġhad": 370,
496
+ "ound": 371,
497
+ "al": 372,
498
+ "ĠJ": 373,
499
+ "Ġwe": 374,
500
+ "her": 375,
501
+ "ittle": 376,
502
+ "'s": 377,
503
+ "Ġsm": 378,
504
+ "Ġplay": 379,
505
+ "end": 380,
506
+ "Ġthey": 381,
507
+ "ack": 382,
508
+ "Ġthere": 383,
509
+ "ime": 384,
510
+ "ly": 385,
511
+ "Ġsh": 386,
512
+ "Ġlittle": 387,
513
+ "Ġre": 388,
514
+ "Ġne": 389,
515
+ "Ġtime": 390,
516
+ "out": 391,
517
+ "Ġfor": 392,
518
+ "un": 393,
519
+ "ch": 394,
520
+ "se": 395,
521
+ "Ġhapp": 396,
522
+ "Ġwh": 397,
523
+ "my": 398,
524
+ "ome": 399,
525
+ "ht": 400,
526
+ "um": 401,
527
+ "Ġfri": 402,
528
+ "Ġas": 403,
529
+ "Ġfriend": 404,
530
+ "Ġvery": 405,
531
+ "all": 406,
532
+ "ter": 407,
533
+ "â": 408,
534
+ "âĤ": 409,
535
+ "âĤ¬": 410,
536
+ "On": 411,
537
+ "Ġk": 412,
538
+ "ved": 413,
539
+ "ĠT": 414,
540
+ "Ġon": 415,
541
+ "irl": 416,
542
+ "Once": 417,
543
+ "ug": 418,
544
+ "\"ĊĊ": 419,
545
+ "ill": 420,
546
+ "Ġgirl": 421,
547
+ "Ġan": 422,
548
+ "es": 423,
549
+ "Ġex": 424,
550
+ "'t": 425,
551
+ "ec": 426,
552
+ "Ġbut": 427,
553
+ "Ġloo": 428,
554
+ "Ġli": 429,
555
+ "Ġbo": 430,
556
+ "Ġwere": 431,
557
+ "One": 432,
558
+ "Ġwan": 433,
559
+ "Ġhappy": 434,
560
+ "ake": 435,
561
+ "ore": 436,
562
+ "Ġbig": 437,
563
+ "fu": 438,
564
+ "Ġsp": 439,
565
+ "ide": 440,
566
+ "Ġsaw": 441,
567
+ "ĠB": 442,
568
+ "hing": 443,
569
+ "Ġupon": 444,
570
+ "ard": 445,
571
+ "Ġcould": 446,
572
+ "ic": 447,
573
+ "Ġout": 448,
574
+ "iled": 449,
575
+ "one": 450,
576
+ "round": 451,
577
+ "ra": 452,
578
+ "ry": 453,
579
+ "Ġsmiled": 454,
580
+ "Ġhim": 455,
581
+ "ĠA": 456,
582
+ "Ġmom": 457,
583
+ "hen": 458,
584
+ "way": 459,
585
+ "ur": 460,
586
+ "ĠIt": 461,
587
+ "ful": 462,
588
+ "ain": 463,
589
+ "Ġwent": 464,
590
+ "Ġhel": 465,
591
+ "Ġnot": 466,
592
+ "ĠThey": 467,
593
+ "Ġwanted": 468,
594
+ "ind": 469,
595
+ "are": 470,
596
+ "ear": 471,
597
+ "ĠM": 472,
598
+ "Ġall": 473,
599
+ "Ġfriends": 474,
600
+ "Ġtoo": 475,
601
+ "Ġgo": 476,
602
+ "ily": 477,
603
+ "ame": 478,
604
+ "ĠTim": 479,
605
+ "Ġhelp": 480,
606
+ "omet": 481,
607
+ "ĠL": 482,
608
+ "Ġlo": 483,
609
+ "ght": 484,
610
+ "Ġsomet": 485,
611
+ "Ġat": 486,
612
+ "Ġdo": 487,
613
+ "Ġasked": 488,
614
+ "!\"": 489,
615
+ "Ġaround": 490,
616
+ "Ġj": 491,
617
+ "ree": 492,
618
+ "Ġlooked": 493,
619
+ "Ġsomething": 494,
620
+ "Ġse": 495,
621
+ "Ġwor": 496,
622
+ "dd": 497,
623
+ "hed": 498,
624
+ "ood": 499,
625
+ "Ġcl": 500,
626
+ "amed": 501,
627
+ "ro": 502,
628
+ "Ġcan": 503,
629
+ "ark": 504,
630
+ "king": 505,
631
+ "ĠE": 506,
632
+ "rom": 507,
633
+ "Ġback": 508,
634
+ "Ġexc": 509,
635
+ "ab": 510,
636
+ "ick": 511
637
+ },
638
+ "merges": [
639
+ [
640
+ "h",
641
+ "e"
642
+ ],
643
+ [
644
+ "Ġ",
645
+ "t"
646
+ ],
647
+ [
648
+ "Ġ",
649
+ "a"
650
+ ],
651
+ [
652
+ "Ġ",
653
+ "s"
654
+ ],
655
+ [
656
+ "Ġ",
657
+ "w"
658
+ ],
659
+ [
660
+ "Ġt",
661
+ "he"
662
+ ],
663
+ [
664
+ "n",
665
+ "d"
666
+ ],
667
+ [
668
+ "e",
669
+ "d"
670
+ ],
671
+ [
672
+ "i",
673
+ "n"
674
+ ],
675
+ [
676
+ "Ġa",
677
+ "nd"
678
+ ],
679
+ [
680
+ "Ġw",
681
+ "a"
682
+ ],
683
+ [
684
+ "Ġ",
685
+ "b"
686
+ ],
687
+ [
688
+ "Ġt",
689
+ "o"
690
+ ],
691
+ [
692
+ "r",
693
+ "e"
694
+ ],
695
+ [
696
+ "Ġ",
697
+ "h"
698
+ ],
699
+ [
700
+ "o",
701
+ "u"
702
+ ],
703
+ [
704
+ "i",
705
+ "t"
706
+ ],
707
+ [
708
+ "Ġ",
709
+ "f"
710
+ ],
711
+ [
712
+ "e",
713
+ "r"
714
+ ],
715
+ [
716
+ "Ċ",
717
+ "Ċ"
718
+ ],
719
+ [
720
+ "Ġwa",
721
+ "s"
722
+ ],
723
+ [
724
+ "Ġ",
725
+ "l"
726
+ ],
727
+ [
728
+ "Ġ",
729
+ "c"
730
+ ],
731
+ [
732
+ "Ġ",
733
+ "he"
734
+ ],
735
+ [
736
+ "Ġ",
737
+ "p"
738
+ ],
739
+ [
740
+ "in",
741
+ "g"
742
+ ],
743
+ [
744
+ "Ġ",
745
+ "d"
746
+ ],
747
+ [
748
+ "Ġ",
749
+ "m"
750
+ ],
751
+ [
752
+ "Ġ",
753
+ "o"
754
+ ],
755
+ [
756
+ "Ġ",
757
+ "g"
758
+ ],
759
+ [
760
+ "a",
761
+ "r"
762
+ ],
763
+ [
764
+ "i",
765
+ "s"
766
+ ],
767
+ [
768
+ "i",
769
+ "d"
770
+ ],
771
+ [
772
+ "a",
773
+ "y"
774
+ ],
775
+ [
776
+ "o",
777
+ "m"
778
+ ],
779
+ [
780
+ "a",
781
+ "t"
782
+ ],
783
+ [
784
+ "l",
785
+ "l"
786
+ ],
787
+ [
788
+ "e",
789
+ "n"
790
+ ],
791
+ [
792
+ "Ġs",
793
+ "a"
794
+ ],
795
+ [
796
+ "n",
797
+ "e"
798
+ ],
799
+ [
800
+ "T",
801
+ "he"
802
+ ],
803
+ [
804
+ ".",
805
+ "ĊĊ"
806
+ ],
807
+ [
808
+ "l",
809
+ "e"
810
+ ],
811
+ [
812
+ "Ġt",
813
+ "h"
814
+ ],
815
+ [
816
+ "i",
817
+ "m"
818
+ ],
819
+ [
820
+ "a",
821
+ "n"
822
+ ],
823
+ [
824
+ "Ġh",
825
+ "a"
826
+ ],
827
+ [
828
+ "o",
829
+ "r"
830
+ ],
831
+ [
832
+ "Ġ",
833
+ "it"
834
+ ],
835
+ [
836
+ "e",
837
+ "t"
838
+ ],
839
+ [
840
+ "v",
841
+ "er"
842
+ ],
843
+ [
844
+ "l",
845
+ "d"
846
+ ],
847
+ [
848
+ "Ġ",
849
+ "in"
850
+ ],
851
+ [
852
+ "Ġ",
853
+ "S"
854
+ ],
855
+ [
856
+ "o",
857
+ "n"
858
+ ],
859
+ [
860
+ "Ġ",
861
+ "e"
862
+ ],
863
+ [
864
+ "c",
865
+ "e"
866
+ ],
867
+ [
868
+ "Ġb",
869
+ "e"
870
+ ],
871
+ [
872
+ "Ġhe",
873
+ "r"
874
+ ],
875
+ [
876
+ "i",
877
+ "r"
878
+ ],
879
+ [
880
+ "Ġ",
881
+ "\""
882
+ ],
883
+ [
884
+ "Ġ",
885
+ "H"
886
+ ],
887
+ [
888
+ "Ġ",
889
+ "u"
890
+ ],
891
+ [
892
+ "Ġsa",
893
+ "id"
894
+ ],
895
+ [
896
+ "Ġ",
897
+ "n"
898
+ ],
899
+ [
900
+ "c",
901
+ "k"
902
+ ],
903
+ [
904
+ "o",
905
+ "w"
906
+ ],
907
+ [
908
+ "r",
909
+ "i"
910
+ ],
911
+ [
912
+ "Ġ",
913
+ "The"
914
+ ],
915
+ [
916
+ "Ġs",
917
+ "he"
918
+ ],
919
+ [
920
+ "Ġs",
921
+ "o"
922
+ ],
923
+ [
924
+ "s",
925
+ "t"
926
+ ],
927
+ [
928
+ "Ġ",
929
+ "y"
930
+ ],
931
+ [
932
+ "o",
933
+ "t"
934
+ ],
935
+ [
936
+ "ĠH",
937
+ "e"
938
+ ],
939
+ [
940
+ "Ġo",
941
+ "f"
942
+ ],
943
+ [
944
+ "i",
945
+ "l"
946
+ ],
947
+ [
948
+ "Ġs",
949
+ "t"
950
+ ],
951
+ [
952
+ "u",
953
+ "t"
954
+ ],
955
+ [
956
+ "k",
957
+ "e"
958
+ ],
959
+ [
960
+ "a",
961
+ "m"
962
+ ],
963
+ [
964
+ "k",
965
+ "ed"
966
+ ],
967
+ [
968
+ "o",
969
+ "o"
970
+ ],
971
+ [
972
+ "p",
973
+ "p"
974
+ ],
975
+ [
976
+ "Ġ",
977
+ "r"
978
+ ],
979
+ [
980
+ "ĠS",
981
+ "he"
982
+ ],
983
+ [
984
+ "ver",
985
+ "y"
986
+ ],
987
+ [
988
+ "Ġ",
989
+ "I"
990
+ ],
991
+ [
992
+ "v",
993
+ "e"
994
+ ],
995
+ [
996
+ "Ġth",
997
+ "at"
998
+ ],
999
+ [
1000
+ "i",
1001
+ "g"
1002
+ ],
1003
+ [
1004
+ "it",
1005
+ "h"
1006
+ ],
1007
+ [
1008
+ "Ġh",
1009
+ "is"
1010
+ ],
1011
+ [
1012
+ "Ġu",
1013
+ "p"
1014
+ ],
1015
+ [
1016
+ "Ġ",
1017
+ "ĊĊ"
1018
+ ],
1019
+ [
1020
+ "Ġd",
1021
+ "ay"
1022
+ ],
1023
+ [
1024
+ "Ġw",
1025
+ "ith"
1026
+ ],
1027
+ [
1028
+ "Ġp",
1029
+ "l"
1030
+ ],
1031
+ [
1032
+ "Ġy",
1033
+ "ou"
1034
+ ],
1035
+ [
1036
+ "it",
1037
+ "t"
1038
+ ],
1039
+ [
1040
+ "ou",
1041
+ "ld"
1042
+ ],
1043
+ [
1044
+ "e",
1045
+ "l"
1046
+ ],
1047
+ [
1048
+ "t",
1049
+ "ed"
1050
+ ],
1051
+ [
1052
+ "en",
1053
+ "t"
1054
+ ],
1055
+ [
1056
+ "a",
1057
+ "d"
1058
+ ],
1059
+ [
1060
+ "Ġha",
1061
+ "d"
1062
+ ],
1063
+ [
1064
+ "ou",
1065
+ "nd"
1066
+ ],
1067
+ [
1068
+ "a",
1069
+ "l"
1070
+ ],
1071
+ [
1072
+ "Ġ",
1073
+ "J"
1074
+ ],
1075
+ [
1076
+ "Ġw",
1077
+ "e"
1078
+ ],
1079
+ [
1080
+ "he",
1081
+ "r"
1082
+ ],
1083
+ [
1084
+ "itt",
1085
+ "le"
1086
+ ],
1087
+ [
1088
+ "'",
1089
+ "s"
1090
+ ],
1091
+ [
1092
+ "Ġs",
1093
+ "m"
1094
+ ],
1095
+ [
1096
+ "Ġpl",
1097
+ "ay"
1098
+ ],
1099
+ [
1100
+ "e",
1101
+ "nd"
1102
+ ],
1103
+ [
1104
+ "Ġthe",
1105
+ "y"
1106
+ ],
1107
+ [
1108
+ "a",
1109
+ "ck"
1110
+ ],
1111
+ [
1112
+ "Ġthe",
1113
+ "re"
1114
+ ],
1115
+ [
1116
+ "im",
1117
+ "e"
1118
+ ],
1119
+ [
1120
+ "l",
1121
+ "y"
1122
+ ],
1123
+ [
1124
+ "Ġs",
1125
+ "h"
1126
+ ],
1127
+ [
1128
+ "Ġl",
1129
+ "ittle"
1130
+ ],
1131
+ [
1132
+ "Ġ",
1133
+ "re"
1134
+ ],
1135
+ [
1136
+ "Ġ",
1137
+ "ne"
1138
+ ],
1139
+ [
1140
+ "Ġt",
1141
+ "ime"
1142
+ ],
1143
+ [
1144
+ "ou",
1145
+ "t"
1146
+ ],
1147
+ [
1148
+ "Ġf",
1149
+ "or"
1150
+ ],
1151
+ [
1152
+ "u",
1153
+ "n"
1154
+ ],
1155
+ [
1156
+ "c",
1157
+ "h"
1158
+ ],
1159
+ [
1160
+ "s",
1161
+ "e"
1162
+ ],
1163
+ [
1164
+ "Ġha",
1165
+ "pp"
1166
+ ],
1167
+ [
1168
+ "Ġw",
1169
+ "h"
1170
+ ],
1171
+ [
1172
+ "m",
1173
+ "y"
1174
+ ],
1175
+ [
1176
+ "om",
1177
+ "e"
1178
+ ],
1179
+ [
1180
+ "h",
1181
+ "t"
1182
+ ],
1183
+ [
1184
+ "u",
1185
+ "m"
1186
+ ],
1187
+ [
1188
+ "Ġf",
1189
+ "ri"
1190
+ ],
1191
+ [
1192
+ "Ġa",
1193
+ "s"
1194
+ ],
1195
+ [
1196
+ "Ġfri",
1197
+ "end"
1198
+ ],
1199
+ [
1200
+ "Ġ",
1201
+ "very"
1202
+ ],
1203
+ [
1204
+ "a",
1205
+ "ll"
1206
+ ],
1207
+ [
1208
+ "t",
1209
+ "er"
1210
+ ],
1211
+ [
1212
+ "Ã",
1213
+ "¢"
1214
+ ],
1215
+ [
1216
+ "â",
1217
+ "Ĥ"
1218
+ ],
1219
+ [
1220
+ "âĤ",
1221
+ "¬"
1222
+ ],
1223
+ [
1224
+ "O",
1225
+ "n"
1226
+ ],
1227
+ [
1228
+ "Ġ",
1229
+ "k"
1230
+ ],
1231
+ [
1232
+ "v",
1233
+ "ed"
1234
+ ],
1235
+ [
1236
+ "Ġ",
1237
+ "T"
1238
+ ],
1239
+ [
1240
+ "Ġo",
1241
+ "n"
1242
+ ],
1243
+ [
1244
+ "ir",
1245
+ "l"
1246
+ ],
1247
+ [
1248
+ "On",
1249
+ "ce"
1250
+ ],
1251
+ [
1252
+ "u",
1253
+ "g"
1254
+ ],
1255
+ [
1256
+ "\"",
1257
+ "ĊĊ"
1258
+ ],
1259
+ [
1260
+ "i",
1261
+ "ll"
1262
+ ],
1263
+ [
1264
+ "Ġg",
1265
+ "irl"
1266
+ ],
1267
+ [
1268
+ "Ġa",
1269
+ "n"
1270
+ ],
1271
+ [
1272
+ "e",
1273
+ "s"
1274
+ ],
1275
+ [
1276
+ "Ġe",
1277
+ "x"
1278
+ ],
1279
+ [
1280
+ "'",
1281
+ "t"
1282
+ ],
1283
+ [
1284
+ "e",
1285
+ "c"
1286
+ ],
1287
+ [
1288
+ "Ġb",
1289
+ "ut"
1290
+ ],
1291
+ [
1292
+ "Ġl",
1293
+ "oo"
1294
+ ],
1295
+ [
1296
+ "Ġl",
1297
+ "i"
1298
+ ],
1299
+ [
1300
+ "Ġb",
1301
+ "o"
1302
+ ],
1303
+ [
1304
+ "Ġwe",
1305
+ "re"
1306
+ ],
1307
+ [
1308
+ "O",
1309
+ "ne"
1310
+ ],
1311
+ [
1312
+ "Ġwa",
1313
+ "n"
1314
+ ],
1315
+ [
1316
+ "Ġhapp",
1317
+ "y"
1318
+ ],
1319
+ [
1320
+ "a",
1321
+ "ke"
1322
+ ],
1323
+ [
1324
+ "o",
1325
+ "re"
1326
+ ],
1327
+ [
1328
+ "Ġb",
1329
+ "ig"
1330
+ ],
1331
+ [
1332
+ "f",
1333
+ "u"
1334
+ ],
1335
+ [
1336
+ "Ġs",
1337
+ "p"
1338
+ ],
1339
+ [
1340
+ "id",
1341
+ "e"
1342
+ ],
1343
+ [
1344
+ "Ġsa",
1345
+ "w"
1346
+ ],
1347
+ [
1348
+ "Ġ",
1349
+ "B"
1350
+ ],
1351
+ [
1352
+ "h",
1353
+ "ing"
1354
+ ],
1355
+ [
1356
+ "Ġup",
1357
+ "on"
1358
+ ],
1359
+ [
1360
+ "ar",
1361
+ "d"
1362
+ ],
1363
+ [
1364
+ "Ġc",
1365
+ "ould"
1366
+ ],
1367
+ [
1368
+ "i",
1369
+ "c"
1370
+ ],
1371
+ [
1372
+ "Ġ",
1373
+ "out"
1374
+ ],
1375
+ [
1376
+ "il",
1377
+ "ed"
1378
+ ],
1379
+ [
1380
+ "o",
1381
+ "ne"
1382
+ ],
1383
+ [
1384
+ "r",
1385
+ "ound"
1386
+ ],
1387
+ [
1388
+ "r",
1389
+ "a"
1390
+ ],
1391
+ [
1392
+ "r",
1393
+ "y"
1394
+ ],
1395
+ [
1396
+ "Ġsm",
1397
+ "iled"
1398
+ ],
1399
+ [
1400
+ "Ġh",
1401
+ "im"
1402
+ ],
1403
+ [
1404
+ "Ġ",
1405
+ "A"
1406
+ ],
1407
+ [
1408
+ "Ġm",
1409
+ "om"
1410
+ ],
1411
+ [
1412
+ "he",
1413
+ "n"
1414
+ ],
1415
+ [
1416
+ "w",
1417
+ "ay"
1418
+ ],
1419
+ [
1420
+ "u",
1421
+ "r"
1422
+ ],
1423
+ [
1424
+ "ĠI",
1425
+ "t"
1426
+ ],
1427
+ [
1428
+ "fu",
1429
+ "l"
1430
+ ],
1431
+ [
1432
+ "a",
1433
+ "in"
1434
+ ],
1435
+ [
1436
+ "Ġw",
1437
+ "ent"
1438
+ ],
1439
+ [
1440
+ "Ġhe",
1441
+ "l"
1442
+ ],
1443
+ [
1444
+ "Ġn",
1445
+ "ot"
1446
+ ],
1447
+ [
1448
+ "ĠThe",
1449
+ "y"
1450
+ ],
1451
+ [
1452
+ "Ġwan",
1453
+ "ted"
1454
+ ],
1455
+ [
1456
+ "i",
1457
+ "nd"
1458
+ ],
1459
+ [
1460
+ "a",
1461
+ "re"
1462
+ ],
1463
+ [
1464
+ "e",
1465
+ "ar"
1466
+ ],
1467
+ [
1468
+ "Ġ",
1469
+ "M"
1470
+ ],
1471
+ [
1472
+ "Ġa",
1473
+ "ll"
1474
+ ],
1475
+ [
1476
+ "Ġfriend",
1477
+ "s"
1478
+ ],
1479
+ [
1480
+ "Ġto",
1481
+ "o"
1482
+ ],
1483
+ [
1484
+ "Ġg",
1485
+ "o"
1486
+ ],
1487
+ [
1488
+ "il",
1489
+ "y"
1490
+ ],
1491
+ [
1492
+ "am",
1493
+ "e"
1494
+ ],
1495
+ [
1496
+ "ĠT",
1497
+ "im"
1498
+ ],
1499
+ [
1500
+ "Ġhel",
1501
+ "p"
1502
+ ],
1503
+ [
1504
+ "om",
1505
+ "et"
1506
+ ],
1507
+ [
1508
+ "Ġ",
1509
+ "L"
1510
+ ],
1511
+ [
1512
+ "Ġl",
1513
+ "o"
1514
+ ],
1515
+ [
1516
+ "g",
1517
+ "ht"
1518
+ ],
1519
+ [
1520
+ "Ġs",
1521
+ "omet"
1522
+ ],
1523
+ [
1524
+ "Ġa",
1525
+ "t"
1526
+ ],
1527
+ [
1528
+ "Ġd",
1529
+ "o"
1530
+ ],
1531
+ [
1532
+ "Ġas",
1533
+ "ked"
1534
+ ],
1535
+ [
1536
+ "!",
1537
+ "\""
1538
+ ],
1539
+ [
1540
+ "Ġa",
1541
+ "round"
1542
+ ],
1543
+ [
1544
+ "Ġ",
1545
+ "j"
1546
+ ],
1547
+ [
1548
+ "re",
1549
+ "e"
1550
+ ],
1551
+ [
1552
+ "Ġloo",
1553
+ "ked"
1554
+ ],
1555
+ [
1556
+ "Ġsomet",
1557
+ "hing"
1558
+ ],
1559
+ [
1560
+ "Ġs",
1561
+ "e"
1562
+ ],
1563
+ [
1564
+ "Ġw",
1565
+ "or"
1566
+ ],
1567
+ [
1568
+ "d",
1569
+ "d"
1570
+ ],
1571
+ [
1572
+ "he",
1573
+ "d"
1574
+ ],
1575
+ [
1576
+ "oo",
1577
+ "d"
1578
+ ],
1579
+ [
1580
+ "Ġc",
1581
+ "l"
1582
+ ],
1583
+ [
1584
+ "am",
1585
+ "ed"
1586
+ ],
1587
+ [
1588
+ "r",
1589
+ "o"
1590
+ ],
1591
+ [
1592
+ "Ġc",
1593
+ "an"
1594
+ ],
1595
+ [
1596
+ "ar",
1597
+ "k"
1598
+ ],
1599
+ [
1600
+ "k",
1601
+ "ing"
1602
+ ],
1603
+ [
1604
+ "Ġ",
1605
+ "E"
1606
+ ],
1607
+ [
1608
+ "r",
1609
+ "om"
1610
+ ],
1611
+ [
1612
+ "Ġb",
1613
+ "ack"
1614
+ ],
1615
+ [
1616
+ "Ġex",
1617
+ "c"
1618
+ ],
1619
+ [
1620
+ "a",
1621
+ "b"
1622
+ ],
1623
+ [
1624
+ "i",
1625
+ "ck"
1626
+ ]
1627
+ ]
1628
+ }
1629
+ }
archive/runs_legacy_20260525/smoke/20260522-132336/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": "/Users/mandeepsidhu/Desktop/code/transformer/data/tinystories_train.txt",
4
+ "corpus_glob": null,
5
+ "text_column": "text",
6
+ "output_dir": "runs/smoke",
7
+ "seeds": [
8
+ 1,
9
+ 2,
10
+ 3,
11
+ 4,
12
+ 5
13
+ ],
14
+ "initial_tokens": 20000,
15
+ "stream_token_caps": [
16
+ 20000,
17
+ 40000,
18
+ 80000
19
+ ],
20
+ "val_tokens": 500000,
21
+ "allow_short_corpus": true,
22
+ "force_retokenize": false,
23
+ "vocab_size": 512,
24
+ "tokenizer_train_chars": 200000,
25
+ "block_size": 32,
26
+ "batch_size": 4,
27
+ "small_layers": 4,
28
+ "small_heads": 4,
29
+ "small_embd": 128,
30
+ "large_layers": 8,
31
+ "large_heads": 8,
32
+ "large_embd": 256,
33
+ "steps_per_run": 1,
34
+ "stream_steps_per_stage": 1,
35
+ "eval_batches": 1,
36
+ "log_every": 0,
37
+ "lr": 0.0003,
38
+ "weight_decay": 0.1,
39
+ "grad_clip": 1.0,
40
+ "baseline_dropout": 0.1,
41
+ "high_dropout": 0.8,
42
+ "dropout_decay_tokens": null,
43
+ "dropout_schedule": "cosine"
44
+ },
45
+ "device": "mps",
46
+ "torch": "2.9.1",
47
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
48
+ "tokenizer_path": "runs/smoke/20260522-132336/tokenizer.json",
49
+ "encoded_path": "runs/smoke/20260522-132336/tokens-v512-uint16.npy",
50
+ "train_tokens": 777983,
51
+ "val_tokens": 86442,
52
+ "effective_initial_tokens": 20000,
53
+ "effective_stream_token_caps": [
54
+ 20000,
55
+ 40000,
56
+ 80000
57
+ ]
58
+ }
archive/runs_legacy_20260525/smoke/20260522-132336/metrics.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 1, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.238768100738525, "eval_loss": 6.236357688903809, "elapsed_sec": 0.15027999877929688, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
2
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 1, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.240777015686035, "eval_loss": 6.2333245277404785, "elapsed_sec": 0.13346290588378906, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
3
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 1, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.2400970458984375, "eval_loss": 6.237194538116455, "elapsed_sec": 0.04652595520019531, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
4
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.240694999694824, "eval_loss": 6.236194610595703, "elapsed_sec": 0.04595613479614258, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
5
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.233834266662598, "eval_loss": 6.2317962646484375, "elapsed_sec": 0.03716301918029785, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
6
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 1, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231166839599609, "eval_loss": 6.221035957336426, "elapsed_sec": 0.032035112380981445, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
7
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 2, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.237826824188232, "eval_loss": 6.234821796417236, "elapsed_sec": 0.0224301815032959, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
8
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 2, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.238691329956055, "eval_loss": 6.229793548583984, "elapsed_sec": 0.042351722717285156, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
9
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 2, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.236819267272949, "eval_loss": 6.234230995178223, "elapsed_sec": 0.04147815704345703, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
10
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.23726749420166, "eval_loss": 6.232943534851074, "elapsed_sec": 0.04183006286621094, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
11
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.2372612953186035, "eval_loss": 6.226807594299316, "elapsed_sec": 0.032605886459350586, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
12
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 2, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.228798866271973, "eval_loss": 6.214606285095215, "elapsed_sec": 0.03294706344604492, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
13
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 3, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.237939834594727, "eval_loss": 6.235240459442139, "elapsed_sec": 0.02249908447265625, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
14
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 3, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.237975120544434, "eval_loss": 6.229854583740234, "elapsed_sec": 0.04176592826843262, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
15
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 3, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.238975524902344, "eval_loss": 6.2339982986450195, "elapsed_sec": 0.04214000701904297, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
16
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.238128662109375, "eval_loss": 6.2329301834106445, "elapsed_sec": 0.04157400131225586, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
17
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.236502170562744, "eval_loss": 6.225698471069336, "elapsed_sec": 0.03307604789733887, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
18
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 3, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231315612792969, "eval_loss": 6.213197708129883, "elapsed_sec": 0.033010244369506836, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
19
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 4, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.239793300628662, "eval_loss": 6.237911224365234, "elapsed_sec": 0.02308821678161621, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
20
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 4, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.239620208740234, "eval_loss": 6.232203960418701, "elapsed_sec": 0.043370962142944336, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
21
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 4, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.239019393920898, "eval_loss": 6.237016677856445, "elapsed_sec": 0.04206585884094238, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
22
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.239429950714111, "eval_loss": 6.235345363616943, "elapsed_sec": 0.04175519943237305, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
23
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.23697566986084, "eval_loss": 6.229433536529541, "elapsed_sec": 0.03307700157165527, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
24
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 4, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.231019020080566, "eval_loss": 6.218218803405762, "elapsed_sec": 0.03255319595336914, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
25
+ {"phase": "phase1_baseline_comparison", "condition": "small_static", "seed": 5, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.239540100097656, "eval_loss": 6.2350263595581055, "elapsed_sec": 0.02461099624633789, "parameters": 917504, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 4, "n_head": 4, "n_embd": 128, "dropout": 0.1}}
26
+ {"phase": "phase1_baseline_comparison", "condition": "large_static", "seed": 5, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.1, "train_loss_last": 6.23555850982666, "eval_loss": 6.231681823730469, "elapsed_sec": 0.04686999320983887, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.1}}
27
+ {"phase": "phase2_fixed_data_dropout_optimization", "condition": "large_dropout_decay_fixed_data", "seed": 5, "stage": null, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.8, "train_loss_last": 6.239316463470459, "eval_loss": 6.237186431884766, "elapsed_sec": 0.04636812210083008, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
28
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 0, "token_limit": 20000, "steps": 1, "tokens_seen": 128, "dropout": 0.6974873734152917, "train_loss_last": 6.239423751831055, "eval_loss": 6.236317157745361, "elapsed_sec": 0.045987844467163086, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
29
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 1, "token_limit": 40000, "steps": 1, "tokens_seen": 256, "dropout": 0.45000000000000007, "train_loss_last": 6.234818458557129, "eval_loss": 6.230718612670898, "elapsed_sec": 0.034147024154663086, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
30
+ {"phase": "phase3_simulated_streaming_scale_up", "condition": "large_dropout_decay_streaming", "seed": 5, "stage": 2, "token_limit": 80000, "steps": 1, "tokens_seen": 384, "dropout": 0.1, "train_loss_last": 6.232948303222656, "eval_loss": 6.2211151123046875, "elapsed_sec": 0.03441500663757324, "parameters": 6553600, "model_config": {"block_size": 32, "vocab_size": 512, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.8}}
archive/runs_legacy_20260525/smoke/20260522-132336/summary.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase1_baseline_comparison,large_static,,20000,6553600,5,6.231371688842773,0.0015325284625100665
3
+ phase1_baseline_comparison,small_static,,20000,917504,5,6.235871505737305,0.0012856134041782643
4
+ phase2_fixed_data_dropout_optimization,large_dropout_decay_fixed_data,,20000,6553600,5,6.235925388336182,0.0016565421564777797
5
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,0,20000,6553600,5,6.2347461700439455,0.0016935404053936482
6
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,1,40000,6553600,5,6.228890895843506,0.0025791421476287733
7
+ phase3_simulated_streaming_scale_up,large_dropout_decay_streaming,2,80000,6553600,5,6.217634773254394,0.0036359727629622267
archive/runs_legacy_20260525/smoke/20260522-132336/summary.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase1_baseline_comparison",
4
+ "condition": "large_static",
5
+ "stage": null,
6
+ "token_limit": 20000,
7
+ "parameters": 6553600,
8
+ "n": 5,
9
+ "mean_eval_loss": 6.231371688842773,
10
+ "std_eval_loss": 0.0015325284625100665
11
+ },
12
+ {
13
+ "phase": "phase1_baseline_comparison",
14
+ "condition": "small_static",
15
+ "stage": null,
16
+ "token_limit": 20000,
17
+ "parameters": 917504,
18
+ "n": 5,
19
+ "mean_eval_loss": 6.235871505737305,
20
+ "std_eval_loss": 0.0012856134041782643
21
+ },
22
+ {
23
+ "phase": "phase2_fixed_data_dropout_optimization",
24
+ "condition": "large_dropout_decay_fixed_data",
25
+ "stage": null,
26
+ "token_limit": 20000,
27
+ "parameters": 6553600,
28
+ "n": 5,
29
+ "mean_eval_loss": 6.235925388336182,
30
+ "std_eval_loss": 0.0016565421564777797
31
+ },
32
+ {
33
+ "phase": "phase3_simulated_streaming_scale_up",
34
+ "condition": "large_dropout_decay_streaming",
35
+ "stage": 0,
36
+ "token_limit": 20000,
37
+ "parameters": 6553600,
38
+ "n": 5,
39
+ "mean_eval_loss": 6.2347461700439455,
40
+ "std_eval_loss": 0.0016935404053936482
41
+ },
42
+ {
43
+ "phase": "phase3_simulated_streaming_scale_up",
44
+ "condition": "large_dropout_decay_streaming",
45
+ "stage": 1,
46
+ "token_limit": 40000,
47
+ "parameters": 6553600,
48
+ "n": 5,
49
+ "mean_eval_loss": 6.228890895843506,
50
+ "std_eval_loss": 0.0025791421476287733
51
+ },
52
+ {
53
+ "phase": "phase3_simulated_streaming_scale_up",
54
+ "condition": "large_dropout_decay_streaming",
55
+ "stage": 2,
56
+ "token_limit": 80000,
57
+ "parameters": 6553600,
58
+ "n": 5,
59
+ "mean_eval_loss": 6.217634773254394,
60
+ "std_eval_loss": 0.0036359727629622267
61
+ }
62
+ ]
archive/runs_legacy_20260525/smoke/20260522-132336/tokenizer.json ADDED
@@ -0,0 +1,1629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|bos|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<|user_start|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|user_end|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<|assistant_start|>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<|assistant_end|>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<|python_start|>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<|python_end|>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<|output_start|>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<|output_end|>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ }
87
+ ],
88
+ "normalizer": null,
89
+ "pre_tokenizer": {
90
+ "type": "Sequence",
91
+ "pretokenizers": [
92
+ {
93
+ "type": "Split",
94
+ "pattern": {
95
+ "Regex": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,2}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
96
+ },
97
+ "behavior": "Isolated",
98
+ "invert": false
99
+ },
100
+ {
101
+ "type": "ByteLevel",
102
+ "add_prefix_space": false,
103
+ "trim_offsets": true,
104
+ "use_regex": false
105
+ }
106
+ ]
107
+ },
108
+ "post_processor": null,
109
+ "decoder": {
110
+ "type": "ByteLevel",
111
+ "add_prefix_space": true,
112
+ "trim_offsets": true,
113
+ "use_regex": true
114
+ },
115
+ "model": {
116
+ "type": "BPE",
117
+ "dropout": null,
118
+ "unk_token": null,
119
+ "continuing_subword_prefix": null,
120
+ "end_of_word_suffix": null,
121
+ "fuse_unk": false,
122
+ "byte_fallback": true,
123
+ "ignore_merges": false,
124
+ "vocab": {
125
+ "<|bos|>": 0,
126
+ "<|user_start|>": 1,
127
+ "<|user_end|>": 2,
128
+ "<|assistant_start|>": 3,
129
+ "<|assistant_end|>": 4,
130
+ "<|python_start|>": 5,
131
+ "<|python_end|>": 6,
132
+ "<|output_start|>": 7,
133
+ "<|output_end|>": 8,
134
+ "!": 9,
135
+ "\"": 10,
136
+ "#": 11,
137
+ "$": 12,
138
+ "%": 13,
139
+ "&": 14,
140
+ "'": 15,
141
+ "(": 16,
142
+ ")": 17,
143
+ "*": 18,
144
+ "+": 19,
145
+ ",": 20,
146
+ "-": 21,
147
+ ".": 22,
148
+ "/": 23,
149
+ "0": 24,
150
+ "1": 25,
151
+ "2": 26,
152
+ "3": 27,
153
+ "4": 28,
154
+ "5": 29,
155
+ "6": 30,
156
+ "7": 31,
157
+ "8": 32,
158
+ "9": 33,
159
+ ":": 34,
160
+ ";": 35,
161
+ "<": 36,
162
+ "=": 37,
163
+ ">": 38,
164
+ "?": 39,
165
+ "@": 40,
166
+ "A": 41,
167
+ "B": 42,
168
+ "C": 43,
169
+ "D": 44,
170
+ "E": 45,
171
+ "F": 46,
172
+ "G": 47,
173
+ "H": 48,
174
+ "I": 49,
175
+ "J": 50,
176
+ "K": 51,
177
+ "L": 52,
178
+ "M": 53,
179
+ "N": 54,
180
+ "O": 55,
181
+ "P": 56,
182
+ "Q": 57,
183
+ "R": 58,
184
+ "S": 59,
185
+ "T": 60,
186
+ "U": 61,
187
+ "V": 62,
188
+ "W": 63,
189
+ "X": 64,
190
+ "Y": 65,
191
+ "Z": 66,
192
+ "[": 67,
193
+ "\\": 68,
194
+ "]": 69,
195
+ "^": 70,
196
+ "_": 71,
197
+ "`": 72,
198
+ "a": 73,
199
+ "b": 74,
200
+ "c": 75,
201
+ "d": 76,
202
+ "e": 77,
203
+ "f": 78,
204
+ "g": 79,
205
+ "h": 80,
206
+ "i": 81,
207
+ "j": 82,
208
+ "k": 83,
209
+ "l": 84,
210
+ "m": 85,
211
+ "n": 86,
212
+ "o": 87,
213
+ "p": 88,
214
+ "q": 89,
215
+ "r": 90,
216
+ "s": 91,
217
+ "t": 92,
218
+ "u": 93,
219
+ "v": 94,
220
+ "w": 95,
221
+ "x": 96,
222
+ "y": 97,
223
+ "z": 98,
224
+ "{": 99,
225
+ "|": 100,
226
+ "}": 101,
227
+ "~": 102,
228
+ "¡": 103,
229
+ "¢": 104,
230
+ "£": 105,
231
+ "¤": 106,
232
+ "¥": 107,
233
+ "¦": 108,
234
+ "§": 109,
235
+ "¨": 110,
236
+ "©": 111,
237
+ "ª": 112,
238
+ "«": 113,
239
+ "¬": 114,
240
+ "®": 115,
241
+ "¯": 116,
242
+ "°": 117,
243
+ "±": 118,
244
+ "²": 119,
245
+ "³": 120,
246
+ "´": 121,
247
+ "µ": 122,
248
+ "¶": 123,
249
+ "·": 124,
250
+ "¸": 125,
251
+ "¹": 126,
252
+ "º": 127,
253
+ "»": 128,
254
+ "¼": 129,
255
+ "½": 130,
256
+ "¾": 131,
257
+ "¿": 132,
258
+ "À": 133,
259
+ "Á": 134,
260
+ "Â": 135,
261
+ "Ã": 136,
262
+ "Ä": 137,
263
+ "Å": 138,
264
+ "Æ": 139,
265
+ "Ç": 140,
266
+ "È": 141,
267
+ "É": 142,
268
+ "Ê": 143,
269
+ "Ë": 144,
270
+ "Ì": 145,
271
+ "Í": 146,
272
+ "Î": 147,
273
+ "Ï": 148,
274
+ "Ð": 149,
275
+ "Ñ": 150,
276
+ "Ò": 151,
277
+ "Ó": 152,
278
+ "Ô": 153,
279
+ "Õ": 154,
280
+ "Ö": 155,
281
+ "×": 156,
282
+ "Ø": 157,
283
+ "Ù": 158,
284
+ "Ú": 159,
285
+ "Û": 160,
286
+ "Ü": 161,
287
+ "Ý": 162,
288
+ "Þ": 163,
289
+ "ß": 164,
290
+ "à": 165,
291
+ "á": 166,
292
+ "â": 167,
293
+ "ã": 168,
294
+ "ä": 169,
295
+ "å": 170,
296
+ "æ": 171,
297
+ "ç": 172,
298
+ "è": 173,
299
+ "é": 174,
300
+ "ê": 175,
301
+ "ë": 176,
302
+ "ì": 177,
303
+ "í": 178,
304
+ "î": 179,
305
+ "ï": 180,
306
+ "ð": 181,
307
+ "ñ": 182,
308
+ "ò": 183,
309
+ "ó": 184,
310
+ "ô": 185,
311
+ "õ": 186,
312
+ "ö": 187,
313
+ "÷": 188,
314
+ "ø": 189,
315
+ "ù": 190,
316
+ "ú": 191,
317
+ "û": 192,
318
+ "ü": 193,
319
+ "ý": 194,
320
+ "þ": 195,
321
+ "ÿ": 196,
322
+ "Ā": 197,
323
+ "ā": 198,
324
+ "Ă": 199,
325
+ "ă": 200,
326
+ "Ą": 201,
327
+ "ą": 202,
328
+ "Ć": 203,
329
+ "ć": 204,
330
+ "Ĉ": 205,
331
+ "ĉ": 206,
332
+ "Ċ": 207,
333
+ "ċ": 208,
334
+ "Č": 209,
335
+ "č": 210,
336
+ "Ď": 211,
337
+ "ď": 212,
338
+ "Đ": 213,
339
+ "đ": 214,
340
+ "Ē": 215,
341
+ "ē": 216,
342
+ "Ĕ": 217,
343
+ "ĕ": 218,
344
+ "Ė": 219,
345
+ "ė": 220,
346
+ "Ę": 221,
347
+ "ę": 222,
348
+ "Ě": 223,
349
+ "ě": 224,
350
+ "Ĝ": 225,
351
+ "ĝ": 226,
352
+ "Ğ": 227,
353
+ "ğ": 228,
354
+ "Ġ": 229,
355
+ "ġ": 230,
356
+ "Ģ": 231,
357
+ "ģ": 232,
358
+ "Ĥ": 233,
359
+ "ĥ": 234,
360
+ "Ħ": 235,
361
+ "ħ": 236,
362
+ "Ĩ": 237,
363
+ "ĩ": 238,
364
+ "Ī": 239,
365
+ "ī": 240,
366
+ "Ĭ": 241,
367
+ "ĭ": 242,
368
+ "Į": 243,
369
+ "į": 244,
370
+ "İ": 245,
371
+ "ı": 246,
372
+ "IJ": 247,
373
+ "ij": 248,
374
+ "Ĵ": 249,
375
+ "ĵ": 250,
376
+ "Ķ": 251,
377
+ "ķ": 252,
378
+ "ĸ": 253,
379
+ "Ĺ": 254,
380
+ "ĺ": 255,
381
+ "Ļ": 256,
382
+ "ļ": 257,
383
+ "Ľ": 258,
384
+ "ľ": 259,
385
+ "Ŀ": 260,
386
+ "ŀ": 261,
387
+ "Ł": 262,
388
+ "ł": 263,
389
+ "Ń": 264,
390
+ "he": 265,
391
+ "Ġt": 266,
392
+ "Ġa": 267,
393
+ "Ġs": 268,
394
+ "Ġw": 269,
395
+ "Ġthe": 270,
396
+ "nd": 271,
397
+ "ed": 272,
398
+ "in": 273,
399
+ "Ġand": 274,
400
+ "Ġwa": 275,
401
+ "Ġb": 276,
402
+ "Ġto": 277,
403
+ "re": 278,
404
+ "Ġh": 279,
405
+ "ou": 280,
406
+ "it": 281,
407
+ "Ġf": 282,
408
+ "er": 283,
409
+ "ĊĊ": 284,
410
+ "Ġwas": 285,
411
+ "Ġl": 286,
412
+ "Ġc": 287,
413
+ "Ġhe": 288,
414
+ "Ġp": 289,
415
+ "ing": 290,
416
+ "Ġd": 291,
417
+ "Ġm": 292,
418
+ "Ġo": 293,
419
+ "Ġg": 294,
420
+ "ar": 295,
421
+ "is": 296,
422
+ "id": 297,
423
+ "ay": 298,
424
+ "om": 299,
425
+ "at": 300,
426
+ "ll": 301,
427
+ "en": 302,
428
+ "Ġsa": 303,
429
+ "ne": 304,
430
+ "The": 305,
431
+ ".ĊĊ": 306,
432
+ "le": 307,
433
+ "Ġth": 308,
434
+ "im": 309,
435
+ "an": 310,
436
+ "Ġha": 311,
437
+ "or": 312,
438
+ "Ġit": 313,
439
+ "et": 314,
440
+ "ver": 315,
441
+ "ld": 316,
442
+ "Ġin": 317,
443
+ "ĠS": 318,
444
+ "on": 319,
445
+ "Ġe": 320,
446
+ "ce": 321,
447
+ "Ġbe": 322,
448
+ "Ġher": 323,
449
+ "ir": 324,
450
+ "Ġ\"": 325,
451
+ "ĠH": 326,
452
+ "Ġu": 327,
453
+ "Ġsaid": 328,
454
+ "Ġn": 329,
455
+ "ck": 330,
456
+ "ow": 331,
457
+ "ri": 332,
458
+ "ĠThe": 333,
459
+ "Ġshe": 334,
460
+ "Ġso": 335,
461
+ "st": 336,
462
+ "Ġy": 337,
463
+ "ot": 338,
464
+ "ĠHe": 339,
465
+ "Ġof": 340,
466
+ "il": 341,
467
+ "Ġst": 342,
468
+ "ut": 343,
469
+ "ke": 344,
470
+ "am": 345,
471
+ "ked": 346,
472
+ "oo": 347,
473
+ "pp": 348,
474
+ "Ġr": 349,
475
+ "ĠShe": 350,
476
+ "very": 351,
477
+ "ĠI": 352,
478
+ "ve": 353,
479
+ "Ġthat": 354,
480
+ "ig": 355,
481
+ "ith": 356,
482
+ "Ġhis": 357,
483
+ "Ġup": 358,
484
+ "ĠĊĊ": 359,
485
+ "Ġday": 360,
486
+ "Ġwith": 361,
487
+ "Ġpl": 362,
488
+ "Ġyou": 363,
489
+ "itt": 364,
490
+ "ould": 365,
491
+ "el": 366,
492
+ "ted": 367,
493
+ "ent": 368,
494
+ "ad": 369,
495
+ "Ġhad": 370,
496
+ "ound": 371,
497
+ "al": 372,
498
+ "ĠJ": 373,
499
+ "Ġwe": 374,
500
+ "her": 375,
501
+ "ittle": 376,
502
+ "'s": 377,
503
+ "Ġsm": 378,
504
+ "Ġplay": 379,
505
+ "end": 380,
506
+ "Ġthey": 381,
507
+ "ack": 382,
508
+ "Ġthere": 383,
509
+ "ime": 384,
510
+ "ly": 385,
511
+ "Ġsh": 386,
512
+ "Ġlittle": 387,
513
+ "Ġre": 388,
514
+ "Ġne": 389,
515
+ "Ġtime": 390,
516
+ "out": 391,
517
+ "Ġfor": 392,
518
+ "un": 393,
519
+ "ch": 394,
520
+ "se": 395,
521
+ "Ġhapp": 396,
522
+ "Ġwh": 397,
523
+ "my": 398,
524
+ "ome": 399,
525
+ "ht": 400,
526
+ "um": 401,
527
+ "Ġfri": 402,
528
+ "Ġas": 403,
529
+ "Ġfriend": 404,
530
+ "Ġvery": 405,
531
+ "all": 406,
532
+ "ter": 407,
533
+ "â": 408,
534
+ "âĤ": 409,
535
+ "âĤ¬": 410,
536
+ "On": 411,
537
+ "Ġk": 412,
538
+ "ved": 413,
539
+ "ĠT": 414,
540
+ "Ġon": 415,
541
+ "irl": 416,
542
+ "Once": 417,
543
+ "ug": 418,
544
+ "\"ĊĊ": 419,
545
+ "ill": 420,
546
+ "Ġgirl": 421,
547
+ "Ġan": 422,
548
+ "es": 423,
549
+ "Ġex": 424,
550
+ "'t": 425,
551
+ "ec": 426,
552
+ "Ġbut": 427,
553
+ "Ġloo": 428,
554
+ "Ġli": 429,
555
+ "Ġbo": 430,
556
+ "Ġwere": 431,
557
+ "One": 432,
558
+ "Ġwan": 433,
559
+ "Ġhappy": 434,
560
+ "ake": 435,
561
+ "ore": 436,
562
+ "Ġbig": 437,
563
+ "fu": 438,
564
+ "Ġsp": 439,
565
+ "ide": 440,
566
+ "Ġsaw": 441,
567
+ "ĠB": 442,
568
+ "hing": 443,
569
+ "Ġupon": 444,
570
+ "ard": 445,
571
+ "Ġcould": 446,
572
+ "ic": 447,
573
+ "Ġout": 448,
574
+ "iled": 449,
575
+ "one": 450,
576
+ "round": 451,
577
+ "ra": 452,
578
+ "ry": 453,
579
+ "Ġsmiled": 454,
580
+ "Ġhim": 455,
581
+ "ĠA": 456,
582
+ "Ġmom": 457,
583
+ "hen": 458,
584
+ "way": 459,
585
+ "ur": 460,
586
+ "ĠIt": 461,
587
+ "ful": 462,
588
+ "ain": 463,
589
+ "Ġwent": 464,
590
+ "Ġhel": 465,
591
+ "Ġnot": 466,
592
+ "ĠThey": 467,
593
+ "Ġwanted": 468,
594
+ "ind": 469,
595
+ "are": 470,
596
+ "ear": 471,
597
+ "ĠM": 472,
598
+ "Ġall": 473,
599
+ "Ġfriends": 474,
600
+ "Ġtoo": 475,
601
+ "Ġgo": 476,
602
+ "ily": 477,
603
+ "ame": 478,
604
+ "ĠTim": 479,
605
+ "Ġhelp": 480,
606
+ "omet": 481,
607
+ "ĠL": 482,
608
+ "Ġlo": 483,
609
+ "ght": 484,
610
+ "Ġsomet": 485,
611
+ "Ġat": 486,
612
+ "Ġdo": 487,
613
+ "Ġasked": 488,
614
+ "!\"": 489,
615
+ "Ġaround": 490,
616
+ "Ġj": 491,
617
+ "ree": 492,
618
+ "Ġlooked": 493,
619
+ "Ġsomething": 494,
620
+ "Ġse": 495,
621
+ "Ġwor": 496,
622
+ "dd": 497,
623
+ "hed": 498,
624
+ "ood": 499,
625
+ "Ġcl": 500,
626
+ "amed": 501,
627
+ "ro": 502,
628
+ "Ġcan": 503,
629
+ "ark": 504,
630
+ "king": 505,
631
+ "ĠE": 506,
632
+ "rom": 507,
633
+ "Ġback": 508,
634
+ "Ġexc": 509,
635
+ "ab": 510,
636
+ "ick": 511
637
+ },
638
+ "merges": [
639
+ [
640
+ "h",
641
+ "e"
642
+ ],
643
+ [
644
+ "Ġ",
645
+ "t"
646
+ ],
647
+ [
648
+ "Ġ",
649
+ "a"
650
+ ],
651
+ [
652
+ "Ġ",
653
+ "s"
654
+ ],
655
+ [
656
+ "Ġ",
657
+ "w"
658
+ ],
659
+ [
660
+ "Ġt",
661
+ "he"
662
+ ],
663
+ [
664
+ "n",
665
+ "d"
666
+ ],
667
+ [
668
+ "e",
669
+ "d"
670
+ ],
671
+ [
672
+ "i",
673
+ "n"
674
+ ],
675
+ [
676
+ "Ġa",
677
+ "nd"
678
+ ],
679
+ [
680
+ "Ġw",
681
+ "a"
682
+ ],
683
+ [
684
+ "Ġ",
685
+ "b"
686
+ ],
687
+ [
688
+ "Ġt",
689
+ "o"
690
+ ],
691
+ [
692
+ "r",
693
+ "e"
694
+ ],
695
+ [
696
+ "Ġ",
697
+ "h"
698
+ ],
699
+ [
700
+ "o",
701
+ "u"
702
+ ],
703
+ [
704
+ "i",
705
+ "t"
706
+ ],
707
+ [
708
+ "Ġ",
709
+ "f"
710
+ ],
711
+ [
712
+ "e",
713
+ "r"
714
+ ],
715
+ [
716
+ "Ċ",
717
+ "Ċ"
718
+ ],
719
+ [
720
+ "Ġwa",
721
+ "s"
722
+ ],
723
+ [
724
+ "Ġ",
725
+ "l"
726
+ ],
727
+ [
728
+ "Ġ",
729
+ "c"
730
+ ],
731
+ [
732
+ "Ġ",
733
+ "he"
734
+ ],
735
+ [
736
+ "Ġ",
737
+ "p"
738
+ ],
739
+ [
740
+ "in",
741
+ "g"
742
+ ],
743
+ [
744
+ "Ġ",
745
+ "d"
746
+ ],
747
+ [
748
+ "Ġ",
749
+ "m"
750
+ ],
751
+ [
752
+ "Ġ",
753
+ "o"
754
+ ],
755
+ [
756
+ "Ġ",
757
+ "g"
758
+ ],
759
+ [
760
+ "a",
761
+ "r"
762
+ ],
763
+ [
764
+ "i",
765
+ "s"
766
+ ],
767
+ [
768
+ "i",
769
+ "d"
770
+ ],
771
+ [
772
+ "a",
773
+ "y"
774
+ ],
775
+ [
776
+ "o",
777
+ "m"
778
+ ],
779
+ [
780
+ "a",
781
+ "t"
782
+ ],
783
+ [
784
+ "l",
785
+ "l"
786
+ ],
787
+ [
788
+ "e",
789
+ "n"
790
+ ],
791
+ [
792
+ "Ġs",
793
+ "a"
794
+ ],
795
+ [
796
+ "n",
797
+ "e"
798
+ ],
799
+ [
800
+ "T",
801
+ "he"
802
+ ],
803
+ [
804
+ ".",
805
+ "ĊĊ"
806
+ ],
807
+ [
808
+ "l",
809
+ "e"
810
+ ],
811
+ [
812
+ "Ġt",
813
+ "h"
814
+ ],
815
+ [
816
+ "i",
817
+ "m"
818
+ ],
819
+ [
820
+ "a",
821
+ "n"
822
+ ],
823
+ [
824
+ "Ġh",
825
+ "a"
826
+ ],
827
+ [
828
+ "o",
829
+ "r"
830
+ ],
831
+ [
832
+ "Ġ",
833
+ "it"
834
+ ],
835
+ [
836
+ "e",
837
+ "t"
838
+ ],
839
+ [
840
+ "v",
841
+ "er"
842
+ ],
843
+ [
844
+ "l",
845
+ "d"
846
+ ],
847
+ [
848
+ "Ġ",
849
+ "in"
850
+ ],
851
+ [
852
+ "Ġ",
853
+ "S"
854
+ ],
855
+ [
856
+ "o",
857
+ "n"
858
+ ],
859
+ [
860
+ "Ġ",
861
+ "e"
862
+ ],
863
+ [
864
+ "c",
865
+ "e"
866
+ ],
867
+ [
868
+ "Ġb",
869
+ "e"
870
+ ],
871
+ [
872
+ "Ġhe",
873
+ "r"
874
+ ],
875
+ [
876
+ "i",
877
+ "r"
878
+ ],
879
+ [
880
+ "Ġ",
881
+ "\""
882
+ ],
883
+ [
884
+ "Ġ",
885
+ "H"
886
+ ],
887
+ [
888
+ "Ġ",
889
+ "u"
890
+ ],
891
+ [
892
+ "Ġsa",
893
+ "id"
894
+ ],
895
+ [
896
+ "Ġ",
897
+ "n"
898
+ ],
899
+ [
900
+ "c",
901
+ "k"
902
+ ],
903
+ [
904
+ "o",
905
+ "w"
906
+ ],
907
+ [
908
+ "r",
909
+ "i"
910
+ ],
911
+ [
912
+ "Ġ",
913
+ "The"
914
+ ],
915
+ [
916
+ "Ġs",
917
+ "he"
918
+ ],
919
+ [
920
+ "Ġs",
921
+ "o"
922
+ ],
923
+ [
924
+ "s",
925
+ "t"
926
+ ],
927
+ [
928
+ "Ġ",
929
+ "y"
930
+ ],
931
+ [
932
+ "o",
933
+ "t"
934
+ ],
935
+ [
936
+ "ĠH",
937
+ "e"
938
+ ],
939
+ [
940
+ "Ġo",
941
+ "f"
942
+ ],
943
+ [
944
+ "i",
945
+ "l"
946
+ ],
947
+ [
948
+ "Ġs",
949
+ "t"
950
+ ],
951
+ [
952
+ "u",
953
+ "t"
954
+ ],
955
+ [
956
+ "k",
957
+ "e"
958
+ ],
959
+ [
960
+ "a",
961
+ "m"
962
+ ],
963
+ [
964
+ "k",
965
+ "ed"
966
+ ],
967
+ [
968
+ "o",
969
+ "o"
970
+ ],
971
+ [
972
+ "p",
973
+ "p"
974
+ ],
975
+ [
976
+ "Ġ",
977
+ "r"
978
+ ],
979
+ [
980
+ "ĠS",
981
+ "he"
982
+ ],
983
+ [
984
+ "ver",
985
+ "y"
986
+ ],
987
+ [
988
+ "Ġ",
989
+ "I"
990
+ ],
991
+ [
992
+ "v",
993
+ "e"
994
+ ],
995
+ [
996
+ "Ġth",
997
+ "at"
998
+ ],
999
+ [
1000
+ "i",
1001
+ "g"
1002
+ ],
1003
+ [
1004
+ "it",
1005
+ "h"
1006
+ ],
1007
+ [
1008
+ "Ġh",
1009
+ "is"
1010
+ ],
1011
+ [
1012
+ "Ġu",
1013
+ "p"
1014
+ ],
1015
+ [
1016
+ "Ġ",
1017
+ "ĊĊ"
1018
+ ],
1019
+ [
1020
+ "Ġd",
1021
+ "ay"
1022
+ ],
1023
+ [
1024
+ "Ġw",
1025
+ "ith"
1026
+ ],
1027
+ [
1028
+ "Ġp",
1029
+ "l"
1030
+ ],
1031
+ [
1032
+ "Ġy",
1033
+ "ou"
1034
+ ],
1035
+ [
1036
+ "it",
1037
+ "t"
1038
+ ],
1039
+ [
1040
+ "ou",
1041
+ "ld"
1042
+ ],
1043
+ [
1044
+ "e",
1045
+ "l"
1046
+ ],
1047
+ [
1048
+ "t",
1049
+ "ed"
1050
+ ],
1051
+ [
1052
+ "en",
1053
+ "t"
1054
+ ],
1055
+ [
1056
+ "a",
1057
+ "d"
1058
+ ],
1059
+ [
1060
+ "Ġha",
1061
+ "d"
1062
+ ],
1063
+ [
1064
+ "ou",
1065
+ "nd"
1066
+ ],
1067
+ [
1068
+ "a",
1069
+ "l"
1070
+ ],
1071
+ [
1072
+ "Ġ",
1073
+ "J"
1074
+ ],
1075
+ [
1076
+ "Ġw",
1077
+ "e"
1078
+ ],
1079
+ [
1080
+ "he",
1081
+ "r"
1082
+ ],
1083
+ [
1084
+ "itt",
1085
+ "le"
1086
+ ],
1087
+ [
1088
+ "'",
1089
+ "s"
1090
+ ],
1091
+ [
1092
+ "Ġs",
1093
+ "m"
1094
+ ],
1095
+ [
1096
+ "Ġpl",
1097
+ "ay"
1098
+ ],
1099
+ [
1100
+ "e",
1101
+ "nd"
1102
+ ],
1103
+ [
1104
+ "Ġthe",
1105
+ "y"
1106
+ ],
1107
+ [
1108
+ "a",
1109
+ "ck"
1110
+ ],
1111
+ [
1112
+ "Ġthe",
1113
+ "re"
1114
+ ],
1115
+ [
1116
+ "im",
1117
+ "e"
1118
+ ],
1119
+ [
1120
+ "l",
1121
+ "y"
1122
+ ],
1123
+ [
1124
+ "Ġs",
1125
+ "h"
1126
+ ],
1127
+ [
1128
+ "Ġl",
1129
+ "ittle"
1130
+ ],
1131
+ [
1132
+ "Ġ",
1133
+ "re"
1134
+ ],
1135
+ [
1136
+ "Ġ",
1137
+ "ne"
1138
+ ],
1139
+ [
1140
+ "Ġt",
1141
+ "ime"
1142
+ ],
1143
+ [
1144
+ "ou",
1145
+ "t"
1146
+ ],
1147
+ [
1148
+ "Ġf",
1149
+ "or"
1150
+ ],
1151
+ [
1152
+ "u",
1153
+ "n"
1154
+ ],
1155
+ [
1156
+ "c",
1157
+ "h"
1158
+ ],
1159
+ [
1160
+ "s",
1161
+ "e"
1162
+ ],
1163
+ [
1164
+ "Ġha",
1165
+ "pp"
1166
+ ],
1167
+ [
1168
+ "Ġw",
1169
+ "h"
1170
+ ],
1171
+ [
1172
+ "m",
1173
+ "y"
1174
+ ],
1175
+ [
1176
+ "om",
1177
+ "e"
1178
+ ],
1179
+ [
1180
+ "h",
1181
+ "t"
1182
+ ],
1183
+ [
1184
+ "u",
1185
+ "m"
1186
+ ],
1187
+ [
1188
+ "Ġf",
1189
+ "ri"
1190
+ ],
1191
+ [
1192
+ "Ġa",
1193
+ "s"
1194
+ ],
1195
+ [
1196
+ "Ġfri",
1197
+ "end"
1198
+ ],
1199
+ [
1200
+ "Ġ",
1201
+ "very"
1202
+ ],
1203
+ [
1204
+ "a",
1205
+ "ll"
1206
+ ],
1207
+ [
1208
+ "t",
1209
+ "er"
1210
+ ],
1211
+ [
1212
+ "Ã",
1213
+ "¢"
1214
+ ],
1215
+ [
1216
+ "â",
1217
+ "Ĥ"
1218
+ ],
1219
+ [
1220
+ "âĤ",
1221
+ "¬"
1222
+ ],
1223
+ [
1224
+ "O",
1225
+ "n"
1226
+ ],
1227
+ [
1228
+ "Ġ",
1229
+ "k"
1230
+ ],
1231
+ [
1232
+ "v",
1233
+ "ed"
1234
+ ],
1235
+ [
1236
+ "Ġ",
1237
+ "T"
1238
+ ],
1239
+ [
1240
+ "Ġo",
1241
+ "n"
1242
+ ],
1243
+ [
1244
+ "ir",
1245
+ "l"
1246
+ ],
1247
+ [
1248
+ "On",
1249
+ "ce"
1250
+ ],
1251
+ [
1252
+ "u",
1253
+ "g"
1254
+ ],
1255
+ [
1256
+ "\"",
1257
+ "ĊĊ"
1258
+ ],
1259
+ [
1260
+ "i",
1261
+ "ll"
1262
+ ],
1263
+ [
1264
+ "Ġg",
1265
+ "irl"
1266
+ ],
1267
+ [
1268
+ "Ġa",
1269
+ "n"
1270
+ ],
1271
+ [
1272
+ "e",
1273
+ "s"
1274
+ ],
1275
+ [
1276
+ "Ġe",
1277
+ "x"
1278
+ ],
1279
+ [
1280
+ "'",
1281
+ "t"
1282
+ ],
1283
+ [
1284
+ "e",
1285
+ "c"
1286
+ ],
1287
+ [
1288
+ "Ġb",
1289
+ "ut"
1290
+ ],
1291
+ [
1292
+ "Ġl",
1293
+ "oo"
1294
+ ],
1295
+ [
1296
+ "Ġl",
1297
+ "i"
1298
+ ],
1299
+ [
1300
+ "Ġb",
1301
+ "o"
1302
+ ],
1303
+ [
1304
+ "Ġwe",
1305
+ "re"
1306
+ ],
1307
+ [
1308
+ "O",
1309
+ "ne"
1310
+ ],
1311
+ [
1312
+ "Ġwa",
1313
+ "n"
1314
+ ],
1315
+ [
1316
+ "Ġhapp",
1317
+ "y"
1318
+ ],
1319
+ [
1320
+ "a",
1321
+ "ke"
1322
+ ],
1323
+ [
1324
+ "o",
1325
+ "re"
1326
+ ],
1327
+ [
1328
+ "Ġb",
1329
+ "ig"
1330
+ ],
1331
+ [
1332
+ "f",
1333
+ "u"
1334
+ ],
1335
+ [
1336
+ "Ġs",
1337
+ "p"
1338
+ ],
1339
+ [
1340
+ "id",
1341
+ "e"
1342
+ ],
1343
+ [
1344
+ "Ġsa",
1345
+ "w"
1346
+ ],
1347
+ [
1348
+ "Ġ",
1349
+ "B"
1350
+ ],
1351
+ [
1352
+ "h",
1353
+ "ing"
1354
+ ],
1355
+ [
1356
+ "Ġup",
1357
+ "on"
1358
+ ],
1359
+ [
1360
+ "ar",
1361
+ "d"
1362
+ ],
1363
+ [
1364
+ "Ġc",
1365
+ "ould"
1366
+ ],
1367
+ [
1368
+ "i",
1369
+ "c"
1370
+ ],
1371
+ [
1372
+ "Ġ",
1373
+ "out"
1374
+ ],
1375
+ [
1376
+ "il",
1377
+ "ed"
1378
+ ],
1379
+ [
1380
+ "o",
1381
+ "ne"
1382
+ ],
1383
+ [
1384
+ "r",
1385
+ "ound"
1386
+ ],
1387
+ [
1388
+ "r",
1389
+ "a"
1390
+ ],
1391
+ [
1392
+ "r",
1393
+ "y"
1394
+ ],
1395
+ [
1396
+ "Ġsm",
1397
+ "iled"
1398
+ ],
1399
+ [
1400
+ "Ġh",
1401
+ "im"
1402
+ ],
1403
+ [
1404
+ "Ġ",
1405
+ "A"
1406
+ ],
1407
+ [
1408
+ "Ġm",
1409
+ "om"
1410
+ ],
1411
+ [
1412
+ "he",
1413
+ "n"
1414
+ ],
1415
+ [
1416
+ "w",
1417
+ "ay"
1418
+ ],
1419
+ [
1420
+ "u",
1421
+ "r"
1422
+ ],
1423
+ [
1424
+ "ĠI",
1425
+ "t"
1426
+ ],
1427
+ [
1428
+ "fu",
1429
+ "l"
1430
+ ],
1431
+ [
1432
+ "a",
1433
+ "in"
1434
+ ],
1435
+ [
1436
+ "Ġw",
1437
+ "ent"
1438
+ ],
1439
+ [
1440
+ "Ġhe",
1441
+ "l"
1442
+ ],
1443
+ [
1444
+ "Ġn",
1445
+ "ot"
1446
+ ],
1447
+ [
1448
+ "ĠThe",
1449
+ "y"
1450
+ ],
1451
+ [
1452
+ "Ġwan",
1453
+ "ted"
1454
+ ],
1455
+ [
1456
+ "i",
1457
+ "nd"
1458
+ ],
1459
+ [
1460
+ "a",
1461
+ "re"
1462
+ ],
1463
+ [
1464
+ "e",
1465
+ "ar"
1466
+ ],
1467
+ [
1468
+ "Ġ",
1469
+ "M"
1470
+ ],
1471
+ [
1472
+ "Ġa",
1473
+ "ll"
1474
+ ],
1475
+ [
1476
+ "Ġfriend",
1477
+ "s"
1478
+ ],
1479
+ [
1480
+ "Ġto",
1481
+ "o"
1482
+ ],
1483
+ [
1484
+ "Ġg",
1485
+ "o"
1486
+ ],
1487
+ [
1488
+ "il",
1489
+ "y"
1490
+ ],
1491
+ [
1492
+ "am",
1493
+ "e"
1494
+ ],
1495
+ [
1496
+ "ĠT",
1497
+ "im"
1498
+ ],
1499
+ [
1500
+ "Ġhel",
1501
+ "p"
1502
+ ],
1503
+ [
1504
+ "om",
1505
+ "et"
1506
+ ],
1507
+ [
1508
+ "Ġ",
1509
+ "L"
1510
+ ],
1511
+ [
1512
+ "Ġl",
1513
+ "o"
1514
+ ],
1515
+ [
1516
+ "g",
1517
+ "ht"
1518
+ ],
1519
+ [
1520
+ "Ġs",
1521
+ "omet"
1522
+ ],
1523
+ [
1524
+ "Ġa",
1525
+ "t"
1526
+ ],
1527
+ [
1528
+ "Ġd",
1529
+ "o"
1530
+ ],
1531
+ [
1532
+ "Ġas",
1533
+ "ked"
1534
+ ],
1535
+ [
1536
+ "!",
1537
+ "\""
1538
+ ],
1539
+ [
1540
+ "Ġa",
1541
+ "round"
1542
+ ],
1543
+ [
1544
+ "Ġ",
1545
+ "j"
1546
+ ],
1547
+ [
1548
+ "re",
1549
+ "e"
1550
+ ],
1551
+ [
1552
+ "Ġloo",
1553
+ "ked"
1554
+ ],
1555
+ [
1556
+ "Ġsomet",
1557
+ "hing"
1558
+ ],
1559
+ [
1560
+ "Ġs",
1561
+ "e"
1562
+ ],
1563
+ [
1564
+ "Ġw",
1565
+ "or"
1566
+ ],
1567
+ [
1568
+ "d",
1569
+ "d"
1570
+ ],
1571
+ [
1572
+ "he",
1573
+ "d"
1574
+ ],
1575
+ [
1576
+ "oo",
1577
+ "d"
1578
+ ],
1579
+ [
1580
+ "Ġc",
1581
+ "l"
1582
+ ],
1583
+ [
1584
+ "am",
1585
+ "ed"
1586
+ ],
1587
+ [
1588
+ "r",
1589
+ "o"
1590
+ ],
1591
+ [
1592
+ "Ġc",
1593
+ "an"
1594
+ ],
1595
+ [
1596
+ "ar",
1597
+ "k"
1598
+ ],
1599
+ [
1600
+ "k",
1601
+ "ing"
1602
+ ],
1603
+ [
1604
+ "Ġ",
1605
+ "E"
1606
+ ],
1607
+ [
1608
+ "r",
1609
+ "om"
1610
+ ],
1611
+ [
1612
+ "Ġb",
1613
+ "ack"
1614
+ ],
1615
+ [
1616
+ "Ġex",
1617
+ "c"
1618
+ ],
1619
+ [
1620
+ "a",
1621
+ "b"
1622
+ ],
1623
+ [
1624
+ "i",
1625
+ "ck"
1626
+ ]
1627
+ ]
1628
+ }
1629
+ }
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": null,
4
+ "corpus_glob": "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet",
5
+ "text_column": "text",
6
+ "output_dir": "runs/stage-dropout-multiseed",
7
+ "suite": "stage_dropouts",
8
+ "condition_name": "mild_decay_030_022_014_010",
9
+ "seeds": [
10
+ 1,
11
+ 2,
12
+ 3,
13
+ 4,
14
+ 5
15
+ ],
16
+ "initial_tokens": 5000000,
17
+ "stream_token_caps": [
18
+ 5000000,
19
+ 10000000,
20
+ 20000000,
21
+ 40000000
22
+ ],
23
+ "val_tokens": 500000,
24
+ "allow_short_corpus": false,
25
+ "force_retokenize": false,
26
+ "vocab_size": 4096,
27
+ "tokenizer_train_chars": 10000000,
28
+ "block_size": 128,
29
+ "batch_size": 16,
30
+ "small_layers": 4,
31
+ "small_heads": 4,
32
+ "small_embd": 128,
33
+ "large_layers": 8,
34
+ "large_heads": 8,
35
+ "large_embd": 256,
36
+ "steps_per_run": 2000,
37
+ "stream_steps_per_stage": 1000,
38
+ "eval_batches": 64,
39
+ "log_every": 500,
40
+ "lr": 0.0003,
41
+ "weight_decay": 0.1,
42
+ "grad_clip": 1.0,
43
+ "baseline_dropout": 0.1,
44
+ "high_dropout": 0.8,
45
+ "stage_dropouts": [
46
+ 0.3,
47
+ 0.22,
48
+ 0.14,
49
+ 0.1
50
+ ],
51
+ "dropout_decay_tokens": null,
52
+ "dropout_schedule": "cosine"
53
+ },
54
+ "device": "mps",
55
+ "torch": "2.9.1",
56
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
57
+ "tokenizer_path": "runs/stage-dropout-multiseed/20260523-080428/tokenizer.json",
58
+ "encoded_path": "runs/stage-dropout-multiseed/20260523-080428/tokens-v4096-uint16.npy",
59
+ "train_tokens": 40000160,
60
+ "val_tokens": 500000,
61
+ "effective_initial_tokens": 5000000,
62
+ "effective_stream_token_caps": [
63
+ 5000000,
64
+ 10000000,
65
+ 20000000,
66
+ 40000000
67
+ ]
68
+ }
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/metrics.jsonl ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.3, "train_loss_last": 5.58997917175293, "eval_loss": 5.751125849783421, "elapsed_sec": 44.69669818878174, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
2
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.22, "train_loss_last": 5.062872886657715, "eval_loss": 5.196203097701073, "elapsed_sec": 44.78069806098938, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
3
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.14, "train_loss_last": 4.952004432678223, "eval_loss": 4.905544601380825, "elapsed_sec": 45.88518309593201, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
4
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.558305263519287, "eval_loss": 4.7524150758981705, "elapsed_sec": 48.00120186805725, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
5
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 2, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.3, "train_loss_last": 5.574131011962891, "eval_loss": 5.8797784596681595, "elapsed_sec": 49.94093990325928, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
6
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 2, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.22, "train_loss_last": 5.157837867736816, "eval_loss": 5.257638864219189, "elapsed_sec": 51.3558988571167, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
7
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 2, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.14, "train_loss_last": 4.9385457038879395, "eval_loss": 4.926844872534275, "elapsed_sec": 52.127033948898315, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
8
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 2, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.661777973175049, "eval_loss": 4.708192884922028, "elapsed_sec": 52.372743129730225, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
9
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 3, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.3, "train_loss_last": 5.643334865570068, "eval_loss": 5.8101136311888695, "elapsed_sec": 54.14305257797241, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
10
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 3, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.22, "train_loss_last": 5.042954921722412, "eval_loss": 5.163986533880234, "elapsed_sec": 53.76744985580444, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
11
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 3, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.14, "train_loss_last": 4.887018203735352, "eval_loss": 4.921479769051075, "elapsed_sec": 53.323330879211426, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
12
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 3, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.6436262130737305, "eval_loss": 4.727723777294159, "elapsed_sec": 53.821449995040894, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
13
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 4, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.3, "train_loss_last": 5.597998142242432, "eval_loss": 5.849507503211498, "elapsed_sec": 52.95270609855652, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
14
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 4, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.22, "train_loss_last": 5.1180315017700195, "eval_loss": 5.21679612249136, "elapsed_sec": 52.781530141830444, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
15
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 4, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.14, "train_loss_last": 4.8136138916015625, "eval_loss": 4.85420498996973, "elapsed_sec": 52.83119297027588, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
16
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 4, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.631038665771484, "eval_loss": 4.644085839390755, "elapsed_sec": 52.84165716171265, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
17
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 5, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.3, "train_loss_last": 5.687386989593506, "eval_loss": 5.774208791553974, "elapsed_sec": 52.76228904724121, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
18
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 5, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.22, "train_loss_last": 5.182829856872559, "eval_loss": 5.188988164067268, "elapsed_sec": 52.99323225021362, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
19
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 5, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.14, "train_loss_last": 4.755054473876953, "eval_loss": 4.887606129050255, "elapsed_sec": 52.93570303916931, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
20
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 5, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.790690898895264, "eval_loss": 4.684750318527222, "elapsed_sec": 52.974026918411255, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/summary.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,0,5000000,8388608,5,5.812946847081184,0.05275804626515845
3
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,1,10000000,8388608,5,5.2047225564718245,0.03509089070083401
4
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,2,20000000,8388608,5,4.899136072397232,0.02941015721914179
5
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,3,40000000,8388608,5,4.703433579206466,0.04148213526035924
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/summary.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase5_stage_dropout_search",
4
+ "condition": "mild_decay_030_022_014_010",
5
+ "stage": 0,
6
+ "token_limit": 5000000,
7
+ "parameters": 8388608,
8
+ "n": 5,
9
+ "mean_eval_loss": 5.812946847081184,
10
+ "std_eval_loss": 0.05275804626515845
11
+ },
12
+ {
13
+ "phase": "phase5_stage_dropout_search",
14
+ "condition": "mild_decay_030_022_014_010",
15
+ "stage": 1,
16
+ "token_limit": 10000000,
17
+ "parameters": 8388608,
18
+ "n": 5,
19
+ "mean_eval_loss": 5.2047225564718245,
20
+ "std_eval_loss": 0.03509089070083401
21
+ },
22
+ {
23
+ "phase": "phase5_stage_dropout_search",
24
+ "condition": "mild_decay_030_022_014_010",
25
+ "stage": 2,
26
+ "token_limit": 20000000,
27
+ "parameters": 8388608,
28
+ "n": 5,
29
+ "mean_eval_loss": 4.899136072397232,
30
+ "std_eval_loss": 0.02941015721914179
31
+ },
32
+ {
33
+ "phase": "phase5_stage_dropout_search",
34
+ "condition": "mild_decay_030_022_014_010",
35
+ "stage": 3,
36
+ "token_limit": 40000000,
37
+ "parameters": 8388608,
38
+ "n": 5,
39
+ "mean_eval_loss": 4.703433579206466,
40
+ "std_eval_loss": 0.04148213526035924
41
+ }
42
+ ]
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-080428/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": null,
4
+ "corpus_glob": "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet",
5
+ "text_column": "text",
6
+ "output_dir": "runs/stage-dropout-multiseed",
7
+ "suite": "stage_dropouts",
8
+ "condition_name": "fast_decay_050_010_010_010",
9
+ "seeds": [
10
+ 1,
11
+ 2,
12
+ 3,
13
+ 4,
14
+ 5
15
+ ],
16
+ "initial_tokens": 5000000,
17
+ "stream_token_caps": [
18
+ 5000000,
19
+ 10000000,
20
+ 20000000,
21
+ 40000000
22
+ ],
23
+ "val_tokens": 500000,
24
+ "allow_short_corpus": false,
25
+ "force_retokenize": false,
26
+ "vocab_size": 4096,
27
+ "tokenizer_train_chars": 10000000,
28
+ "block_size": 128,
29
+ "batch_size": 16,
30
+ "small_layers": 4,
31
+ "small_heads": 4,
32
+ "small_embd": 128,
33
+ "large_layers": 8,
34
+ "large_heads": 8,
35
+ "large_embd": 256,
36
+ "steps_per_run": 2000,
37
+ "stream_steps_per_stage": 1000,
38
+ "eval_batches": 64,
39
+ "log_every": 500,
40
+ "lr": 0.0003,
41
+ "weight_decay": 0.1,
42
+ "grad_clip": 1.0,
43
+ "baseline_dropout": 0.1,
44
+ "high_dropout": 0.8,
45
+ "stage_dropouts": [
46
+ 0.5,
47
+ 0.1,
48
+ 0.1,
49
+ 0.1
50
+ ],
51
+ "dropout_decay_tokens": null,
52
+ "dropout_schedule": "cosine"
53
+ },
54
+ "device": "mps",
55
+ "torch": "2.9.1",
56
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
57
+ "tokenizer_path": "runs/stage-dropout-multiseed/20260523-082216/tokenizer.json",
58
+ "encoded_path": "runs/stage-dropout-multiseed/20260523-082216/tokens-v4096-uint16.npy",
59
+ "train_tokens": 40000160,
60
+ "val_tokens": 500000,
61
+ "effective_initial_tokens": 5000000,
62
+ "effective_stream_token_caps": [
63
+ 5000000,
64
+ 10000000,
65
+ 20000000,
66
+ 40000000
67
+ ]
68
+ }
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/metrics.jsonl ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.5, "train_loss_last": 5.845836162567139, "eval_loss": 6.154372617602348, "elapsed_sec": 49.94526386260986, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
2
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.0081586837768555, "eval_loss": 5.162623845040798, "elapsed_sec": 49.56790518760681, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
3
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.929193496704102, "eval_loss": 4.882447637617588, "elapsed_sec": 50.52619385719299, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
4
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.557829856872559, "eval_loss": 4.758259400725365, "elapsed_sec": 52.02269697189331, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
5
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 2, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.5, "train_loss_last": 5.822956085205078, "eval_loss": 6.242104984819889, "elapsed_sec": 52.7607798576355, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
6
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 2, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.105504035949707, "eval_loss": 5.223574630916119, "elapsed_sec": 53.23060607910156, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
7
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 2, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.897387504577637, "eval_loss": 4.94177483022213, "elapsed_sec": 53.174906969070435, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
8
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 2, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.672600746154785, "eval_loss": 4.714911535382271, "elapsed_sec": 53.14971899986267, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
9
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 3, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.5, "train_loss_last": 5.9172844886779785, "eval_loss": 6.228554867208004, "elapsed_sec": 53.18459105491638, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
10
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 3, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.001498222351074, "eval_loss": 5.157509610056877, "elapsed_sec": 53.15500092506409, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
11
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 3, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.853842735290527, "eval_loss": 4.896874591708183, "elapsed_sec": 53.38427805900574, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
12
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 3, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.641301155090332, "eval_loss": 4.713992565870285, "elapsed_sec": 53.49088001251221, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
13
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 4, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.5, "train_loss_last": 5.845858573913574, "eval_loss": 6.229139141738415, "elapsed_sec": 53.52108716964722, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
14
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 4, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.085396766662598, "eval_loss": 5.225671216845512, "elapsed_sec": 53.089759826660156, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
15
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 4, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.777719020843506, "eval_loss": 4.850255951285362, "elapsed_sec": 52.85016393661499, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
16
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 4, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.632856369018555, "eval_loss": 4.651223048567772, "elapsed_sec": 52.797423124313354, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
17
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 5, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.5, "train_loss_last": 5.914695739746094, "eval_loss": 6.148795463144779, "elapsed_sec": 52.897231101989746, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
18
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 5, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.1, "train_loss_last": 5.125888347625732, "eval_loss": 5.160975947976112, "elapsed_sec": 52.83045816421509, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
19
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 5, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.1, "train_loss_last": 4.719311714172363, "eval_loss": 4.865957573056221, "elapsed_sec": 52.88761901855469, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
20
+ {"phase": "phase5_stage_dropout_search", "condition": "fast_decay_050_010_010_010", "seed": 5, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.78352165222168, "eval_loss": 4.673870116472244, "elapsed_sec": 52.91818380355835, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/summary.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase5_stage_dropout_search,fast_decay_050_010_010_010,0,5000000,8388608,5,6.200593414902687,0.0451090392326937
3
+ phase5_stage_dropout_search,fast_decay_050_010_010_010,1,10000000,8388608,5,5.186071050167084,0.0352490539353081
4
+ phase5_stage_dropout_search,fast_decay_050_010_010_010,2,20000000,8388608,5,4.887462116777897,0.03503696147846281
5
+ phase5_stage_dropout_search,fast_decay_050_010_010_010,3,40000000,8388608,5,4.702451333403587,0.041364901144835056
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/summary.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase5_stage_dropout_search",
4
+ "condition": "fast_decay_050_010_010_010",
5
+ "stage": 0,
6
+ "token_limit": 5000000,
7
+ "parameters": 8388608,
8
+ "n": 5,
9
+ "mean_eval_loss": 6.200593414902687,
10
+ "std_eval_loss": 0.0451090392326937
11
+ },
12
+ {
13
+ "phase": "phase5_stage_dropout_search",
14
+ "condition": "fast_decay_050_010_010_010",
15
+ "stage": 1,
16
+ "token_limit": 10000000,
17
+ "parameters": 8388608,
18
+ "n": 5,
19
+ "mean_eval_loss": 5.186071050167084,
20
+ "std_eval_loss": 0.0352490539353081
21
+ },
22
+ {
23
+ "phase": "phase5_stage_dropout_search",
24
+ "condition": "fast_decay_050_010_010_010",
25
+ "stage": 2,
26
+ "token_limit": 20000000,
27
+ "parameters": 8388608,
28
+ "n": 5,
29
+ "mean_eval_loss": 4.887462116777897,
30
+ "std_eval_loss": 0.03503696147846281
31
+ },
32
+ {
33
+ "phase": "phase5_stage_dropout_search",
34
+ "condition": "fast_decay_050_010_010_010",
35
+ "stage": 3,
36
+ "token_limit": 40000000,
37
+ "parameters": 8388608,
38
+ "n": 5,
39
+ "mean_eval_loss": 4.702451333403587,
40
+ "std_eval_loss": 0.041364901144835056
41
+ }
42
+ ]
archive/runs_legacy_20260525/stage-dropout-multiseed/20260523-082216/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": null,
4
+ "corpus_glob": "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet",
5
+ "text_column": "text",
6
+ "output_dir": "runs/stage-dropout-single",
7
+ "suite": "stage_dropouts",
8
+ "condition_name": "mild_decay_030_022_014_010",
9
+ "seeds": [
10
+ 1
11
+ ],
12
+ "initial_tokens": 5000000,
13
+ "stream_token_caps": [
14
+ 5000000,
15
+ 10000000,
16
+ 20000000,
17
+ 40000000
18
+ ],
19
+ "val_tokens": 500000,
20
+ "allow_short_corpus": false,
21
+ "force_retokenize": false,
22
+ "vocab_size": 4096,
23
+ "tokenizer_train_chars": 10000000,
24
+ "block_size": 128,
25
+ "batch_size": 16,
26
+ "small_layers": 4,
27
+ "small_heads": 4,
28
+ "small_embd": 128,
29
+ "large_layers": 8,
30
+ "large_heads": 8,
31
+ "large_embd": 256,
32
+ "steps_per_run": 2000,
33
+ "stream_steps_per_stage": 1000,
34
+ "eval_batches": 64,
35
+ "log_every": 500,
36
+ "lr": 0.0003,
37
+ "weight_decay": 0.1,
38
+ "grad_clip": 1.0,
39
+ "baseline_dropout": 0.1,
40
+ "high_dropout": 0.8,
41
+ "stage_dropouts": [
42
+ 0.3,
43
+ 0.22,
44
+ 0.14,
45
+ 0.1
46
+ ],
47
+ "dropout_decay_tokens": null,
48
+ "dropout_schedule": "cosine"
49
+ },
50
+ "device": "mps",
51
+ "torch": "2.9.1",
52
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
53
+ "tokenizer_path": "runs/stage-dropout-single/20260523-075308/tokenizer.json",
54
+ "encoded_path": "runs/stage-dropout-single/20260523-075308/tokens-v4096-uint16.npy",
55
+ "train_tokens": 40000160,
56
+ "val_tokens": 500000,
57
+ "effective_initial_tokens": 5000000,
58
+ "effective_stream_token_caps": [
59
+ 5000000,
60
+ 10000000,
61
+ 20000000,
62
+ 40000000
63
+ ]
64
+ }
archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/metrics.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.3, "train_loss_last": 5.58997917175293, "eval_loss": 5.751125819981098, "elapsed_sec": 41.68040323257446, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
2
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.22, "train_loss_last": 5.062872886657715, "eval_loss": 5.1962031945586205, "elapsed_sec": 42.56834697723389, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
3
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.14, "train_loss_last": 4.952004432678223, "eval_loss": 4.905544675886631, "elapsed_sec": 42.689231157302856, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
4
+ {"phase": "phase5_stage_dropout_search", "condition": "mild_decay_030_022_014_010", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.558305740356445, "eval_loss": 4.752415433526039, "elapsed_sec": 42.67094111442566, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.3}}
archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/summary.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ phase,condition,stage,token_limit,parameters,n,mean_eval_loss,std_eval_loss
2
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,0,5000000,8388608,1,5.751125819981098,0.0
3
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,1,10000000,8388608,1,5.1962031945586205,0.0
4
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,2,20000000,8388608,1,4.905544675886631,0.0
5
+ phase5_stage_dropout_search,mild_decay_030_022_014_010,3,40000000,8388608,1,4.752415433526039,0.0
archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/summary.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "phase": "phase5_stage_dropout_search",
4
+ "condition": "mild_decay_030_022_014_010",
5
+ "stage": 0,
6
+ "token_limit": 5000000,
7
+ "parameters": 8388608,
8
+ "n": 1,
9
+ "mean_eval_loss": 5.751125819981098,
10
+ "std_eval_loss": 0.0
11
+ },
12
+ {
13
+ "phase": "phase5_stage_dropout_search",
14
+ "condition": "mild_decay_030_022_014_010",
15
+ "stage": 1,
16
+ "token_limit": 10000000,
17
+ "parameters": 8388608,
18
+ "n": 1,
19
+ "mean_eval_loss": 5.1962031945586205,
20
+ "std_eval_loss": 0.0
21
+ },
22
+ {
23
+ "phase": "phase5_stage_dropout_search",
24
+ "condition": "mild_decay_030_022_014_010",
25
+ "stage": 2,
26
+ "token_limit": 20000000,
27
+ "parameters": 8388608,
28
+ "n": 1,
29
+ "mean_eval_loss": 4.905544675886631,
30
+ "std_eval_loss": 0.0
31
+ },
32
+ {
33
+ "phase": "phase5_stage_dropout_search",
34
+ "condition": "mild_decay_030_022_014_010",
35
+ "stage": 3,
36
+ "token_limit": 40000000,
37
+ "parameters": 8388608,
38
+ "n": 1,
39
+ "mean_eval_loss": 4.752415433526039,
40
+ "std_eval_loss": 0.0
41
+ }
42
+ ]
archive/runs_legacy_20260525/stage-dropout-single/20260523-075308/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
archive/runs_legacy_20260525/stage-dropout-single/20260523-075636/config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "args": {
3
+ "corpus": null,
4
+ "corpus_glob": "/Users/mandeepsidhu/Desktop/code/nanochat/.nanochat-cache/base_data_climbmix/shard_*.parquet",
5
+ "text_column": "text",
6
+ "output_dir": "runs/stage-dropout-single",
7
+ "suite": "stage_dropouts",
8
+ "condition_name": "medium_decay_050_032_016_010",
9
+ "seeds": [
10
+ 1
11
+ ],
12
+ "initial_tokens": 5000000,
13
+ "stream_token_caps": [
14
+ 5000000,
15
+ 10000000,
16
+ 20000000,
17
+ 40000000
18
+ ],
19
+ "val_tokens": 500000,
20
+ "allow_short_corpus": false,
21
+ "force_retokenize": false,
22
+ "vocab_size": 4096,
23
+ "tokenizer_train_chars": 10000000,
24
+ "block_size": 128,
25
+ "batch_size": 16,
26
+ "small_layers": 4,
27
+ "small_heads": 4,
28
+ "small_embd": 128,
29
+ "large_layers": 8,
30
+ "large_heads": 8,
31
+ "large_embd": 256,
32
+ "steps_per_run": 2000,
33
+ "stream_steps_per_stage": 1000,
34
+ "eval_batches": 64,
35
+ "log_every": 500,
36
+ "lr": 0.0003,
37
+ "weight_decay": 0.1,
38
+ "grad_clip": 1.0,
39
+ "baseline_dropout": 0.1,
40
+ "high_dropout": 0.8,
41
+ "stage_dropouts": [
42
+ 0.5,
43
+ 0.32,
44
+ 0.16,
45
+ 0.1
46
+ ],
47
+ "dropout_decay_tokens": null,
48
+ "dropout_schedule": "cosine"
49
+ },
50
+ "device": "mps",
51
+ "torch": "2.9.1",
52
+ "attribution": "Derived from Andrej Karpathy's nanochat project (https://github.com/karpathy/nanochat), MIT License, Copyright (c) 2025 Andrej Karpathy.",
53
+ "tokenizer_path": "runs/stage-dropout-single/20260523-075636/tokenizer.json",
54
+ "encoded_path": "runs/stage-dropout-single/20260523-075636/tokens-v4096-uint16.npy",
55
+ "train_tokens": 40000160,
56
+ "val_tokens": 500000,
57
+ "effective_initial_tokens": 5000000,
58
+ "effective_stream_token_caps": [
59
+ 5000000,
60
+ 10000000,
61
+ 20000000,
62
+ 40000000
63
+ ]
64
+ }
archive/runs_legacy_20260525/stage-dropout-single/20260523-075636/metrics.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"phase": "phase5_stage_dropout_search", "condition": "medium_decay_050_032_016_010", "seed": 1, "stage": 0, "token_limit": 5000000, "steps": 1000, "tokens_seen": 2048000, "dropout": 0.5, "train_loss_last": 5.845836162567139, "eval_loss": 6.154372535645962, "elapsed_sec": 41.894405126571655, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
2
+ {"phase": "phase5_stage_dropout_search", "condition": "medium_decay_050_032_016_010", "seed": 1, "stage": 1, "token_limit": 10000000, "steps": 1000, "tokens_seen": 4096000, "dropout": 0.32, "train_loss_last": 5.22263240814209, "eval_loss": 5.326081059873104, "elapsed_sec": 42.87401509284973, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
3
+ {"phase": "phase5_stage_dropout_search", "condition": "medium_decay_050_032_016_010", "seed": 1, "stage": 2, "token_limit": 20000000, "steps": 1000, "tokens_seen": 6144000, "dropout": 0.16, "train_loss_last": 5.021544456481934, "eval_loss": 4.956579685211182, "elapsed_sec": 44.26910591125488, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}
4
+ {"phase": "phase5_stage_dropout_search", "condition": "medium_decay_050_032_016_010", "seed": 1, "stage": 3, "token_limit": 40000000, "steps": 1000, "tokens_seen": 8192000, "dropout": 0.1, "train_loss_last": 4.601977348327637, "eval_loss": 4.809504576027393, "elapsed_sec": 45.77408504486084, "parameters": 8388608, "model_config": {"block_size": 128, "vocab_size": 4096, "n_layer": 8, "n_head": 8, "n_embd": 256, "dropout": 0.5}}