Mandeep Sidhu committed on
Commit ·
ad87b3e
1
Parent(s): 132f07c
Add static dropout law sweep results
Browse files
- docs/screen_static_results.md +318 -77
- runs/screen_static/20260525-133008/RESULT_SUMMARY.md +318 -77
- runs/screen_static/20260525-133008/config.resume.json +28 -2
- runs/screen_static/20260525-133008/dropout_curves.svg +376 -99
- runs/screen_static/20260525-133008/metrics.jsonl +0 -0
- runs/screen_static/20260525-133008/model_selection.csv +15 -4
- runs/screen_static/20260525-133008/model_selection.json +304 -7
- runs/screen_static/20260525-133008/summary.csv +164 -0
- runs/screen_static/20260525-133008/summary.json +0 -0
- runs/screen_static/20260525-133008/trace.jsonl +0 -0
- src/dropout_decay/experiment.py +317 -56
docs/screen_static_results.md
CHANGED
|
@@ -2,80 +2,321 @@
|
|
| 2 |
|
| 3 |
Run directory: `runs/screen_static/20260525-133008`
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
| 11 |
-
|
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
| 20 |
-
|
|
| 21 |
-
| 0.
|
| 22 |
-
| 0.
|
| 23 |
-
| 0.
|
| 24 |
-
| 0.
|
| 25 |
-
| 0.14 |
|
| 26 |
-
| 0.
|
| 27 |
-
| 0.
|
| 28 |
-
| 0.
|
| 29 |
-
| 0.
|
| 30 |
-
| 0.
|
| 31 |
-
| 0.
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
## Prefix
|
| 36 |
-
|
| 37 |
-
| Dropout | Val loss | Train
|
| 38 |
-
|---:|---:|---:|---:|---:|---:|
|
| 39 |
-
| 0.00 |
|
| 40 |
-
| 0.02 |
|
| 41 |
-
| 0.05 |
|
| 42 |
-
| 0.
|
| 43 |
-
| 0.
|
| 44 |
-
| 0.
|
| 45 |
-
| 0.
|
| 46 |
-
| 0.
|
| 47 |
-
| 0.
|
| 48 |
-
| 0.
|
| 49 |
-
| 0.
|
| 50 |
-
| 0.
|
| 51 |
-
| 0.
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
| 57 |
-
|
|
| 58 |
-
| 0.
|
| 59 |
-
| 0.
|
| 60 |
-
| 0.
|
| 61 |
-
| 0.
|
| 62 |
-
| 0.
|
| 63 |
-
| 0.
|
| 64 |
-
| 0.
|
| 65 |
-
| 0.
|
| 66 |
-
| 0.
|
| 67 |
-
| 0.
|
| 68 |
-
| 0.
|
| 69 |
-
| 0.
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
| 76 |
-
|
|
| 77 |
-
| 0.
|
| 78 |
-
| 0.
|
| 79 |
-
| 0.
|
| 80 |
-
| 0.
|
| 81 |
-
| 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
Run directory: `runs/screen_static/20260525-133008`
|
| 4 |
|
| 5 |
+
## Models
|
| 6 |
+
|
| 7 |
+
| Model | Params | Layers | Heads | Embedding | Block | Vocab | Seeds |
|
| 8 |
+
|---|---:|---:|---:|---:|---:|---:|---|
|
| 9 |
+
| `L8_H8_D256` | 8,388,608 | 8 | 8 | 256 | 128 | 4096 | 1 |
|
| 10 |
+
| `L12_H8_D320` | 17,367,040 | 12 | 8 | 320 | 128 | 4096 | 1 |
|
| 11 |
+
| `L16_H8_D384` | 31,457,280 | 16 | 8 | 384 | 128 | 4096 | 1 |
|
| 12 |
+
|
| 13 |
+
## Best Dropout By Model And Prefix
|
| 14 |
+
|
| 15 |
+
| Model | Prefix tokens | Effective epochs | Best dropout | Mean val loss | Val std | Mean train loss | Mean gap | Plateau/bracket note |
|
| 16 |
+
|---|---:|---:|---:|---:|---:|---:|---:|---|
|
| 17 |
+
| `L12_H8_D320` | 250,000 | 40.96 | 0.50 | 5.4384 | 0.0000 | 3.3720 | 2.0663 | bracketed by tested grid |
|
| 18 |
+
| `L12_H8_D320` | 500,000 | 20.48 | 0.40 | 4.9791 | 0.0000 | 3.7358 | 1.2434 | bracketed by tested grid |
|
| 19 |
+
| `L12_H8_D320` | 1,000,000 | 10.24 | 0.20 | 4.6871 | 0.0000 | 3.7160 | 0.9711 | bracketed by tested grid |
|
| 20 |
+
| `L12_H8_D320` | 2,000,000 | 5.12 | 0.14 | 4.5088 | 0.0000 | 4.0218 | 0.4870 | bracketed by tested grid |
|
| 21 |
+
| `L12_H8_D320` | 4,000,000 | 2.56 | 0.02 | 4.3875 | 0.0000 | 4.0300 | 0.3575 | bracketed by tested grid |
|
| 22 |
+
| `L16_H8_D384` | 250,000 | 40.96 | 0.60 | 5.5055 | 0.0000 | 3.3185 | 2.1870 | bracketed by tested grid |
|
| 23 |
+
| `L16_H8_D384` | 500,000 | 20.48 | 0.40 | 4.9814 | 0.0000 | 3.2797 | 1.7017 | bracketed by tested grid |
|
| 24 |
+
| `L16_H8_D384` | 1,000,000 | 10.24 | 0.30 | 4.6511 | 0.0000 | 3.6295 | 1.0216 | bracketed by tested grid |
|
| 25 |
+
| `L16_H8_D384` | 2,000,000 | 5.12 | 0.14 | 4.4270 | 0.0000 | 3.7761 | 0.6509 | bracketed by tested grid |
|
| 26 |
+
| `L16_H8_D384` | 4,000,000 | 2.56 | 0.02 | 4.2947 | 0.0000 | 3.8547 | 0.4400 | bracketed by tested grid |
|
| 27 |
+
| `L8_H8_D256` | 250,000 | 40.96 | 0.40 | 5.4175 | 0.0000 | 3.6411 | 1.7763 | bracketed by tested grid |
|
| 28 |
+
| `L8_H8_D256` | 500,000 | 20.48 | 0.20 | 5.0216 | 0.0000 | 3.6979 | 1.3238 | bracketed by tested grid |
|
| 29 |
+
| `L8_H8_D256` | 1,000,000 | 10.24 | 0.14 | 4.7763 | 0.0000 | 3.9900 | 0.7863 | bracketed by tested grid |
|
| 30 |
+
| `L8_H8_D256` | 2,000,000 | 5.12 | 0.08 | 4.6232 | 0.0000 | 4.2158 | 0.4074 | bracketed by tested grid |
|
| 31 |
+
| `L8_H8_D256` | 4,000,000 | 2.56 | 0.00 | 4.5136 | 0.0000 | 4.2515 | 0.2621 | not bracketed; best at bottom of tested grid |
|
| 32 |
+
|
| 33 |
+
## Model `L12_H8_D320`
|
| 34 |
+
|
| 35 |
+
### Prefix 250,000 Tokens (40.96 Effective Epochs)
|
| 36 |
+
|
| 37 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 38 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 39 |
+
| 0.00 | 1 | 8.1559 | 0.0000 | 0.3941 | 0.0000 | 7.7618 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 40 |
+
| 0.02 | 1 | 7.9925 | 0.0000 | 0.4179 | 0.0000 | 7.5746 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 41 |
+
| 0.05 | 1 | 7.5209 | 0.0000 | 0.5652 | 0.0000 | 6.9557 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 42 |
+
| 0.08 | 1 | 7.1101 | 0.0000 | 0.7794 | 0.0000 | 6.3306 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 43 |
+
| 0.10 | 1 | 6.9556 | 0.0000 | 0.9396 | 0.0000 | 6.0159 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 44 |
+
| 0.14 | 1 | 6.5573 | 0.0000 | 1.2607 | 0.0000 | 5.2965 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 45 |
+
| 0.20 | 1 | 6.1481 | 0.0000 | 1.6682 | 0.0000 | 4.4800 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 46 |
+
| 0.30 | 1 | 5.7129 | 0.0000 | 2.3463 | 0.0000 | 3.3666 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 47 |
+
| 0.40 | 1 | 5.5185 | 0.0000 | 2.8816 | 0.0000 | 2.6369 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 48 |
+
| 0.50 | 1 | 5.4384 | 0.0000 | 3.3720 | 0.0000 | 2.0663 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 49 |
+
| 0.60 | 1 | 5.5007 | 0.0000 | 3.8245 | 0.0000 | 1.6762 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 50 |
+
| 0.70 | 1 | 5.6532 | 0.0000 | 4.3150 | 0.0000 | 1.3382 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 51 |
+
| 0.80 | 1 | 5.9260 | 0.0000 | 4.9075 | 0.0000 | 1.0185 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 52 |
+
| 0.90 | 1 | 6.9057 | 0.0000 | 6.4033 | 0.0000 | 0.5024 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 53 |
+
|
| 54 |
+
### Prefix 500,000 Tokens (20.48 Effective Epochs)
|
| 55 |
+
|
| 56 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 57 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 58 |
+
| 0.00 | 1 | 6.8822 | 0.0000 | 1.4379 | 0.0000 | 5.4443 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 59 |
+
| 0.02 | 1 | 6.3843 | 0.0000 | 1.7487 | 0.0000 | 4.6356 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 60 |
+
| 0.05 | 1 | 5.8958 | 0.0000 | 2.0567 | 0.0000 | 3.8390 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 61 |
+
| 0.08 | 1 | 5.5972 | 0.0000 | 2.3715 | 0.0000 | 3.2256 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 62 |
+
| 0.10 | 1 | 5.4479 | 0.0000 | 2.4894 | 0.0000 | 2.9585 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 63 |
+
| 0.14 | 1 | 5.2595 | 0.0000 | 2.7469 | 0.0000 | 2.5126 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 64 |
+
| 0.20 | 1 | 5.0960 | 0.0000 | 3.0360 | 0.0000 | 2.0600 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 65 |
+
| 0.30 | 1 | 4.9841 | 0.0000 | 3.4098 | 0.0000 | 1.5742 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 66 |
+
| 0.40 | 1 | 4.9791 | 0.0000 | 3.7358 | 0.0000 | 1.2434 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 67 |
+
| 0.50 | 1 | 5.0582 | 0.0000 | 4.0295 | 0.0000 | 1.0287 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 68 |
+
| 0.60 | 1 | 5.1838 | 0.0000 | 4.3228 | 0.0000 | 0.8609 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 69 |
+
| 0.70 | 1 | 5.3842 | 0.0000 | 4.6806 | 0.0000 | 0.7036 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 70 |
+
| 0.80 | 1 | 5.6878 | 0.0000 | 5.1572 | 0.0000 | 0.5306 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 71 |
+
| 0.90 | 1 | 6.7661 | 0.0000 | 6.4923 | 0.0000 | 0.2738 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 72 |
+
|
| 73 |
+
### Prefix 1,000,000 Tokens (10.24 Effective Epochs)
|
| 74 |
+
|
| 75 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 76 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 77 |
+
| 0.00 | 1 | 5.3280 | 0.0000 | 2.8864 | 0.0000 | 2.4416 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 78 |
+
| 0.02 | 1 | 5.0988 | 0.0000 | 3.0629 | 0.0000 | 2.0360 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 79 |
+
| 0.05 | 1 | 4.9027 | 0.0000 | 3.2683 | 0.0000 | 1.6344 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 80 |
+
| 0.08 | 1 | 4.7898 | 0.0000 | 3.3917 | 0.0000 | 1.3982 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 81 |
+
| 0.10 | 1 | 4.7517 | 0.0000 | 3.4663 | 0.0000 | 1.2854 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 82 |
+
| 0.14 | 1 | 4.7170 | 0.0000 | 3.5913 | 0.0000 | 1.1257 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 83 |
+
| 0.20 | 1 | 4.6871 | 0.0000 | 3.7160 | 0.0000 | 0.9711 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 84 |
+
| 0.30 | 1 | 4.7214 | 0.0000 | 3.9408 | 0.0000 | 0.7806 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 85 |
+
| 0.40 | 1 | 4.7805 | 0.0000 | 4.1258 | 0.0000 | 0.6548 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 86 |
+
| 0.50 | 1 | 4.8997 | 0.0000 | 4.3420 | 0.0000 | 0.5577 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 87 |
+
| 0.60 | 1 | 5.0594 | 0.0000 | 4.5832 | 0.0000 | 0.4762 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 88 |
+
| 0.70 | 1 | 5.2577 | 0.0000 | 4.8660 | 0.0000 | 0.3917 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 89 |
+
| 0.80 | 1 | 5.5697 | 0.0000 | 5.2791 | 0.0000 | 0.2905 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 90 |
+
| 0.90 | 1 | 6.7485 | 0.0000 | 6.6291 | 0.0000 | 0.1194 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 91 |
+
|
| 92 |
+
### Prefix 2,000,000 Tokens (5.12 Effective Epochs)
|
| 93 |
+
|
| 94 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 95 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 96 |
+
| 0.00 | 1 | 4.5953 | 0.0000 | 3.7160 | 0.0000 | 0.8793 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 97 |
+
| 0.02 | 1 | 4.5495 | 0.0000 | 3.7767 | 0.0000 | 0.7728 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 98 |
+
| 0.05 | 1 | 4.5181 | 0.0000 | 3.8594 | 0.0000 | 0.6587 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 99 |
+
| 0.08 | 1 | 4.5121 | 0.0000 | 3.9309 | 0.0000 | 0.5811 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 100 |
+
| 0.10 | 1 | 4.5212 | 0.0000 | 3.9627 | 0.0000 | 0.5585 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 101 |
+
| 0.14 | 1 | 4.5088 | 0.0000 | 4.0218 | 0.0000 | 0.4870 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 102 |
+
| 0.20 | 1 | 4.5470 | 0.0000 | 4.1078 | 0.0000 | 0.4392 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 103 |
+
| 0.30 | 1 | 4.6074 | 0.0000 | 4.2488 | 0.0000 | 0.3586 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 104 |
+
| 0.40 | 1 | 4.6953 | 0.0000 | 4.3886 | 0.0000 | 0.3067 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 105 |
+
| 0.50 | 1 | 4.8144 | 0.0000 | 4.5650 | 0.0000 | 0.2494 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 106 |
+
| 0.60 | 1 | 4.9685 | 0.0000 | 4.7562 | 0.0000 | 0.2123 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 107 |
+
| 0.70 | 1 | 5.1800 | 0.0000 | 5.0098 | 0.0000 | 0.1702 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 108 |
+
| 0.80 | 1 | 5.5207 | 0.0000 | 5.4023 | 0.0000 | 0.1184 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 109 |
+
| 0.90 | 1 | 6.7411 | 0.0000 | 6.7211 | 0.0000 | 0.0200 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 110 |
+
|
| 111 |
+
### Prefix 4,000,000 Tokens (2.56 Effective Epochs)
|
| 112 |
+
|
| 113 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 114 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 115 |
+
| 0.00 | 1 | 4.3999 | 0.0000 | 4.0323 | 0.0000 | 0.3676 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 116 |
+
| 0.02 | 1 | 4.3875 | 0.0000 | 4.0300 | 0.0000 | 0.3575 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 117 |
+
| 0.05 | 1 | 4.4063 | 0.0000 | 4.0928 | 0.0000 | 0.3135 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 118 |
+
| 0.08 | 1 | 4.3982 | 0.0000 | 4.1090 | 0.0000 | 0.2892 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 119 |
+
| 0.10 | 1 | 4.4147 | 0.0000 | 4.1356 | 0.0000 | 0.2791 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 120 |
+
| 0.14 | 1 | 4.4444 | 0.0000 | 4.1944 | 0.0000 | 0.2500 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 121 |
+
| 0.20 | 1 | 4.4814 | 0.0000 | 4.2480 | 0.0000 | 0.2334 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 122 |
+
| 0.30 | 1 | 4.5457 | 0.0000 | 4.3479 | 0.0000 | 0.1978 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 123 |
+
| 0.40 | 1 | 4.6496 | 0.0000 | 4.4866 | 0.0000 | 0.1630 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 124 |
+
| 0.50 | 1 | 4.7691 | 0.0000 | 4.6345 | 0.0000 | 0.1346 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 125 |
+
| 0.60 | 1 | 4.9325 | 0.0000 | 4.8095 | 0.0000 | 0.1230 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 126 |
+
| 0.70 | 1 | 5.1557 | 0.0000 | 5.0564 | 0.0000 | 0.0993 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 127 |
+
| 0.80 | 1 | 5.4885 | 0.0000 | 5.4295 | 0.0000 | 0.0590 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 128 |
+
| 0.90 | 1 | 6.7736 | 0.0000 | 6.7588 | 0.0000 | 0.0147 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 129 |
+
|
| 130 |
+
## Model `L16_H8_D384`
|
| 131 |
+
|
| 132 |
+
### Prefix 250,000 Tokens (40.96 Effective Epochs)
|
| 133 |
+
|
| 134 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 135 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 136 |
+
| 0.00 | 1 | 8.2018 | 0.0000 | 0.2792 | 0.0000 | 7.9225 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 137 |
+
| 0.02 | 1 | 8.0714 | 0.0000 | 0.2606 | 0.0000 | 7.8108 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 138 |
+
| 0.05 | 1 | 7.9582 | 0.0000 | 0.2673 | 0.0000 | 7.6909 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 139 |
+
| 0.08 | 1 | 7.7608 | 0.0000 | 0.2968 | 0.0000 | 7.4640 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 140 |
+
| 0.10 | 1 | 7.5884 | 0.0000 | 0.3386 | 0.0000 | 7.2498 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 141 |
+
| 0.14 | 1 | 7.2849 | 0.0000 | 0.4997 | 0.0000 | 6.7851 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 142 |
+
| 0.20 | 1 | 6.8383 | 0.0000 | 0.8225 | 0.0000 | 6.0158 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 143 |
+
| 0.30 | 1 | 6.2429 | 0.0000 | 1.3910 | 0.0000 | 4.8519 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 144 |
+
| 0.40 | 1 | 5.8444 | 0.0000 | 2.0533 | 0.0000 | 3.7911 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 145 |
+
| 0.50 | 1 | 5.5925 | 0.0000 | 2.6789 | 0.0000 | 2.9136 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 146 |
+
| 0.60 | 1 | 5.5055 | 0.0000 | 3.3185 | 0.0000 | 2.1870 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 147 |
+
| 0.70 | 1 | 5.5800 | 0.0000 | 3.9233 | 0.0000 | 1.6567 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 148 |
+
| 0.80 | 1 | 5.8399 | 0.0000 | 4.7252 | 0.0000 | 1.1147 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 149 |
+
| 0.90 | 1 | 6.8439 | 0.0000 | 6.3553 | 0.0000 | 0.4885 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 150 |
+
|
| 151 |
+
### Prefix 500,000 Tokens (20.48 Effective Epochs)
|
| 152 |
+
|
| 153 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 154 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 155 |
+
| 0.00 | 1 | 7.2889 | 0.0000 | 0.7200 | 0.0000 | 6.5689 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 156 |
+
| 0.02 | 1 | 6.8954 | 0.0000 | 0.9118 | 0.0000 | 5.9837 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 157 |
+
| 0.05 | 1 | 6.4974 | 0.0000 | 1.2327 | 0.0000 | 5.2647 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 158 |
+
| 0.08 | 1 | 6.1079 | 0.0000 | 1.5379 | 0.0000 | 4.5700 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 159 |
+
| 0.10 | 1 | 5.9033 | 0.0000 | 1.6819 | 0.0000 | 4.2214 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 160 |
+
| 0.14 | 1 | 5.6262 | 0.0000 | 1.9904 | 0.0000 | 3.6358 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 161 |
+
| 0.20 | 1 | 5.3271 | 0.0000 | 2.4160 | 0.0000 | 2.9111 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 162 |
+
| 0.30 | 1 | 5.0666 | 0.0000 | 2.8953 | 0.0000 | 2.1713 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 163 |
+
| 0.40 | 1 | 4.9814 | 0.0000 | 3.2797 | 0.0000 | 1.7017 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 164 |
+
| 0.50 | 1 | 4.9836 | 0.0000 | 3.6291 | 0.0000 | 1.3544 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 165 |
+
| 0.60 | 1 | 5.0664 | 0.0000 | 4.0093 | 0.0000 | 1.0571 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 166 |
+
| 0.70 | 1 | 5.2668 | 0.0000 | 4.4357 | 0.0000 | 0.8311 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 167 |
+
| 0.80 | 1 | 5.5867 | 0.0000 | 5.0149 | 0.0000 | 0.5718 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 168 |
+
| 0.90 | 1 | 6.6203 | 0.0000 | 6.3616 | 0.0000 | 0.2587 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 169 |
+
|
| 170 |
+
### Prefix 1,000,000 Tokens (10.24 Effective Epochs)
|
| 171 |
+
|
| 172 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 173 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 174 |
+
| 0.00 | 1 | 5.6524 | 0.0000 | 2.2476 | 0.0000 | 3.4048 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 175 |
+
| 0.02 | 1 | 5.3384 | 0.0000 | 2.4646 | 0.0000 | 2.8738 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 176 |
+
| 0.05 | 1 | 5.1150 | 0.0000 | 2.7036 | 0.0000 | 2.4114 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 177 |
+
| 0.08 | 1 | 4.9426 | 0.0000 | 2.8964 | 0.0000 | 2.0463 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 178 |
+
| 0.10 | 1 | 4.8428 | 0.0000 | 2.9922 | 0.0000 | 1.8506 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 179 |
+
| 0.14 | 1 | 4.7516 | 0.0000 | 3.1948 | 0.0000 | 1.5568 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 180 |
+
| 0.20 | 1 | 4.6756 | 0.0000 | 3.3926 | 0.0000 | 1.2830 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 181 |
+
| 0.30 | 1 | 4.6511 | 0.0000 | 3.6295 | 0.0000 | 1.0216 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 182 |
+
| 0.40 | 1 | 4.6992 | 0.0000 | 3.8497 | 0.0000 | 0.8495 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 183 |
+
| 0.50 | 1 | 4.7691 | 0.0000 | 4.0636 | 0.0000 | 0.7055 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 184 |
+
| 0.60 | 1 | 4.9042 | 0.0000 | 4.3281 | 0.0000 | 0.5761 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 185 |
+
| 0.70 | 1 | 5.1201 | 0.0000 | 4.6689 | 0.0000 | 0.4512 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 186 |
+
| 0.80 | 1 | 5.5044 | 0.0000 | 5.1892 | 0.0000 | 0.3151 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 187 |
+
| 0.90 | 1 | 6.6273 | 0.0000 | 6.5073 | 0.0000 | 0.1200 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 188 |
+
|
| 189 |
+
### Prefix 2,000,000 Tokens (5.12 Effective Epochs)
|
| 190 |
+
|
| 191 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 192 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 193 |
+
| 0.00 | 1 | 4.6252 | 0.0000 | 3.3746 | 0.0000 | 1.2506 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 194 |
+
| 0.02 | 1 | 4.5426 | 0.0000 | 3.4838 | 0.0000 | 1.0589 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 195 |
+
| 0.05 | 1 | 4.4872 | 0.0000 | 3.5720 | 0.0000 | 0.9152 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 196 |
+
| 0.08 | 1 | 4.4557 | 0.0000 | 3.6279 | 0.0000 | 0.8277 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 197 |
+
| 0.10 | 1 | 4.4366 | 0.0000 | 3.6868 | 0.0000 | 0.7498 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 198 |
+
| 0.14 | 1 | 4.4270 | 0.0000 | 3.7761 | 0.0000 | 0.6509 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 199 |
+
| 0.20 | 1 | 4.4508 | 0.0000 | 3.9086 | 0.0000 | 0.5422 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 200 |
+
| 0.30 | 1 | 4.4949 | 0.0000 | 4.0400 | 0.0000 | 0.4549 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 201 |
+
| 0.40 | 1 | 4.5559 | 0.0000 | 4.1652 | 0.0000 | 0.3907 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 202 |
+
| 0.50 | 1 | 4.6533 | 0.0000 | 4.3375 | 0.0000 | 0.3158 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 203 |
+
| 0.60 | 1 | 4.8110 | 0.0000 | 4.5495 | 0.0000 | 0.2615 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 204 |
+
| 0.70 | 1 | 5.0512 | 0.0000 | 4.8472 | 0.0000 | 0.2040 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 205 |
+
| 0.80 | 1 | 5.4191 | 0.0000 | 5.2791 | 0.0000 | 0.1400 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 206 |
+
| 0.90 | 1 | 6.7257 | 0.0000 | 6.6972 | 0.0000 | 0.0285 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 207 |
+
|
| 208 |
+
### Prefix 4,000,000 Tokens (2.56 Effective Epochs)
|
| 209 |
+
|
| 210 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 211 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 212 |
+
| 0.00 | 1 | 4.3247 | 0.0000 | 3.8417 | 0.0000 | 0.4831 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 213 |
+
| 0.02 | 1 | 4.2947 | 0.0000 | 3.8547 | 0.0000 | 0.4400 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 214 |
+
| 0.05 | 1 | 4.3152 | 0.0000 | 3.9103 | 0.0000 | 0.4049 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 215 |
+
| 0.08 | 1 | 4.3002 | 0.0000 | 3.9315 | 0.0000 | 0.3687 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 216 |
+
| 0.10 | 1 | 4.3102 | 0.0000 | 3.9545 | 0.0000 | 0.3557 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 217 |
+
| 0.14 | 1 | 4.3244 | 0.0000 | 4.0087 | 0.0000 | 0.3157 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 218 |
+
| 0.20 | 1 | 4.3525 | 0.0000 | 4.0857 | 0.0000 | 0.2667 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 219 |
+
| 0.30 | 1 | 4.4108 | 0.0000 | 4.1817 | 0.0000 | 0.2291 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 220 |
+
| 0.40 | 1 | 4.4975 | 0.0000 | 4.2938 | 0.0000 | 0.2037 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 221 |
+
| 0.50 | 1 | 4.6069 | 0.0000 | 4.4319 | 0.0000 | 0.1750 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 222 |
+
| 0.60 | 1 | 4.7745 | 0.0000 | 4.6357 | 0.0000 | 0.1388 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 223 |
+
| 0.70 | 1 | 5.0131 | 0.0000 | 4.8970 | 0.0000 | 0.1162 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 224 |
+
| 0.80 | 1 | 5.4171 | 0.0000 | 5.3475 | 0.0000 | 0.0696 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 225 |
+
| 0.90 | 1 | 6.6239 | 0.0000 | 6.6037 | 0.0000 | 0.0202 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 226 |
+
|
| 227 |
+
## Model `L8_H8_D256`
|
| 228 |
+
|
| 229 |
+
### Prefix 250,000 Tokens (40.96 Effective Epochs)
|
| 230 |
+
|
| 231 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 232 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 233 |
+
| 0.00 | 1 | 7.9175 | 0.0000 | 0.9524 | 0.0000 | 6.9651 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 234 |
+
| 0.02 | 1 | 7.2368 | 0.0000 | 1.2369 | 0.0000 | 5.9999 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 235 |
+
| 0.05 | 1 | 6.6006 | 0.0000 | 1.6339 | 0.0000 | 4.9666 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 236 |
+
| 0.08 | 1 | 6.2564 | 0.0000 | 1.9090 | 0.0000 | 4.3474 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 237 |
+
| 0.10 | 1 | 6.0914 | 0.0000 | 2.0858 | 0.0000 | 4.0056 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 238 |
+
| 0.14 | 1 | 5.8297 | 0.0000 | 2.4090 | 0.0000 | 3.4207 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 239 |
+
| 0.20 | 1 | 5.5966 | 0.0000 | 2.7805 | 0.0000 | 2.8161 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 240 |
+
| 0.30 | 1 | 5.4419 | 0.0000 | 3.2711 | 0.0000 | 2.1708 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 241 |
+
| 0.40 | 1 | 5.4175 | 0.0000 | 3.6411 | 0.0000 | 1.7763 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 242 |
+
| 0.50 | 1 | 5.4654 | 0.0000 | 3.9927 | 0.0000 | 1.4727 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 243 |
+
| 0.60 | 1 | 5.5867 | 0.0000 | 4.3203 | 0.0000 | 1.2663 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 244 |
+
| 0.70 | 1 | 5.7802 | 0.0000 | 4.7055 | 0.0000 | 1.0747 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 245 |
+
| 0.80 | 1 | 6.0904 | 0.0000 | 5.2467 | 0.0000 | 0.8437 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 246 |
+
| 0.90 | 1 | 7.1601 | 0.0000 | 6.7126 | 0.0000 | 0.4475 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 247 |
+
|
| 248 |
+
### Prefix 500,000 Tokens (20.48 Effective Epochs)
|
| 249 |
+
|
| 250 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 251 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 252 |
+
| 0.00 | 1 | 6.0796 | 0.0000 | 2.5776 | 0.0000 | 3.5021 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 253 |
+
| 0.02 | 1 | 5.6702 | 0.0000 | 2.8128 | 0.0000 | 2.8574 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 254 |
+
| 0.05 | 1 | 5.3633 | 0.0000 | 3.0388 | 0.0000 | 2.3245 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 255 |
+
| 0.08 | 1 | 5.2185 | 0.0000 | 3.2261 | 0.0000 | 1.9924 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 256 |
+
| 0.10 | 1 | 5.1607 | 0.0000 | 3.3108 | 0.0000 | 1.8499 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 257 |
+
| 0.14 | 1 | 5.0671 | 0.0000 | 3.4774 | 0.0000 | 1.5897 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 258 |
+
| 0.20 | 1 | 5.0216 | 0.0000 | 3.6979 | 0.0000 | 1.3238 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 259 |
+
| 0.30 | 1 | 5.0323 | 0.0000 | 3.9668 | 0.0000 | 1.0656 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 260 |
+
| 0.40 | 1 | 5.0917 | 0.0000 | 4.1940 | 0.0000 | 0.8976 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 261 |
+
| 0.50 | 1 | 5.2099 | 0.0000 | 4.4287 | 0.0000 | 0.7812 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 262 |
+
| 0.60 | 1 | 5.3372 | 0.0000 | 4.6669 | 0.0000 | 0.6703 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 263 |
+
| 0.70 | 1 | 5.5502 | 0.0000 | 4.9751 | 0.0000 | 0.5751 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 264 |
+
| 0.80 | 1 | 5.8850 | 0.0000 | 5.4346 | 0.0000 | 0.4504 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 265 |
+
| 0.90 | 1 | 7.1122 | 0.0000 | 6.8702 | 0.0000 | 0.2419 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 266 |
+
|
| 267 |
+
### Prefix 1,000,000 Tokens (10.24 Effective Epochs)
|
| 268 |
+
|
| 269 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 270 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 271 |
+
| 0.00 | 1 | 5.0210 | 0.0000 | 3.5921 | 0.0000 | 1.4288 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 272 |
+
| 0.02 | 1 | 4.8949 | 0.0000 | 3.6926 | 0.0000 | 1.2023 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 273 |
+
| 0.05 | 1 | 4.8096 | 0.0000 | 3.7745 | 0.0000 | 1.0351 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 274 |
+
| 0.08 | 1 | 4.7992 | 0.0000 | 3.8572 | 0.0000 | 0.9420 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 275 |
+
| 0.10 | 1 | 4.7946 | 0.0000 | 3.9262 | 0.0000 | 0.8684 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 276 |
+
| 0.14 | 1 | 4.7763 | 0.0000 | 3.9900 | 0.0000 | 0.7863 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 277 |
+
| 0.20 | 1 | 4.8010 | 0.0000 | 4.1303 | 0.0000 | 0.6708 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 278 |
+
| 0.30 | 1 | 4.8657 | 0.0000 | 4.2940 | 0.0000 | 0.5718 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 279 |
+
| 0.40 | 1 | 4.9626 | 0.0000 | 4.4759 | 0.0000 | 0.4867 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 280 |
+
| 0.50 | 1 | 5.0880 | 0.0000 | 4.6669 | 0.0000 | 0.4211 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 281 |
+
| 0.60 | 1 | 5.2370 | 0.0000 | 4.8647 | 0.0000 | 0.3724 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 282 |
+
| 0.70 | 1 | 5.4416 | 0.0000 | 5.1299 | 0.0000 | 0.3117 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 283 |
+
| 0.80 | 1 | 5.8087 | 0.0000 | 5.5785 | 0.0000 | 0.2302 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 284 |
+
| 0.90 | 1 | 7.1040 | 0.0000 | 6.9954 | 0.0000 | 0.1086 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 285 |
+
|
| 286 |
+
### Prefix 2,000,000 Tokens (5.12 Effective Epochs)
|
| 287 |
+
|
| 288 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 289 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 290 |
+
| 0.00 | 1 | 4.6498 | 0.0000 | 4.0862 | 0.0000 | 0.5635 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 291 |
+
| 0.02 | 1 | 4.6403 | 0.0000 | 4.1384 | 0.0000 | 0.5019 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 292 |
+
| 0.05 | 1 | 4.6319 | 0.0000 | 4.1963 | 0.0000 | 0.4356 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 293 |
+
| 0.08 | 1 | 4.6232 | 0.0000 | 4.2158 | 0.0000 | 0.4074 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 294 |
+
| 0.10 | 1 | 4.6527 | 0.0000 | 4.2675 | 0.0000 | 0.3852 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 295 |
+
| 0.14 | 1 | 4.6524 | 0.0000 | 4.2919 | 0.0000 | 0.3604 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 296 |
+
| 0.20 | 1 | 4.6882 | 0.0000 | 4.3827 | 0.0000 | 0.3055 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 297 |
+
| 0.30 | 1 | 4.7794 | 0.0000 | 4.5149 | 0.0000 | 0.2645 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 298 |
+
| 0.40 | 1 | 4.8796 | 0.0000 | 4.6519 | 0.0000 | 0.2278 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 299 |
+
| 0.50 | 1 | 5.0101 | 0.0000 | 4.8229 | 0.0000 | 0.1873 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 300 |
+
| 0.60 | 1 | 5.1623 | 0.0000 | 4.9993 | 0.0000 | 0.1630 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 301 |
+
| 0.70 | 1 | 5.3763 | 0.0000 | 5.2435 | 0.0000 | 0.1327 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 302 |
+
| 0.80 | 1 | 5.7417 | 0.0000 | 5.6466 | 0.0000 | 0.0950 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 303 |
+
| 0.90 | 1 | 7.0412 | 0.0000 | 7.0093 | 0.0000 | 0.0319 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 304 |
+
|
| 305 |
+
### Prefix 4,000,000 Tokens (2.56 Effective Epochs)
|
| 306 |
+
|
| 307 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 308 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 309 |
+
| 0.00 | 1 | 4.5136 | 0.0000 | 4.2515 | 0.0000 | 0.2621 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 310 |
+
| 0.02 | 1 | 4.5287 | 0.0000 | 4.2760 | 0.0000 | 0.2527 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 311 |
+
| 0.05 | 1 | 4.5401 | 0.0000 | 4.3095 | 0.0000 | 0.2306 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 312 |
+
| 0.08 | 1 | 4.5701 | 0.0000 | 4.3547 | 0.0000 | 0.2154 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 313 |
+
| 0.10 | 1 | 4.5783 | 0.0000 | 4.3826 | 0.0000 | 0.1957 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 314 |
+
| 0.14 | 1 | 4.6069 | 0.0000 | 4.4129 | 0.0000 | 0.1939 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 315 |
+
| 0.20 | 1 | 4.6558 | 0.0000 | 4.4934 | 0.0000 | 0.1624 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 316 |
+
| 0.30 | 1 | 4.7516 | 0.0000 | 4.5996 | 0.0000 | 0.1520 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 317 |
+
| 0.40 | 1 | 4.8556 | 0.0000 | 4.7264 | 0.0000 | 0.1292 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 318 |
+
| 0.50 | 1 | 4.9782 | 0.0000 | 4.8738 | 0.0000 | 0.1044 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 319 |
+
| 0.60 | 1 | 5.1452 | 0.0000 | 5.0517 | 0.0000 | 0.0935 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 320 |
+
| 0.70 | 1 | 5.3607 | 0.0000 | 5.2833 | 0.0000 | 0.0774 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 321 |
+
| 0.80 | 1 | 5.7190 | 0.0000 | 5.6752 | 0.0000 | 0.0438 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 322 |
+
| 0.90 | 1 | 7.0566 | 0.0000 | 7.0296 | 0.0000 | 0.0270 | 0.0000 | 10,240,000 | 8,388,608 |
|
runs/screen_static/20260525-133008/RESULT_SUMMARY.md
CHANGED
|
@@ -2,80 +2,321 @@
|
|
| 2 |
|
| 3 |
Run directory: `runs/screen_static/20260525-133008`
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
| 11 |
-
|
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
| 20 |
-
|
|
| 21 |
-
| 0.
|
| 22 |
-
| 0.
|
| 23 |
-
| 0.
|
| 24 |
-
| 0.
|
| 25 |
-
| 0.14 |
|
| 26 |
-
| 0.
|
| 27 |
-
| 0.
|
| 28 |
-
| 0.
|
| 29 |
-
| 0.
|
| 30 |
-
| 0.
|
| 31 |
-
| 0.
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
## Prefix
|
| 36 |
-
|
| 37 |
-
| Dropout | Val loss | Train
|
| 38 |
-
|---:|---:|---:|---:|---:|---:|
|
| 39 |
-
| 0.00 |
|
| 40 |
-
| 0.02 |
|
| 41 |
-
| 0.05 |
|
| 42 |
-
| 0.
|
| 43 |
-
| 0.
|
| 44 |
-
| 0.
|
| 45 |
-
| 0.
|
| 46 |
-
| 0.
|
| 47 |
-
| 0.
|
| 48 |
-
| 0.
|
| 49 |
-
| 0.
|
| 50 |
-
| 0.
|
| 51 |
-
| 0.
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
| 57 |
-
|
|
| 58 |
-
| 0.
|
| 59 |
-
| 0.
|
| 60 |
-
| 0.
|
| 61 |
-
| 0.
|
| 62 |
-
| 0.
|
| 63 |
-
| 0.
|
| 64 |
-
| 0.
|
| 65 |
-
| 0.
|
| 66 |
-
| 0.
|
| 67 |
-
| 0.
|
| 68 |
-
| 0.
|
| 69 |
-
| 0.
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
| 76 |
-
|
|
| 77 |
-
| 0.
|
| 78 |
-
| 0.
|
| 79 |
-
| 0.
|
| 80 |
-
| 0.
|
| 81 |
-
| 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
Run directory: `runs/screen_static/20260525-133008`
|
| 4 |
|
| 5 |
+
## Models
|
| 6 |
+
|
| 7 |
+
| Model | Params | Layers | Heads | Embedding | Block | Vocab | Seeds |
|
| 8 |
+
|---|---:|---:|---:|---:|---:|---:|---|
|
| 9 |
+
| `L8_H8_D256` | 8,388,608 | 8 | 8 | 256 | 128 | 4096 | 1 |
|
| 10 |
+
| `L12_H8_D320` | 17,367,040 | 12 | 8 | 320 | 128 | 4096 | 1 |
|
| 11 |
+
| `L16_H8_D384` | 31,457,280 | 16 | 8 | 384 | 128 | 4096 | 1 |
|
| 12 |
+
|
| 13 |
+
## Best Dropout By Model And Prefix
|
| 14 |
+
|
| 15 |
+
| Model | Prefix tokens | Effective epochs | Best dropout | Mean val loss | Val std | Mean train loss | Mean gap | Plateau/bracket note |
|
| 16 |
+
|---|---:|---:|---:|---:|---:|---:|---:|---|
|
| 17 |
+
| `L12_H8_D320` | 250,000 | 40.96 | 0.50 | 5.4384 | 0.0000 | 3.3720 | 2.0663 | bracketed by tested grid |
|
| 18 |
+
| `L12_H8_D320` | 500,000 | 20.48 | 0.40 | 4.9791 | 0.0000 | 3.7358 | 1.2434 | bracketed by tested grid |
|
| 19 |
+
| `L12_H8_D320` | 1,000,000 | 10.24 | 0.20 | 4.6871 | 0.0000 | 3.7160 | 0.9711 | bracketed by tested grid |
|
| 20 |
+
| `L12_H8_D320` | 2,000,000 | 5.12 | 0.14 | 4.5088 | 0.0000 | 4.0218 | 0.4870 | bracketed by tested grid |
|
| 21 |
+
| `L12_H8_D320` | 4,000,000 | 2.56 | 0.02 | 4.3875 | 0.0000 | 4.0300 | 0.3575 | bracketed by tested grid |
|
| 22 |
+
| `L16_H8_D384` | 250,000 | 40.96 | 0.60 | 5.5055 | 0.0000 | 3.3185 | 2.1870 | bracketed by tested grid |
|
| 23 |
+
| `L16_H8_D384` | 500,000 | 20.48 | 0.40 | 4.9814 | 0.0000 | 3.2797 | 1.7017 | bracketed by tested grid |
|
| 24 |
+
| `L16_H8_D384` | 1,000,000 | 10.24 | 0.30 | 4.6511 | 0.0000 | 3.6295 | 1.0216 | bracketed by tested grid |
|
| 25 |
+
| `L16_H8_D384` | 2,000,000 | 5.12 | 0.14 | 4.4270 | 0.0000 | 3.7761 | 0.6509 | bracketed by tested grid |
|
| 26 |
+
| `L16_H8_D384` | 4,000,000 | 2.56 | 0.02 | 4.2947 | 0.0000 | 3.8547 | 0.4400 | bracketed by tested grid |
|
| 27 |
+
| `L8_H8_D256` | 250,000 | 40.96 | 0.40 | 5.4175 | 0.0000 | 3.6411 | 1.7763 | bracketed by tested grid |
|
| 28 |
+
| `L8_H8_D256` | 500,000 | 20.48 | 0.20 | 5.0216 | 0.0000 | 3.6979 | 1.3238 | bracketed by tested grid |
|
| 29 |
+
| `L8_H8_D256` | 1,000,000 | 10.24 | 0.14 | 4.7763 | 0.0000 | 3.9900 | 0.7863 | bracketed by tested grid |
|
| 30 |
+
| `L8_H8_D256` | 2,000,000 | 5.12 | 0.08 | 4.6232 | 0.0000 | 4.2158 | 0.4074 | bracketed by tested grid |
|
| 31 |
+
| `L8_H8_D256` | 4,000,000 | 2.56 | 0.00 | 4.5136 | 0.0000 | 4.2515 | 0.2621 | not bracketed; best at bottom of tested grid |
|
| 32 |
+
|
| 33 |
+
## Model `L12_H8_D320`
|
| 34 |
+
|
| 35 |
+
### Prefix 250,000 Tokens (40.96 Effective Epochs)
|
| 36 |
+
|
| 37 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 38 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 39 |
+
| 0.00 | 1 | 8.1559 | 0.0000 | 0.3941 | 0.0000 | 7.7618 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 40 |
+
| 0.02 | 1 | 7.9925 | 0.0000 | 0.4179 | 0.0000 | 7.5746 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 41 |
+
| 0.05 | 1 | 7.5209 | 0.0000 | 0.5652 | 0.0000 | 6.9557 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 42 |
+
| 0.08 | 1 | 7.1101 | 0.0000 | 0.7794 | 0.0000 | 6.3306 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 43 |
+
| 0.10 | 1 | 6.9556 | 0.0000 | 0.9396 | 0.0000 | 6.0159 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 44 |
+
| 0.14 | 1 | 6.5573 | 0.0000 | 1.2607 | 0.0000 | 5.2965 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 45 |
+
| 0.20 | 1 | 6.1481 | 0.0000 | 1.6682 | 0.0000 | 4.4800 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 46 |
+
| 0.30 | 1 | 5.7129 | 0.0000 | 2.3463 | 0.0000 | 3.3666 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 47 |
+
| 0.40 | 1 | 5.5185 | 0.0000 | 2.8816 | 0.0000 | 2.6369 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 48 |
+
| 0.50 | 1 | 5.4384 | 0.0000 | 3.3720 | 0.0000 | 2.0663 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 49 |
+
| 0.60 | 1 | 5.5007 | 0.0000 | 3.8245 | 0.0000 | 1.6762 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 50 |
+
| 0.70 | 1 | 5.6532 | 0.0000 | 4.3150 | 0.0000 | 1.3382 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 51 |
+
| 0.80 | 1 | 5.9260 | 0.0000 | 4.9075 | 0.0000 | 1.0185 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 52 |
+
| 0.90 | 1 | 6.9057 | 0.0000 | 6.4033 | 0.0000 | 0.5024 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 53 |
+
|
| 54 |
+
### Prefix 500,000 Tokens (20.48 Effective Epochs)
|
| 55 |
+
|
| 56 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 57 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 58 |
+
| 0.00 | 1 | 6.8822 | 0.0000 | 1.4379 | 0.0000 | 5.4443 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 59 |
+
| 0.02 | 1 | 6.3843 | 0.0000 | 1.7487 | 0.0000 | 4.6356 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 60 |
+
| 0.05 | 1 | 5.8958 | 0.0000 | 2.0567 | 0.0000 | 3.8390 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 61 |
+
| 0.08 | 1 | 5.5972 | 0.0000 | 2.3715 | 0.0000 | 3.2256 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 62 |
+
| 0.10 | 1 | 5.4479 | 0.0000 | 2.4894 | 0.0000 | 2.9585 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 63 |
+
| 0.14 | 1 | 5.2595 | 0.0000 | 2.7469 | 0.0000 | 2.5126 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 64 |
+
| 0.20 | 1 | 5.0960 | 0.0000 | 3.0360 | 0.0000 | 2.0600 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 65 |
+
| 0.30 | 1 | 4.9841 | 0.0000 | 3.4098 | 0.0000 | 1.5742 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 66 |
+
| 0.40 | 1 | 4.9791 | 0.0000 | 3.7358 | 0.0000 | 1.2434 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 67 |
+
| 0.50 | 1 | 5.0582 | 0.0000 | 4.0295 | 0.0000 | 1.0287 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 68 |
+
| 0.60 | 1 | 5.1838 | 0.0000 | 4.3228 | 0.0000 | 0.8609 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 69 |
+
| 0.70 | 1 | 5.3842 | 0.0000 | 4.6806 | 0.0000 | 0.7036 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 70 |
+
| 0.80 | 1 | 5.6878 | 0.0000 | 5.1572 | 0.0000 | 0.5306 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 71 |
+
| 0.90 | 1 | 6.7661 | 0.0000 | 6.4923 | 0.0000 | 0.2738 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 72 |
+
|
| 73 |
+
### Prefix 1,000,000 Tokens (10.24 Effective Epochs)
|
| 74 |
+
|
| 75 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 76 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 77 |
+
| 0.00 | 1 | 5.3280 | 0.0000 | 2.8864 | 0.0000 | 2.4416 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 78 |
+
| 0.02 | 1 | 5.0988 | 0.0000 | 3.0629 | 0.0000 | 2.0360 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 79 |
+
| 0.05 | 1 | 4.9027 | 0.0000 | 3.2683 | 0.0000 | 1.6344 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 80 |
+
| 0.08 | 1 | 4.7898 | 0.0000 | 3.3917 | 0.0000 | 1.3982 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 81 |
+
| 0.10 | 1 | 4.7517 | 0.0000 | 3.4663 | 0.0000 | 1.2854 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 82 |
+
| 0.14 | 1 | 4.7170 | 0.0000 | 3.5913 | 0.0000 | 1.1257 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 83 |
+
| 0.20 | 1 | 4.6871 | 0.0000 | 3.7160 | 0.0000 | 0.9711 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 84 |
+
| 0.30 | 1 | 4.7214 | 0.0000 | 3.9408 | 0.0000 | 0.7806 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 85 |
+
| 0.40 | 1 | 4.7805 | 0.0000 | 4.1258 | 0.0000 | 0.6548 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 86 |
+
| 0.50 | 1 | 4.8997 | 0.0000 | 4.3420 | 0.0000 | 0.5577 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 87 |
+
| 0.60 | 1 | 5.0594 | 0.0000 | 4.5832 | 0.0000 | 0.4762 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 88 |
+
| 0.70 | 1 | 5.2577 | 0.0000 | 4.8660 | 0.0000 | 0.3917 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 89 |
+
| 0.80 | 1 | 5.5697 | 0.0000 | 5.2791 | 0.0000 | 0.2905 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 90 |
+
| 0.90 | 1 | 6.7485 | 0.0000 | 6.6291 | 0.0000 | 0.1194 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 91 |
+
|
| 92 |
+
### Prefix 2,000,000 Tokens (5.12 Effective Epochs)
|
| 93 |
+
|
| 94 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 95 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 96 |
+
| 0.00 | 1 | 4.5953 | 0.0000 | 3.7160 | 0.0000 | 0.8793 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 97 |
+
| 0.02 | 1 | 4.5495 | 0.0000 | 3.7767 | 0.0000 | 0.7728 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 98 |
+
| 0.05 | 1 | 4.5181 | 0.0000 | 3.8594 | 0.0000 | 0.6587 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 99 |
+
| 0.08 | 1 | 4.5121 | 0.0000 | 3.9309 | 0.0000 | 0.5811 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 100 |
+
| 0.10 | 1 | 4.5212 | 0.0000 | 3.9627 | 0.0000 | 0.5585 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 101 |
+
| 0.14 | 1 | 4.5088 | 0.0000 | 4.0218 | 0.0000 | 0.4870 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 102 |
+
| 0.20 | 1 | 4.5470 | 0.0000 | 4.1078 | 0.0000 | 0.4392 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 103 |
+
| 0.30 | 1 | 4.6074 | 0.0000 | 4.2488 | 0.0000 | 0.3586 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 104 |
+
| 0.40 | 1 | 4.6953 | 0.0000 | 4.3886 | 0.0000 | 0.3067 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 105 |
+
| 0.50 | 1 | 4.8144 | 0.0000 | 4.5650 | 0.0000 | 0.2494 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 106 |
+
| 0.60 | 1 | 4.9685 | 0.0000 | 4.7562 | 0.0000 | 0.2123 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 107 |
+
| 0.70 | 1 | 5.1800 | 0.0000 | 5.0098 | 0.0000 | 0.1702 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 108 |
+
| 0.80 | 1 | 5.5207 | 0.0000 | 5.4023 | 0.0000 | 0.1184 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 109 |
+
| 0.90 | 1 | 6.7411 | 0.0000 | 6.7211 | 0.0000 | 0.0200 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 110 |
+
|
| 111 |
+
### Prefix 4,000,000 Tokens (2.56 Effective Epochs)
|
| 112 |
+
|
| 113 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 114 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 115 |
+
| 0.00 | 1 | 4.3999 | 0.0000 | 4.0323 | 0.0000 | 0.3676 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 116 |
+
| 0.02 | 1 | 4.3875 | 0.0000 | 4.0300 | 0.0000 | 0.3575 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 117 |
+
| 0.05 | 1 | 4.4063 | 0.0000 | 4.0928 | 0.0000 | 0.3135 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 118 |
+
| 0.08 | 1 | 4.3982 | 0.0000 | 4.1090 | 0.0000 | 0.2892 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 119 |
+
| 0.10 | 1 | 4.4147 | 0.0000 | 4.1356 | 0.0000 | 0.2791 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 120 |
+
| 0.14 | 1 | 4.4444 | 0.0000 | 4.1944 | 0.0000 | 0.2500 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 121 |
+
| 0.20 | 1 | 4.4814 | 0.0000 | 4.2480 | 0.0000 | 0.2334 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 122 |
+
| 0.30 | 1 | 4.5457 | 0.0000 | 4.3479 | 0.0000 | 0.1978 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 123 |
+
| 0.40 | 1 | 4.6496 | 0.0000 | 4.4866 | 0.0000 | 0.1630 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 124 |
+
| 0.50 | 1 | 4.7691 | 0.0000 | 4.6345 | 0.0000 | 0.1346 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 125 |
+
| 0.60 | 1 | 4.9325 | 0.0000 | 4.8095 | 0.0000 | 0.1230 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 126 |
+
| 0.70 | 1 | 5.1557 | 0.0000 | 5.0564 | 0.0000 | 0.0993 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 127 |
+
| 0.80 | 1 | 5.4885 | 0.0000 | 5.4295 | 0.0000 | 0.0590 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 128 |
+
| 0.90 | 1 | 6.7736 | 0.0000 | 6.7588 | 0.0000 | 0.0147 | 0.0000 | 10,240,000 | 17,367,040 |
|
| 129 |
+
|
| 130 |
+
## Model `L16_H8_D384`
|
| 131 |
+
|
| 132 |
+
### Prefix 250,000 Tokens (40.96 Effective Epochs)
|
| 133 |
+
|
| 134 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 135 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 136 |
+
| 0.00 | 1 | 8.2018 | 0.0000 | 0.2792 | 0.0000 | 7.9225 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 137 |
+
| 0.02 | 1 | 8.0714 | 0.0000 | 0.2606 | 0.0000 | 7.8108 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 138 |
+
| 0.05 | 1 | 7.9582 | 0.0000 | 0.2673 | 0.0000 | 7.6909 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 139 |
+
| 0.08 | 1 | 7.7608 | 0.0000 | 0.2968 | 0.0000 | 7.4640 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 140 |
+
| 0.10 | 1 | 7.5884 | 0.0000 | 0.3386 | 0.0000 | 7.2498 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 141 |
+
| 0.14 | 1 | 7.2849 | 0.0000 | 0.4997 | 0.0000 | 6.7851 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 142 |
+
| 0.20 | 1 | 6.8383 | 0.0000 | 0.8225 | 0.0000 | 6.0158 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 143 |
+
| 0.30 | 1 | 6.2429 | 0.0000 | 1.3910 | 0.0000 | 4.8519 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 144 |
+
| 0.40 | 1 | 5.8444 | 0.0000 | 2.0533 | 0.0000 | 3.7911 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 145 |
+
| 0.50 | 1 | 5.5925 | 0.0000 | 2.6789 | 0.0000 | 2.9136 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 146 |
+
| 0.60 | 1 | 5.5055 | 0.0000 | 3.3185 | 0.0000 | 2.1870 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 147 |
+
| 0.70 | 1 | 5.5800 | 0.0000 | 3.9233 | 0.0000 | 1.6567 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 148 |
+
| 0.80 | 1 | 5.8399 | 0.0000 | 4.7252 | 0.0000 | 1.1147 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 149 |
+
| 0.90 | 1 | 6.8439 | 0.0000 | 6.3553 | 0.0000 | 0.4885 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 150 |
+
|
| 151 |
+
### Prefix 500,000 Tokens (20.48 Effective Epochs)
|
| 152 |
+
|
| 153 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 154 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 155 |
+
| 0.00 | 1 | 7.2889 | 0.0000 | 0.7200 | 0.0000 | 6.5689 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 156 |
+
| 0.02 | 1 | 6.8954 | 0.0000 | 0.9118 | 0.0000 | 5.9837 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 157 |
+
| 0.05 | 1 | 6.4974 | 0.0000 | 1.2327 | 0.0000 | 5.2647 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 158 |
+
| 0.08 | 1 | 6.1079 | 0.0000 | 1.5379 | 0.0000 | 4.5700 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 159 |
+
| 0.10 | 1 | 5.9033 | 0.0000 | 1.6819 | 0.0000 | 4.2214 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 160 |
+
| 0.14 | 1 | 5.6262 | 0.0000 | 1.9904 | 0.0000 | 3.6358 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 161 |
+
| 0.20 | 1 | 5.3271 | 0.0000 | 2.4160 | 0.0000 | 2.9111 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 162 |
+
| 0.30 | 1 | 5.0666 | 0.0000 | 2.8953 | 0.0000 | 2.1713 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 163 |
+
| 0.40 | 1 | 4.9814 | 0.0000 | 3.2797 | 0.0000 | 1.7017 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 164 |
+
| 0.50 | 1 | 4.9836 | 0.0000 | 3.6291 | 0.0000 | 1.3544 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 165 |
+
| 0.60 | 1 | 5.0664 | 0.0000 | 4.0093 | 0.0000 | 1.0571 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 166 |
+
| 0.70 | 1 | 5.2668 | 0.0000 | 4.4357 | 0.0000 | 0.8311 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 167 |
+
| 0.80 | 1 | 5.5867 | 0.0000 | 5.0149 | 0.0000 | 0.5718 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 168 |
+
| 0.90 | 1 | 6.6203 | 0.0000 | 6.3616 | 0.0000 | 0.2587 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 169 |
+
|
| 170 |
+
### Prefix 1,000,000 Tokens (10.24 Effective Epochs)
|
| 171 |
+
|
| 172 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 173 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 174 |
+
| 0.00 | 1 | 5.6524 | 0.0000 | 2.2476 | 0.0000 | 3.4048 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 175 |
+
| 0.02 | 1 | 5.3384 | 0.0000 | 2.4646 | 0.0000 | 2.8738 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 176 |
+
| 0.05 | 1 | 5.1150 | 0.0000 | 2.7036 | 0.0000 | 2.4114 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 177 |
+
| 0.08 | 1 | 4.9426 | 0.0000 | 2.8964 | 0.0000 | 2.0463 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 178 |
+
| 0.10 | 1 | 4.8428 | 0.0000 | 2.9922 | 0.0000 | 1.8506 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 179 |
+
| 0.14 | 1 | 4.7516 | 0.0000 | 3.1948 | 0.0000 | 1.5568 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 180 |
+
| 0.20 | 1 | 4.6756 | 0.0000 | 3.3926 | 0.0000 | 1.2830 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 181 |
+
| 0.30 | 1 | 4.6511 | 0.0000 | 3.6295 | 0.0000 | 1.0216 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 182 |
+
| 0.40 | 1 | 4.6992 | 0.0000 | 3.8497 | 0.0000 | 0.8495 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 183 |
+
| 0.50 | 1 | 4.7691 | 0.0000 | 4.0636 | 0.0000 | 0.7055 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 184 |
+
| 0.60 | 1 | 4.9042 | 0.0000 | 4.3281 | 0.0000 | 0.5761 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 185 |
+
| 0.70 | 1 | 5.1201 | 0.0000 | 4.6689 | 0.0000 | 0.4512 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 186 |
+
| 0.80 | 1 | 5.5044 | 0.0000 | 5.1892 | 0.0000 | 0.3151 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 187 |
+
| 0.90 | 1 | 6.6273 | 0.0000 | 6.5073 | 0.0000 | 0.1200 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 188 |
+
|
| 189 |
+
### Prefix 2,000,000 Tokens (5.12 Effective Epochs)
|
| 190 |
+
|
| 191 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 192 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 193 |
+
| 0.00 | 1 | 4.6252 | 0.0000 | 3.3746 | 0.0000 | 1.2506 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 194 |
+
| 0.02 | 1 | 4.5426 | 0.0000 | 3.4838 | 0.0000 | 1.0589 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 195 |
+
| 0.05 | 1 | 4.4872 | 0.0000 | 3.5720 | 0.0000 | 0.9152 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 196 |
+
| 0.08 | 1 | 4.4557 | 0.0000 | 3.6279 | 0.0000 | 0.8277 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 197 |
+
| 0.10 | 1 | 4.4366 | 0.0000 | 3.6868 | 0.0000 | 0.7498 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 198 |
+
| 0.14 | 1 | 4.4270 | 0.0000 | 3.7761 | 0.0000 | 0.6509 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 199 |
+
| 0.20 | 1 | 4.4508 | 0.0000 | 3.9086 | 0.0000 | 0.5422 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 200 |
+
| 0.30 | 1 | 4.4949 | 0.0000 | 4.0400 | 0.0000 | 0.4549 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 201 |
+
| 0.40 | 1 | 4.5559 | 0.0000 | 4.1652 | 0.0000 | 0.3907 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 202 |
+
| 0.50 | 1 | 4.6533 | 0.0000 | 4.3375 | 0.0000 | 0.3158 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 203 |
+
| 0.60 | 1 | 4.8110 | 0.0000 | 4.5495 | 0.0000 | 0.2615 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 204 |
+
| 0.70 | 1 | 5.0512 | 0.0000 | 4.8472 | 0.0000 | 0.2040 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 205 |
+
| 0.80 | 1 | 5.4191 | 0.0000 | 5.2791 | 0.0000 | 0.1400 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 206 |
+
| 0.90 | 1 | 6.7257 | 0.0000 | 6.6972 | 0.0000 | 0.0285 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 207 |
+
|
| 208 |
+
### Prefix 4,000,000 Tokens (2.56 Effective Epochs)
|
| 209 |
+
|
| 210 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 211 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 212 |
+
| 0.00 | 1 | 4.3247 | 0.0000 | 3.8417 | 0.0000 | 0.4831 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 213 |
+
| 0.02 | 1 | 4.2947 | 0.0000 | 3.8547 | 0.0000 | 0.4400 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 214 |
+
| 0.05 | 1 | 4.3152 | 0.0000 | 3.9103 | 0.0000 | 0.4049 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 215 |
+
| 0.08 | 1 | 4.3002 | 0.0000 | 3.9315 | 0.0000 | 0.3687 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 216 |
+
| 0.10 | 1 | 4.3102 | 0.0000 | 3.9545 | 0.0000 | 0.3557 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 217 |
+
| 0.14 | 1 | 4.3244 | 0.0000 | 4.0087 | 0.0000 | 0.3157 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 218 |
+
| 0.20 | 1 | 4.3525 | 0.0000 | 4.0857 | 0.0000 | 0.2667 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 219 |
+
| 0.30 | 1 | 4.4108 | 0.0000 | 4.1817 | 0.0000 | 0.2291 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 220 |
+
| 0.40 | 1 | 4.4975 | 0.0000 | 4.2938 | 0.0000 | 0.2037 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 221 |
+
| 0.50 | 1 | 4.6069 | 0.0000 | 4.4319 | 0.0000 | 0.1750 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 222 |
+
| 0.60 | 1 | 4.7745 | 0.0000 | 4.6357 | 0.0000 | 0.1388 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 223 |
+
| 0.70 | 1 | 5.0131 | 0.0000 | 4.8970 | 0.0000 | 0.1162 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 224 |
+
| 0.80 | 1 | 5.4171 | 0.0000 | 5.3475 | 0.0000 | 0.0696 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 225 |
+
| 0.90 | 1 | 6.6239 | 0.0000 | 6.6037 | 0.0000 | 0.0202 | 0.0000 | 10,240,000 | 31,457,280 |
|
| 226 |
+
|
| 227 |
+
## Model `L8_H8_D256`
|
| 228 |
+
|
| 229 |
+
### Prefix 250,000 Tokens (40.96 Effective Epochs)
|
| 230 |
+
|
| 231 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 232 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 233 |
+
| 0.00 | 1 | 7.9175 | 0.0000 | 0.9524 | 0.0000 | 6.9651 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 234 |
+
| 0.02 | 1 | 7.2368 | 0.0000 | 1.2369 | 0.0000 | 5.9999 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 235 |
+
| 0.05 | 1 | 6.6006 | 0.0000 | 1.6339 | 0.0000 | 4.9666 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 236 |
+
| 0.08 | 1 | 6.2564 | 0.0000 | 1.9090 | 0.0000 | 4.3474 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 237 |
+
| 0.10 | 1 | 6.0914 | 0.0000 | 2.0858 | 0.0000 | 4.0056 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 238 |
+
| 0.14 | 1 | 5.8297 | 0.0000 | 2.4090 | 0.0000 | 3.4207 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 239 |
+
| 0.20 | 1 | 5.5966 | 0.0000 | 2.7805 | 0.0000 | 2.8161 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 240 |
+
| 0.30 | 1 | 5.4419 | 0.0000 | 3.2711 | 0.0000 | 2.1708 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 241 |
+
| 0.40 | 1 | 5.4175 | 0.0000 | 3.6411 | 0.0000 | 1.7763 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 242 |
+
| 0.50 | 1 | 5.4654 | 0.0000 | 3.9927 | 0.0000 | 1.4727 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 243 |
+
| 0.60 | 1 | 5.5867 | 0.0000 | 4.3203 | 0.0000 | 1.2663 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 244 |
+
| 0.70 | 1 | 5.7802 | 0.0000 | 4.7055 | 0.0000 | 1.0747 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 245 |
+
| 0.80 | 1 | 6.0904 | 0.0000 | 5.2467 | 0.0000 | 0.8437 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 246 |
+
| 0.90 | 1 | 7.1601 | 0.0000 | 6.7126 | 0.0000 | 0.4475 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 247 |
+
|
| 248 |
+
### Prefix 500,000 Tokens (20.48 Effective Epochs)
|
| 249 |
+
|
| 250 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 251 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 252 |
+
| 0.00 | 1 | 6.0796 | 0.0000 | 2.5776 | 0.0000 | 3.5021 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 253 |
+
| 0.02 | 1 | 5.6702 | 0.0000 | 2.8128 | 0.0000 | 2.8574 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 254 |
+
| 0.05 | 1 | 5.3633 | 0.0000 | 3.0388 | 0.0000 | 2.3245 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 255 |
+
| 0.08 | 1 | 5.2185 | 0.0000 | 3.2261 | 0.0000 | 1.9924 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 256 |
+
| 0.10 | 1 | 5.1607 | 0.0000 | 3.3108 | 0.0000 | 1.8499 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 257 |
+
| 0.14 | 1 | 5.0671 | 0.0000 | 3.4774 | 0.0000 | 1.5897 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 258 |
+
| 0.20 | 1 | 5.0216 | 0.0000 | 3.6979 | 0.0000 | 1.3238 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 259 |
+
| 0.30 | 1 | 5.0323 | 0.0000 | 3.9668 | 0.0000 | 1.0656 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 260 |
+
| 0.40 | 1 | 5.0917 | 0.0000 | 4.1940 | 0.0000 | 0.8976 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 261 |
+
| 0.50 | 1 | 5.2099 | 0.0000 | 4.4287 | 0.0000 | 0.7812 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 262 |
+
| 0.60 | 1 | 5.3372 | 0.0000 | 4.6669 | 0.0000 | 0.6703 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 263 |
+
| 0.70 | 1 | 5.5502 | 0.0000 | 4.9751 | 0.0000 | 0.5751 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 264 |
+
| 0.80 | 1 | 5.8850 | 0.0000 | 5.4346 | 0.0000 | 0.4504 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 265 |
+
| 0.90 | 1 | 7.1122 | 0.0000 | 6.8702 | 0.0000 | 0.2419 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 266 |
+
|
| 267 |
+
### Prefix 1,000,000 Tokens (10.24 Effective Epochs)
|
| 268 |
+
|
| 269 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 270 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 271 |
+
| 0.00 | 1 | 5.0210 | 0.0000 | 3.5921 | 0.0000 | 1.4288 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 272 |
+
| 0.02 | 1 | 4.8949 | 0.0000 | 3.6926 | 0.0000 | 1.2023 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 273 |
+
| 0.05 | 1 | 4.8096 | 0.0000 | 3.7745 | 0.0000 | 1.0351 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 274 |
+
| 0.08 | 1 | 4.7992 | 0.0000 | 3.8572 | 0.0000 | 0.9420 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 275 |
+
| 0.10 | 1 | 4.7946 | 0.0000 | 3.9262 | 0.0000 | 0.8684 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 276 |
+
| 0.14 | 1 | 4.7763 | 0.0000 | 3.9900 | 0.0000 | 0.7863 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 277 |
+
| 0.20 | 1 | 4.8010 | 0.0000 | 4.1303 | 0.0000 | 0.6708 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 278 |
+
| 0.30 | 1 | 4.8657 | 0.0000 | 4.2940 | 0.0000 | 0.5718 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 279 |
+
| 0.40 | 1 | 4.9626 | 0.0000 | 4.4759 | 0.0000 | 0.4867 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 280 |
+
| 0.50 | 1 | 5.0880 | 0.0000 | 4.6669 | 0.0000 | 0.4211 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 281 |
+
| 0.60 | 1 | 5.2370 | 0.0000 | 4.8647 | 0.0000 | 0.3724 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 282 |
+
| 0.70 | 1 | 5.4416 | 0.0000 | 5.1299 | 0.0000 | 0.3117 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 283 |
+
| 0.80 | 1 | 5.8087 | 0.0000 | 5.5785 | 0.0000 | 0.2302 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 284 |
+
| 0.90 | 1 | 7.1040 | 0.0000 | 6.9954 | 0.0000 | 0.1086 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 285 |
+
|
| 286 |
+
### Prefix 2,000,000 Tokens (5.12 Effective Epochs)
|
| 287 |
+
|
| 288 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 289 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 290 |
+
| 0.00 | 1 | 4.6498 | 0.0000 | 4.0862 | 0.0000 | 0.5635 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 291 |
+
| 0.02 | 1 | 4.6403 | 0.0000 | 4.1384 | 0.0000 | 0.5019 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 292 |
+
| 0.05 | 1 | 4.6319 | 0.0000 | 4.1963 | 0.0000 | 0.4356 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 293 |
+
| 0.08 | 1 | 4.6232 | 0.0000 | 4.2158 | 0.0000 | 0.4074 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 294 |
+
| 0.10 | 1 | 4.6527 | 0.0000 | 4.2675 | 0.0000 | 0.3852 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 295 |
+
| 0.14 | 1 | 4.6524 | 0.0000 | 4.2919 | 0.0000 | 0.3604 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 296 |
+
| 0.20 | 1 | 4.6882 | 0.0000 | 4.3827 | 0.0000 | 0.3055 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 297 |
+
| 0.30 | 1 | 4.7794 | 0.0000 | 4.5149 | 0.0000 | 0.2645 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 298 |
+
| 0.40 | 1 | 4.8796 | 0.0000 | 4.6519 | 0.0000 | 0.2278 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 299 |
+
| 0.50 | 1 | 5.0101 | 0.0000 | 4.8229 | 0.0000 | 0.1873 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 300 |
+
| 0.60 | 1 | 5.1623 | 0.0000 | 4.9993 | 0.0000 | 0.1630 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 301 |
+
| 0.70 | 1 | 5.3763 | 0.0000 | 5.2435 | 0.0000 | 0.1327 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 302 |
+
| 0.80 | 1 | 5.7417 | 0.0000 | 5.6466 | 0.0000 | 0.0950 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 303 |
+
| 0.90 | 1 | 7.0412 | 0.0000 | 7.0093 | 0.0000 | 0.0319 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 304 |
+
|
| 305 |
+
### Prefix 4,000,000 Tokens (2.56 Effective Epochs)
|
| 306 |
+
|
| 307 |
+
| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |
|
| 308 |
+
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|
| 309 |
+
| 0.00 | 1 | 4.5136 | 0.0000 | 4.2515 | 0.0000 | 0.2621 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 310 |
+
| 0.02 | 1 | 4.5287 | 0.0000 | 4.2760 | 0.0000 | 0.2527 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 311 |
+
| 0.05 | 1 | 4.5401 | 0.0000 | 4.3095 | 0.0000 | 0.2306 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 312 |
+
| 0.08 | 1 | 4.5701 | 0.0000 | 4.3547 | 0.0000 | 0.2154 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 313 |
+
| 0.10 | 1 | 4.5783 | 0.0000 | 4.3826 | 0.0000 | 0.1957 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 314 |
+
| 0.14 | 1 | 4.6069 | 0.0000 | 4.4129 | 0.0000 | 0.1939 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 315 |
+
| 0.20 | 1 | 4.6558 | 0.0000 | 4.4934 | 0.0000 | 0.1624 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 316 |
+
| 0.30 | 1 | 4.7516 | 0.0000 | 4.5996 | 0.0000 | 0.1520 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 317 |
+
| 0.40 | 1 | 4.8556 | 0.0000 | 4.7264 | 0.0000 | 0.1292 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 318 |
+
| 0.50 | 1 | 4.9782 | 0.0000 | 4.8738 | 0.0000 | 0.1044 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 319 |
+
| 0.60 | 1 | 5.1452 | 0.0000 | 5.0517 | 0.0000 | 0.0935 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 320 |
+
| 0.70 | 1 | 5.3607 | 0.0000 | 5.2833 | 0.0000 | 0.0774 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 321 |
+
| 0.80 | 1 | 5.7190 | 0.0000 | 5.6752 | 0.0000 | 0.0438 | 0.0000 | 10,240,000 | 8,388,608 |
|
| 322 |
+
| 0.90 | 1 | 7.0566 | 0.0000 | 7.0296 | 0.0000 | 0.0270 | 0.0000 | 10,240,000 | 8,388,608 |
|
runs/screen_static/20260525-133008/config.resume.json
CHANGED
|
@@ -8,6 +8,8 @@
|
|
| 8 |
"resume_from": "runs/screen_static/20260525-133008",
|
| 9 |
"cache_dir": ".cache/dropout_decay",
|
| 10 |
"models": [
|
|
|
|
|
|
|
| 11 |
"L16_H8_D384=16x8x384"
|
| 12 |
],
|
| 13 |
"seeds": [
|
|
@@ -16,7 +18,9 @@
|
|
| 16 |
"token_limits": [
|
| 17 |
250000,
|
| 18 |
500000,
|
| 19 |
-
1000000
|
|
|
|
|
|
|
| 20 |
],
|
| 21 |
"stream_token_caps": [
|
| 22 |
5000000,
|
|
@@ -34,6 +38,14 @@
|
|
| 34 |
"steps": 5000,
|
| 35 |
"stage_steps": 1000,
|
| 36 |
"dropout_rates": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
0.4,
|
| 38 |
0.5,
|
| 39 |
0.6,
|
|
@@ -64,6 +76,18 @@
|
|
| 64 |
1
|
| 65 |
],
|
| 66 |
"models": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
{
|
| 68 |
"model_name": "L16_H8_D384",
|
| 69 |
"n_layer": 16,
|
|
@@ -83,7 +107,9 @@
|
|
| 83 |
"effective_token_limits": [
|
| 84 |
250000,
|
| 85 |
500000,
|
| 86 |
-
1000000
|
|
|
|
|
|
|
| 87 |
],
|
| 88 |
"effective_stream_token_caps": [
|
| 89 |
5000000,
|
|
|
|
| 8 |
"resume_from": "runs/screen_static/20260525-133008",
|
| 9 |
"cache_dir": ".cache/dropout_decay",
|
| 10 |
"models": [
|
| 11 |
+
"L8_H8_D256=8x8x256",
|
| 12 |
+
"L12_H8_D320=12x8x320",
|
| 13 |
"L16_H8_D384=16x8x384"
|
| 14 |
],
|
| 15 |
"seeds": [
|
|
|
|
| 18 |
"token_limits": [
|
| 19 |
250000,
|
| 20 |
500000,
|
| 21 |
+
1000000,
|
| 22 |
+
2000000,
|
| 23 |
+
4000000
|
| 24 |
],
|
| 25 |
"stream_token_caps": [
|
| 26 |
5000000,
|
|
|
|
| 38 |
"steps": 5000,
|
| 39 |
"stage_steps": 1000,
|
| 40 |
"dropout_rates": [
|
| 41 |
+
0.0,
|
| 42 |
+
0.02,
|
| 43 |
+
0.05,
|
| 44 |
+
0.08,
|
| 45 |
+
0.1,
|
| 46 |
+
0.14,
|
| 47 |
+
0.2,
|
| 48 |
+
0.3,
|
| 49 |
0.4,
|
| 50 |
0.5,
|
| 51 |
0.6,
|
|
|
|
| 76 |
1
|
| 77 |
],
|
| 78 |
"models": [
|
| 79 |
+
{
|
| 80 |
+
"model_name": "L8_H8_D256",
|
| 81 |
+
"n_layer": 8,
|
| 82 |
+
"n_head": 8,
|
| 83 |
+
"n_embd": 256
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"model_name": "L12_H8_D320",
|
| 87 |
+
"n_layer": 12,
|
| 88 |
+
"n_head": 8,
|
| 89 |
+
"n_embd": 320
|
| 90 |
+
},
|
| 91 |
{
|
| 92 |
"model_name": "L16_H8_D384",
|
| 93 |
"n_layer": 16,
|
|
|
|
| 107 |
"effective_token_limits": [
|
| 108 |
250000,
|
| 109 |
500000,
|
| 110 |
+
1000000,
|
| 111 |
+
2000000,
|
| 112 |
+
4000000
|
| 113 |
],
|
| 114 |
"effective_stream_token_caps": [
|
| 115 |
5000000,
|
runs/screen_static/20260525-133008/dropout_curves.svg
CHANGED
|
|
|
|
runs/screen_static/20260525-133008/metrics.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
runs/screen_static/20260525-133008/model_selection.csv
CHANGED
|
@@ -1,5 +1,16 @@
|
|
| 1 |
run_mode,token_limit,model_name,n_layer,n_head,n_embd,parameters,n,best_dropout,best_val_loss,best_val_std,plateau_start_dropout,plateau_end_dropout,plateau_delta,zero_dropout_val_loss,zero_minus_best,best_nonzero_dropout,best_nonzero_val_loss,zero_minus_best_nonzero,max_dropout,max_dropout_val_loss,max_dropout_minus_best,has_nonzero_optimum,meets_target_dropout,curve_json
|
| 2 |
-
screen_static,2000000,L16_H8_D384,16,8,384,31457280,1,0.14,4.427024222910404,0.0,0.1,0.14,0.01,4.625189505517483,0.19816528260707855,0.14,4.427024222910404,0.19816528260707855,0.
|
| 3 |
-
screen_static,
|
| 4 |
-
screen_static,
|
| 5 |
-
screen_static,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
run_mode,token_limit,model_name,n_layer,n_head,n_embd,parameters,n,best_dropout,best_val_loss,best_val_std,plateau_start_dropout,plateau_end_dropout,plateau_delta,zero_dropout_val_loss,zero_minus_best,best_nonzero_dropout,best_nonzero_val_loss,zero_minus_best_nonzero,max_dropout,max_dropout_val_loss,max_dropout_minus_best,has_nonzero_optimum,meets_target_dropout,curve_json
|
| 2 |
+
screen_static,2000000,L16_H8_D384,16,8,384,31457280,1,0.14,4.427024222910404,0.0,0.1,0.14,0.01,4.625189505517483,0.19816528260707855,0.14,4.427024222910404,0.19816528260707855,0.9,6.725677810609341,2.2986535876989365,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 1.2505649104714394, ""mean_train_loss"": 3.3746245950460434, ""mean_val_loss"": 4.625189505517483, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 1.0588822290301323, ""mean_train_loss"": 3.4837590381503105, ""mean_val_loss"": 4.542641267180443, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 0.9152051359415054, ""mean_train_loss"": 3.57197318226099, ""mean_val_loss"": 4.487178318202496, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.8277053758502007, ""mean_train_loss"": 3.6279477402567863, ""mean_val_loss"": 4.455653116106987, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.74983249604702, ""mean_train_loss"": 3.686768524348736, ""mean_val_loss"": 4.436601020395756, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.6508964970707893, ""mean_train_loss"": 3.776127725839615, ""mean_val_loss"": 4.427024222910404, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.5422321110963821, ""mean_train_loss"": 3.9085786044597626, ""mean_val_loss"": 4.450810715556145, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.4548688307404518, ""mean_train_loss"": 4.04004543274641, ""mean_val_loss"": 4.494914263486862, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.390671081840992, ""mean_train_loss"": 4.1651866137981415, ""mean_val_loss"": 4.5558576956391335, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.3157801032066345, ""mean_train_loss"": 4.337527960538864, ""mean_val_loss"": 4.653308063745499, 
""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.26150281727313995, ""mean_train_loss"": 4.549529016017914, ""mean_val_loss"": 4.811031833291054, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.2040063962340355, ""mean_train_loss"": 4.847206577658653, ""mean_val_loss"": 5.051212973892689, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.14004798233509064, ""mean_train_loss"": 5.27906721830368, ""mean_val_loss"": 5.419115200638771, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.028456024825572968, ""mean_train_loss"": 6.697221785783768, ""mean_val_loss"": 6.725677810609341, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 3 |
+
screen_static,2000000,L12_H8_D320,12,8,320,17367040,1,0.14,4.508757032454014,0.0,0.05,0.14,0.01,4.59531170129776,0.08655466884374619,0.14,4.508757032454014,0.08655466884374619,0.9,6.741106614470482,2.232349582016468,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 0.8792819976806641, ""mean_train_loss"": 3.716029703617096, ""mean_val_loss"": 4.59531170129776, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 0.7727920487523079, ""mean_train_loss"": 3.7766708433628082, ""mean_val_loss"": 4.549462892115116, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 0.658709242939949, ""mean_train_loss"": 3.8593605384230614, ""mean_val_loss"": 4.51806978136301, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.5811423286795616, ""mean_train_loss"": 3.9309239983558655, ""mean_val_loss"": 4.512066327035427, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.5585346668958664, ""mean_train_loss"": 3.9626980274915695, ""mean_val_loss"": 4.521232694387436, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.486968994140625, ""mean_train_loss"": 4.021788038313389, ""mean_val_loss"": 4.508757032454014, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.4391852915287018, ""mean_train_loss"": 4.107810087502003, ""mean_val_loss"": 4.5469953790307045, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.35860010236501694, ""mean_train_loss"": 4.248827308416367, ""mean_val_loss"": 4.6074274107813835, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.30667147785425186, ""mean_train_loss"": 4.388647809624672, ""mean_val_loss"": 4.695319287478924, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.2494005709886551, ""mean_train_loss"": 4.564960986375809, ""mean_val_loss"": 
4.814361557364464, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.21227668970823288, ""mean_train_loss"": 4.756197020411491, ""mean_val_loss"": 4.968473710119724, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.17023956030607224, ""mean_train_loss"": 5.009785428643227, ""mean_val_loss"": 5.180024988949299, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.11836093664169312, ""mean_train_loss"": 5.402336552739143, ""mean_val_loss"": 5.5206974893808365, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.020013347268104553, ""mean_train_loss"": 6.721093267202377, ""mean_val_loss"": 6.741106614470482, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 4 |
+
screen_static,1000000,L16_H8_D384,16,8,384,31457280,1,0.3,4.651066102087498,0.0,0.3,0.3,0.01,5.652379140257835,1.0013130381703377,0.3,4.651066102087498,1.0013130381703377,0.9,6.627339482307434,1.9762733802199364,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 3.4047557078301907, ""mean_train_loss"": 2.2476234324276447, ""mean_val_loss"": 5.652379140257835, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 2.873790219426155, ""mean_train_loss"": 2.4646411538124084, ""mean_val_loss"": 5.3384313732385635, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 2.4113941341638565, ""mean_train_loss"": 2.7036017999053, ""mean_val_loss"": 5.114995934069157, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 2.0462540313601494, ""mean_train_loss"": 2.8963891118764877, ""mean_val_loss"": 4.942643143236637, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 1.8506078496575356, ""mean_train_loss"": 2.992223806679249, ""mean_val_loss"": 4.842831656336784, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 1.5568379163742065, ""mean_train_loss"": 3.1947626248002052, ""mean_val_loss"": 4.751600541174412, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 1.282993070781231, ""mean_train_loss"": 3.3926073163747787, ""mean_val_loss"": 4.67560038715601, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 1.0215996354818344, ""mean_train_loss"": 3.6294664666056633, ""mean_val_loss"": 4.651066102087498, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.8495340198278427, ""mean_train_loss"": 3.8496963307261467, ""mean_val_loss"": 4.699230350553989, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.7054777815937996, ""mean_train_loss"": 4.063626378774643, ""mean_val_loss"": 4.7691041603684425, 
""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.57613954693079, ""mean_train_loss"": 4.328054502606392, ""mean_val_loss"": 4.904194049537182, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.45122554898262024, ""mean_train_loss"": 4.668859913945198, ""mean_val_loss"": 5.120085462927818, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.3151302859187126, ""mean_train_loss"": 5.189247876405716, ""mean_val_loss"": 5.504378162324429, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.11999441683292389, ""mean_train_loss"": 6.50734506547451, ""mean_val_loss"": 6.627339482307434, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 5 |
+
screen_static,1000000,L12_H8_D320,12,8,320,17367040,1,0.2,4.687077932059765,0.0,0.2,0.2,0.01,5.327972687780857,0.6408947557210922,0.2,4.687077932059765,0.6408947557210922,0.9,6.74848935008049,2.0614114180207253,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 2.441593125462532, ""mean_train_loss"": 2.886379562318325, ""mean_val_loss"": 5.327972687780857, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 2.035951793193817, ""mean_train_loss"": 3.0628975853323936, ""mean_val_loss"": 5.098849378526211, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 1.634356364607811, ""mean_train_loss"": 3.268335275352001, ""mean_val_loss"": 4.902691639959812, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 1.3981640711426735, ""mean_train_loss"": 3.3916852474212646, ""mean_val_loss"": 4.789849318563938, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 1.285424992442131, ""mean_train_loss"": 3.4663245379924774, ""mean_val_loss"": 4.7517495304346085, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 1.1256954967975616, ""mean_train_loss"": 3.591317318379879, ""mean_val_loss"": 4.717012815177441, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.9711186662316322, ""mean_train_loss"": 3.7159592658281326, ""mean_val_loss"": 4.687077932059765, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.7806104272603989, ""mean_train_loss"": 3.9408339336514473, ""mean_val_loss"": 4.721444360911846, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.6547741517424583, ""mean_train_loss"": 4.1257737800478935, ""mean_val_loss"": 4.780547931790352, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.557686097919941, ""mean_train_loss"": 4.341994017362595, ""mean_val_loss"": 4.8996801152825356, 
""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.47620102018117905, ""mean_train_loss"": 4.583218589425087, ""mean_val_loss"": 5.059419609606266, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.39166851341724396, ""mean_train_loss"": 4.866017505526543, ""mean_val_loss"": 5.257686018943787, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.29053743183612823, ""mean_train_loss"": 5.279122695326805, ""mean_val_loss"": 5.569660127162933, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.11938704550266266, ""mean_train_loss"": 6.6291023045778275, ""mean_val_loss"": 6.74848935008049, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 6 |
+
screen_static,1000000,L8_H8_D256,8,8,256,8388608,1,0.14,4.776347942650318,0.0,0.14,0.14,0.01,5.020985089242458,0.2446371465921402,0.14,4.776347942650318,0.2446371465921402,0.9,7.103986039757729,2.3276380971074104,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 1.4288483187556267, ""mean_train_loss"": 3.5921367704868317, ""mean_val_loss"": 5.020985089242458, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 1.2022551372647285, ""mean_train_loss"": 3.692635416984558, ""mean_val_loss"": 4.894890554249287, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 1.03511231392622, ""mean_train_loss"": 3.774454675614834, ""mean_val_loss"": 4.809566989541054, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.941984124481678, ""mean_train_loss"": 3.8572388663887978, ""mean_val_loss"": 4.799222990870476, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.8684338107705116, ""mean_train_loss"": 3.9261686205863953, ""mean_val_loss"": 4.794602431356907, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.7863189205527306, ""mean_train_loss"": 3.9900290220975876, ""mean_val_loss"": 4.776347942650318, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.6707855090498924, ""mean_train_loss"": 4.130261890590191, ""mean_val_loss"": 4.801047399640083, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.5717819258570671, ""mean_train_loss"": 4.293951943516731, ""mean_val_loss"": 4.865733869373798, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.4866805672645569, ""mean_train_loss"": 4.475886330008507, ""mean_val_loss"": 4.962566897273064, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.42112673074007034, ""mean_train_loss"": 4.666878193616867, ""mean_val_loss"": 5.088004924356937, 
""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.3723558411002159, ""mean_train_loss"": 4.864694103598595, ""mean_val_loss"": 5.237049944698811, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.3116709440946579, ""mean_train_loss"": 5.129916787147522, ""mean_val_loss"": 5.44158773124218, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.23021624982357025, ""mean_train_loss"": 5.578491851687431, ""mean_val_loss"": 5.808708101511002, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.10863403975963593, ""mean_train_loss"": 6.995351999998093, ""mean_val_loss"": 7.103986039757729, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 7 |
+
screen_static,500000,L12_H8_D320,12,8,320,17367040,1,0.4,4.979132980108261,0.0,0.3,0.4,0.01,6.882166460156441,1.9030334800481796,0.4,4.979132980108261,1.9030334800481796,0.9,6.766090348362923,1.7869573682546616,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 5.444314293563366, ""mean_train_loss"": 1.4378521665930748, ""mean_val_loss"": 6.882166460156441, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 4.635581977665424, ""mean_train_loss"": 1.7486785426735878, ""mean_val_loss"": 6.384260520339012, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 3.839024405926466, ""mean_train_loss"": 2.0567418597638607, ""mean_val_loss"": 5.895766265690327, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 3.225618854165077, ""mean_train_loss"": 2.371535360813141, ""mean_val_loss"": 5.597154214978218, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 2.958456054329872, ""mean_train_loss"": 2.4893965795636177, ""mean_val_loss"": 5.44785263389349, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 2.512647695839405, ""mean_train_loss"": 2.7468534484505653, ""mean_val_loss"": 5.25950114428997, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 2.0600279718637466, ""mean_train_loss"": 3.0360172167420387, ""mean_val_loss"": 5.096045188605785, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 1.5742274522781372, ""mean_train_loss"": 3.40982835739851, ""mean_val_loss"": 4.984055809676647, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 1.2433720752596855, ""mean_train_loss"": 3.7357609048485756, ""mean_val_loss"": 4.979132980108261, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 1.0286902412772179, ""mean_train_loss"": 4.029533125460148, ""mean_val_loss"": 5.058223366737366, ""n"": 
1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.8609125763177872, ""mean_train_loss"": 4.322844922542572, ""mean_val_loss"": 5.183757498860359, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.703565426170826, ""mean_train_loss"": 4.680620342493057, ""mean_val_loss"": 5.384185768663883, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.530634418129921, ""mean_train_loss"": 5.157206907868385, ""mean_val_loss"": 5.687841325998306, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.27382446825504303, ""mean_train_loss"": 6.49226588010788, ""mean_val_loss"": 6.766090348362923, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 8 |
+
screen_static,500000,L16_H8_D384,16,8,384,31457280,1,0.4,4.98137766122818,0.0,0.4,0.5,0.01,7.288873381912708,2.3074957206845284,0.4,4.98137766122818,2.3074957206845284,0.9,6.620277687907219,1.638900026679039,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 6.568858440965414, ""mean_train_loss"": 0.7200149409472942, ""mean_val_loss"": 7.288873381912708, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 5.983665386214852, ""mean_train_loss"": 0.9117673244327307, ""mean_val_loss"": 6.895432710647583, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 5.264719225466251, ""mean_train_loss"": 1.2327042073011398, ""mean_val_loss"": 6.497423432767391, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 4.569968264549971, ""mean_train_loss"": 1.5379231162369251, ""mean_val_loss"": 6.107891380786896, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 4.221366349607706, ""mean_train_loss"": 1.6818840466439724, ""mean_val_loss"": 5.9032503962516785, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 3.635788831859827, ""mean_train_loss"": 1.9904268346726894, ""mean_val_loss"": 5.6262156665325165, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 2.9111227244138718, ""mean_train_loss"": 2.416015110909939, ""mean_val_loss"": 5.327137835323811, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 2.17129834741354, ""mean_train_loss"": 2.8952616825699806, ""mean_val_loss"": 5.0665600299835205, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 1.7016963437199593, ""mean_train_loss"": 3.2796813175082207, ""mean_val_loss"": 4.98137766122818, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 1.3544480353593826, ""mean_train_loss"": 3.629131570458412, ""mean_val_loss"": 4.983579605817795, ""n"": 
1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 1.0571283996105194, ""mean_train_loss"": 4.0092809200286865, ""mean_val_loss"": 5.066409319639206, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.8311320170760155, ""mean_train_loss"": 4.435714960098267, ""mean_val_loss"": 5.266846977174282, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.5717534348368645, ""mean_train_loss"": 5.014922052621841, ""mean_val_loss"": 5.586675487458706, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.25871309638023376, ""mean_train_loss"": 6.361564591526985, ""mean_val_loss"": 6.620277687907219, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 9 |
+
screen_static,500000,L8_H8_D256,8,8,256,8388608,1,0.2,5.02164863795042,0.0,0.2,0.2,0.01,6.079639628529549,1.0579909905791283,0.2,5.02164863795042,1.0579909905791283,0.9,7.112152203917503,2.090503565967083,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 3.502076916396618, ""mean_train_loss"": 2.5775627121329308, ""mean_val_loss"": 6.079639628529549, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 2.8574068769812584, ""mean_train_loss"": 2.8127700313925743, ""mean_val_loss"": 5.670176908373833, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 2.324491947889328, ""mean_train_loss"": 3.0387847647070885, ""mean_val_loss"": 5.3632767125964165, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 1.9923998713493347, ""mean_train_loss"": 3.2260689064860344, ""mean_val_loss"": 5.218468777835369, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 1.8499257937073708, ""mean_train_loss"": 3.3107700124382973, ""mean_val_loss"": 5.160695806145668, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 1.5897220373153687, ""mean_train_loss"": 3.4774143397808075, ""mean_val_loss"": 5.067136377096176, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 1.3237614631652832, ""mean_train_loss"": 3.697887174785137, ""mean_val_loss"": 5.02164863795042, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 1.0655959695577621, ""mean_train_loss"": 3.9667534679174423, ""mean_val_loss"": 5.0323494374752045, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.8976302519440651, ""mean_train_loss"": 4.194036483764648, ""mean_val_loss"": 5.0916667357087135, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.7811932787299156, ""mean_train_loss"": 4.428740322589874, ""mean_val_loss"": 5.20993360131979, ""n"": 
1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.6703063324093819, ""mean_train_loss"": 4.666903376579285, ""mean_val_loss"": 5.3372097089886665, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.5750636905431747, ""mean_train_loss"": 4.97511188685894, ""mean_val_loss"": 5.550175577402115, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.4504067897796631, ""mean_train_loss"": 5.43462572991848, ""mean_val_loss"": 5.885032519698143, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.24193444848060608, ""mean_train_loss"": 6.870217755436897, ""mean_val_loss"": 7.112152203917503, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 10 |
+
screen_static,250000,L8_H8_D256,8,8,256,8388608,1,0.4,5.417454726994038,0.0,0.4,0.4,0.01,7.917540371417999,2.5000856444239616,0.4,5.417454726994038,2.5000856444239616,0.9,7.160110808908939,1.7426560819149017,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 6.9651469346135855, ""mean_train_loss"": 0.9523934368044138, ""mean_val_loss"": 7.917540371417999, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 5.999898966401815, ""mean_train_loss"": 1.2368882782757282, ""mean_val_loss"": 7.236787244677544, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 4.966615006327629, ""mean_train_loss"": 1.6339460164308548, ""mean_val_loss"": 6.600561022758484, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 4.347427327185869, ""mean_train_loss"": 1.9089897610247135, ""mean_val_loss"": 6.256417088210583, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 4.005591854453087, ""mean_train_loss"": 2.0858395472168922, ""mean_val_loss"": 6.091431401669979, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 3.420706979930401, ""mean_train_loss"": 2.409040831029415, ""mean_val_loss"": 5.829747810959816, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 2.8160821720957756, ""mean_train_loss"": 2.780476927757263, ""mean_val_loss"": 5.596559099853039, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 2.1707596853375435, ""mean_train_loss"": 3.2711103558540344, ""mean_val_loss"": 5.441870041191578, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 1.7763118371367455, ""mean_train_loss"": 3.641142889857292, ""mean_val_loss"": 5.417454726994038, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 1.472730539739132, ""mean_train_loss"": 3.9926967695355415, ""mean_val_loss"": 5.4654273092746735, ""n"": 
1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 1.2663473039865494, ""mean_train_loss"": 4.320340022444725, ""mean_val_loss"": 5.586687326431274, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 1.0747108310461044, ""mean_train_loss"": 4.705530673265457, ""mean_val_loss"": 5.780241504311562, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.8436890691518784, ""mean_train_loss"": 5.24668562412262, ""mean_val_loss"": 6.090374693274498, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.44749947637319565, ""mean_train_loss"": 6.712611332535744, ""mean_val_loss"": 7.160110808908939, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 11 |
+
screen_static,250000,L12_H8_D320,12,8,320,17367040,1,0.5,5.438360869884491,0.0,0.5,0.5,0.01,8.15590063482523,2.7175397649407387,0.5,5.438360869884491,2.7175397649407387,0.9,6.905720971524715,1.4673601016402245,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 7.7617882722988725, ""mean_train_loss"": 0.3941123625263572, ""mean_val_loss"": 8.15590063482523, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 7.574556526727974, ""mean_train_loss"": 0.4179071066901088, ""mean_val_loss"": 7.992463633418083, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 6.955689197406173, ""mean_train_loss"": 0.5652239341288805, ""mean_val_loss"": 7.520913131535053, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 6.330640483647585, ""mean_train_loss"": 0.7794149778783321, ""mean_val_loss"": 7.110055461525917, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 6.015923976898193, ""mean_train_loss"": 0.9396447688341141, ""mean_val_loss"": 6.955568745732307, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 5.296522863209248, ""mean_train_loss"": 1.2607302814722061, ""mean_val_loss"": 6.557253144681454, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 4.479952562600374, ""mean_train_loss"": 1.6681972332298756, ""mean_val_loss"": 6.14814979583025, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 3.3666396513581276, ""mean_train_loss"": 2.34628177434206, ""mean_val_loss"": 5.712921425700188, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 2.6368888467550278, ""mean_train_loss"": 2.881612576544285, ""mean_val_loss"": 5.518501423299313, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 2.06633959710598, ""mean_train_loss"": 3.372021272778511, ""mean_val_loss"": 5.438360869884491, ""n"": 1, 
""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 1.676231786608696, ""mean_train_loss"": 3.8244777768850327, ""mean_val_loss"": 5.500709563493729, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 1.338156834244728, ""mean_train_loss"": 4.315010622143745, ""mean_val_loss"": 5.6531674563884735, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 1.018489956855774, ""mean_train_loss"": 4.907547950744629, ""mean_val_loss"": 5.926037907600403, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.5023725405335426, ""mean_train_loss"": 6.403348430991173, ""mean_val_loss"": 6.905720971524715, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 12 |
+
screen_static,250000,L16_H8_D384,16,8,384,31457280,1,0.6,5.505546368658543,0.0,0.6,0.6,0.01,8.201775573194027,2.6962292045354843,0.6,5.505546368658543,2.6962292045354843,0.9,6.843862466514111,1.338316097855568,True,True,"[{""dropout"": 0.0, ""mean_generalization_gap"": 7.9225469790399075, ""mean_train_loss"": 0.2792285941541195, ""mean_val_loss"": 8.201775573194027, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 7.81083921296522, ""mean_train_loss"": 0.2605770383961499, ""mean_val_loss"": 8.07141625136137, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 7.690946782939136, ""mean_train_loss"": 0.26725416351109743, ""mean_val_loss"": 7.9582009464502335, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 7.46396254748106, ""mean_train_loss"": 0.29682786762714386, ""mean_val_loss"": 7.760790415108204, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 7.249795647338033, ""mean_train_loss"": 0.338621674105525, ""mean_val_loss"": 7.588417321443558, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 6.785148191265762, ""mean_train_loss"": 0.49970753211528063, ""mean_val_loss"": 7.2848557233810425, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 6.015826283022761, ""mean_train_loss"": 0.8224823493510485, ""mean_val_loss"": 6.83830863237381, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 4.851862885057926, ""mean_train_loss"": 1.391046978533268, ""mean_val_loss"": 6.242909863591194, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 3.791083261370659, ""mean_train_loss"": 2.053295038640499, ""mean_val_loss"": 5.844378300011158, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 2.913642890751362, ""mean_train_loss"": 2.678894154727459, ""mean_val_loss"": 5.592537045478821, ""n"": 1, 
""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 2.1870318800210953, ""mean_train_loss"": 3.3185144886374474, ""mean_val_loss"": 5.505546368658543, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 1.656675435602665, ""mean_train_loss"": 3.9233425855636597, ""mean_val_loss"": 5.580018021166325, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 1.114737719297409, ""mean_train_loss"": 4.725207552313805, ""mean_val_loss"": 5.839945271611214, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.48853396624326706, ""mean_train_loss"": 6.3553285002708435, ""mean_val_loss"": 6.843862466514111, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 13 |
+
screen_static,4000000,L16_H8_D384,16,8,384,31457280,1,0.02,4.294691443443298,0.0,0.02,0.08,0.01,4.324729532003403,0.03003808856010437,0.02,4.294691443443298,0.03003808856010437,0.9,6.623879760503769,2.3291883170604706,True,False,"[{""dropout"": 0.0, ""mean_generalization_gap"": 0.4830755889415741, ""mean_train_loss"": 3.8416539430618286, ""mean_val_loss"": 4.324729532003403, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 0.44003017246723175, ""mean_train_loss"": 3.8546612709760666, ""mean_val_loss"": 4.294691443443298, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 0.40492552518844604, ""mean_train_loss"": 3.910273350775242, ""mean_val_loss"": 4.315198875963688, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.3686877265572548, ""mean_train_loss"": 3.9314780607819557, ""mean_val_loss"": 4.3001657873392105, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.3556562587618828, ""mean_train_loss"": 3.9545229598879814, ""mean_val_loss"": 4.310179218649864, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.31573040783405304, ""mean_train_loss"": 4.008677661418915, ""mean_val_loss"": 4.324408069252968, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.2667204812169075, ""mean_train_loss"": 4.085749976336956, ""mean_val_loss"": 4.3524704575538635, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.22910351306200027, ""mean_train_loss"": 4.181676417589188, ""mean_val_loss"": 4.410779930651188, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.20371749252080917, ""mean_train_loss"": 4.293805286288261, ""mean_val_loss"": 4.497522778809071, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.17503493279218674, ""mean_train_loss"": 4.431871071457863, ""mean_val_loss"": 
4.60690600425005, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.13883397728204727, ""mean_train_loss"": 4.635675564408302, ""mean_val_loss"": 4.77450954169035, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.11617618799209595, ""mean_train_loss"": 4.896953746676445, ""mean_val_loss"": 5.013129934668541, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.06960213929414749, ""mean_train_loss"": 5.347471252083778, ""mean_val_loss"": 5.417073391377926, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.020191222429275513, ""mean_train_loss"": 6.603688538074493, ""mean_val_loss"": 6.623879760503769, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 14 |
+
screen_static,4000000,L12_H8_D320,12,8,320,17367040,1,0.02,4.387514792382717,0.0,0.02,0.02,0.01,4.399908438324928,0.012393645942211151,0.02,4.387514792382717,0.012393645942211151,0.9,6.7735652178525925,2.3860504254698753,True,False,"[{""dropout"": 0.0, ""mean_generalization_gap"": 0.3676489070057869, ""mean_train_loss"": 4.032259531319141, ""mean_val_loss"": 4.399908438324928, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 0.35747382044792175, ""mean_train_loss"": 4.030040971934795, ""mean_val_loss"": 4.387514792382717, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 0.3134579360485077, ""mean_train_loss"": 4.092799432575703, ""mean_val_loss"": 4.40625736862421, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.28922586888074875, ""mean_train_loss"": 4.108970053493977, ""mean_val_loss"": 4.398195922374725, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.27910205721855164, ""mean_train_loss"": 4.135565012693405, ""mean_val_loss"": 4.414667069911957, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.25001612305641174, ""mean_train_loss"": 4.194389328360558, ""mean_val_loss"": 4.444405451416969, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.23343528807163239, ""mean_train_loss"": 4.247993364930153, ""mean_val_loss"": 4.481428653001785, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.19782067835330963, ""mean_train_loss"": 4.347917139530182, ""mean_val_loss"": 4.5457378178834915, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.16299572587013245, ""mean_train_loss"": 4.486644446849823, ""mean_val_loss"": 4.6496401727199554, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.13463184982538223, ""mean_train_loss"": 4.6344869285821915, ""mean_val_loss"": 
4.769118778407574, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.12295009940862656, ""mean_train_loss"": 4.809540346264839, ""mean_val_loss"": 4.932490445673466, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.09925898164510727, ""mean_train_loss"": 5.056431487202644, ""mean_val_loss"": 5.155690468847752, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.05897616595029831, ""mean_train_loss"": 5.429492920637131, ""mean_val_loss"": 5.488469086587429, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.01472686231136322, ""mean_train_loss"": 6.758838355541229, ""mean_val_loss"": 6.7735652178525925, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 15 |
+
screen_static,2000000,L8_H8_D256,8,8,256,8388608,1,0.08,4.623203128576279,0.0,0.05,0.08,0.01,4.6497810408473015,0.026577912271022797,0.08,4.623203128576279,0.026577912271022797,0.9,7.041225396096706,2.4180222675204277,True,False,"[{""dropout"": 0.0, ""mean_generalization_gap"": 0.5635415837168694, ""mean_train_loss"": 4.086239457130432, ""mean_val_loss"": 4.6497810408473015, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 0.5019041448831558, ""mean_train_loss"": 4.138374790549278, ""mean_val_loss"": 4.640278935432434, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 0.43557654321193695, ""mean_train_loss"": 4.196284607052803, ""mean_val_loss"": 4.63186115026474, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.40735629945993423, ""mean_train_loss"": 4.2158468291163445, ""mean_val_loss"": 4.623203128576279, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.3851764276623726, ""mean_train_loss"": 4.267480067908764, ""mean_val_loss"": 4.6526564955711365, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.36042413860559464, ""mean_train_loss"": 4.291930258274078, ""mean_val_loss"": 4.652354396879673, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.3055316135287285, ""mean_train_loss"": 4.382655680179596, ""mean_val_loss"": 4.688187293708324, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.26447462290525436, ""mean_train_loss"": 4.514920085668564, ""mean_val_loss"": 4.779394708573818, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.22775156050920486, ""mean_train_loss"": 4.651856943964958, ""mean_val_loss"": 4.879608504474163, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.18726412951946259, ""mean_train_loss"": 4.822874888777733, ""mean_val_loss"": 
5.010139018297195, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.16301864385604858, ""mean_train_loss"": 4.99927744269371, ""mean_val_loss"": 5.162296086549759, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.1327260509133339, ""mean_train_loss"": 5.243539854884148, ""mean_val_loss"": 5.3762659057974815, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.09502238035202026, ""mean_train_loss"": 5.64663989841938, ""mean_val_loss"": 5.7416622787714005, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.0319184884428978, ""mean_train_loss"": 7.009306907653809, ""mean_val_loss"": 7.041225396096706, ""n"": 1, ""std_val_loss"": 0.0}]"
|
| 16 |
+
screen_static,4000000,L8_H8_D256,8,8,256,8388608,1,0.0,4.513561494648457,0.0,0.0,0.0,0.01,4.513561494648457,0.0,0.02,4.528690077364445,-0.01512858271598816,0.9,7.056577272713184,2.543015778064728,False,False,"[{""dropout"": 0.0, ""mean_generalization_gap"": 0.2620946392416954, ""mean_train_loss"": 4.251466855406761, ""mean_val_loss"": 4.513561494648457, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.02, ""mean_generalization_gap"": 0.2527272030711174, ""mean_train_loss"": 4.275962874293327, ""mean_val_loss"": 4.528690077364445, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.05, ""mean_generalization_gap"": 0.23059415817260742, ""mean_train_loss"": 4.309461995959282, ""mean_val_loss"": 4.540056154131889, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.08, ""mean_generalization_gap"": 0.21540386974811554, ""mean_train_loss"": 4.354661911725998, ""mean_val_loss"": 4.5700657814741135, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.1, ""mean_generalization_gap"": 0.1957206204533577, ""mean_train_loss"": 4.382582053542137, ""mean_val_loss"": 4.578302673995495, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.14, ""mean_generalization_gap"": 0.19393974542617798, ""mean_train_loss"": 4.412918761372566, ""mean_val_loss"": 4.606858506798744, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.2, ""mean_generalization_gap"": 0.16235049068927765, ""mean_train_loss"": 4.493421167135239, ""mean_val_loss"": 4.655771657824516, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.3, ""mean_generalization_gap"": 0.15203186124563217, ""mean_train_loss"": 4.599567919969559, ""mean_val_loss"": 4.751599781215191, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.4, ""mean_generalization_gap"": 0.12918514013290405, ""mean_train_loss"": 4.726367101073265, ""mean_val_loss"": 4.855552241206169, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.5, ""mean_generalization_gap"": 0.10444032400846481, ""mean_train_loss"": 4.873788967728615, ""mean_val_loss"": 4.97822929173708, ""n"": 1, 
""std_val_loss"": 0.0}, {""dropout"": 0.6, ""mean_generalization_gap"": 0.09351418912410736, ""mean_train_loss"": 5.051660537719727, ""mean_val_loss"": 5.145174726843834, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.7, ""mean_generalization_gap"": 0.0773598924279213, ""mean_train_loss"": 5.283298075199127, ""mean_val_loss"": 5.3606579676270485, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.8, ""mean_generalization_gap"": 0.04376060515642166, ""mean_train_loss"": 5.67519947886467, ""mean_val_loss"": 5.7189600840210915, ""n"": 1, ""std_val_loss"": 0.0}, {""dropout"": 0.9, ""mean_generalization_gap"": 0.027009807527065277, ""mean_train_loss"": 7.029567465186119, ""mean_val_loss"": 7.056577272713184, ""n"": 1, ""std_val_loss"": 0.0}]"
|
runs/screen_static/20260525-133008/model_selection.json
CHANGED
|
@@ -19,12 +19,39 @@
|
|
| 19 |
"best_nonzero_dropout": 0.14,
|
| 20 |
"best_nonzero_val_loss": 4.427024222910404,
|
| 21 |
"zero_minus_best_nonzero": 0.19816528260707855,
|
| 22 |
-
"max_dropout": 0.
|
| 23 |
-
"max_dropout_val_loss":
|
| 24 |
-
"max_dropout_minus_best":
|
| 25 |
"has_nonzero_optimum": true,
|
| 26 |
"meets_target_dropout": true,
|
| 27 |
-
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 1.2505649104714394, \"mean_train_loss\": 3.3746245950460434, \"mean_val_loss\": 4.625189505517483, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 1.0588822290301323, \"mean_train_loss\": 3.4837590381503105, \"mean_val_loss\": 4.542641267180443, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.9152051359415054, \"mean_train_loss\": 3.57197318226099, \"mean_val_loss\": 4.487178318202496, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.74983249604702, \"mean_train_loss\": 3.686768524348736, \"mean_val_loss\": 4.436601020395756, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.6508964970707893, \"mean_train_loss\": 3.776127725839615, \"mean_val_loss\": 4.427024222910404, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.5422321110963821, \"mean_train_loss\": 3.9085786044597626, \"mean_val_loss\": 4.450810715556145, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.4548688307404518, \"mean_train_loss\": 4.04004543274641, \"mean_val_loss\": 4.494914263486862, \"n\": 1, \"std_val_loss\": 0.0}]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
},
|
| 29 |
{
|
| 30 |
"run_mode": "screen_static",
|
|
@@ -51,7 +78,88 @@
|
|
| 51 |
"max_dropout_minus_best": 1.9762733802199364,
|
| 52 |
"has_nonzero_optimum": true,
|
| 53 |
"meets_target_dropout": true,
|
| 54 |
-
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 3.4047557078301907, \"mean_train_loss\": 2.2476234324276447, \"mean_val_loss\": 5.652379140257835, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 2.873790219426155, \"mean_train_loss\": 2.4646411538124084, \"mean_val_loss\": 5.3384313732385635, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 2.4113941341638565, \"mean_train_loss\": 2.7036017999053, \"mean_val_loss\": 5.114995934069157, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 1.8506078496575356, \"mean_train_loss\": 2.992223806679249, \"mean_val_loss\": 4.842831656336784, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 1.5568379163742065, \"mean_train_loss\": 3.1947626248002052, \"mean_val_loss\": 4.751600541174412, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 1.282993070781231, \"mean_train_loss\": 3.3926073163747787, \"mean_val_loss\": 4.67560038715601, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 1.0215996354818344, \"mean_train_loss\": 3.6294664666056633, \"mean_val_loss\": 4.651066102087498, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.8495340198278427, \"mean_train_loss\": 3.8496963307261467, \"mean_val_loss\": 4.699230350553989, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.7054777815937996, \"mean_train_loss\": 4.063626378774643, \"mean_val_loss\": 4.7691041603684425, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.57613954693079, \"mean_train_loss\": 4.328054502606392, \"mean_val_loss\": 4.904194049537182, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.45122554898262024, \"mean_train_loss\": 4.668859913945198, \"mean_val_loss\": 5.120085462927818, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.3151302859187126, \"mean_train_loss\": 5.189247876405716, \"mean_val_loss\": 5.504378162324429, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.11999441683292389, \"mean_train_loss\": 6.50734506547451, \"mean_val_loss\": 6.627339482307434, \"n\": 1, \"std_val_loss\": 0.0}]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"run_mode": "screen_static",
|
|
@@ -78,7 +186,88 @@
|
|
| 78 |
"max_dropout_minus_best": 1.638900026679039,
|
| 79 |
"has_nonzero_optimum": true,
|
| 80 |
"meets_target_dropout": true,
|
| 81 |
-
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 6.568858440965414, \"mean_train_loss\": 0.7200149409472942, \"mean_val_loss\": 7.288873381912708, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 5.983665386214852, \"mean_train_loss\": 0.9117673244327307, \"mean_val_loss\": 6.895432710647583, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 5.264719225466251, \"mean_train_loss\": 1.2327042073011398, \"mean_val_loss\": 6.497423432767391, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 4.221366349607706, \"mean_train_loss\": 1.6818840466439724, \"mean_val_loss\": 5.9032503962516785, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 3.635788831859827, \"mean_train_loss\": 1.9904268346726894, \"mean_val_loss\": 5.6262156665325165, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 2.9111227244138718, \"mean_train_loss\": 2.416015110909939, \"mean_val_loss\": 5.327137835323811, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 2.17129834741354, \"mean_train_loss\": 2.8952616825699806, \"mean_val_loss\": 5.0665600299835205, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 1.7016963437199593, \"mean_train_loss\": 3.2796813175082207, \"mean_val_loss\": 4.98137766122818, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 1.3544480353593826, \"mean_train_loss\": 3.629131570458412, \"mean_val_loss\": 4.983579605817795, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 1.0571283996105194, \"mean_train_loss\": 4.0092809200286865, \"mean_val_loss\": 5.066409319639206, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.8311320170760155, \"mean_train_loss\": 4.435714960098267, \"mean_val_loss\": 5.266846977174282, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.5717534348368645, \"mean_train_loss\": 5.014922052621841, \"mean_val_loss\": 5.586675487458706, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.25871309638023376, \"mean_train_loss\": 6.361564591526985, \"mean_val_loss\": 6.620277687907219, \"n\": 1, \"std_val_loss\": 0.0}]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"run_mode": "screen_static",
|
|
@@ -105,6 +294,114 @@
|
|
| 105 |
"max_dropout_minus_best": 1.338316097855568,
|
| 106 |
"has_nonzero_optimum": true,
|
| 107 |
"meets_target_dropout": true,
|
| 108 |
-
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 7.9225469790399075, \"mean_train_loss\": 0.2792285941541195, \"mean_val_loss\": 8.201775573194027, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 7.81083921296522, \"mean_train_loss\": 0.2605770383961499, \"mean_val_loss\": 8.07141625136137, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 7.690946782939136, \"mean_train_loss\": 0.26725416351109743, \"mean_val_loss\": 7.9582009464502335, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 7.249795647338033, \"mean_train_loss\": 0.338621674105525, \"mean_val_loss\": 7.588417321443558, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 6.785148191265762, \"mean_train_loss\": 0.49970753211528063, \"mean_val_loss\": 7.2848557233810425, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 6.015826283022761, \"mean_train_loss\": 0.8224823493510485, \"mean_val_loss\": 6.83830863237381, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 4.851862885057926, \"mean_train_loss\": 1.391046978533268, \"mean_val_loss\": 6.242909863591194, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 3.791083261370659, \"mean_train_loss\": 2.053295038640499, \"mean_val_loss\": 5.844378300011158, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 2.913642890751362, \"mean_train_loss\": 2.678894154727459, \"mean_val_loss\": 5.592537045478821, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 2.1870318800210953, \"mean_train_loss\": 3.3185144886374474, \"mean_val_loss\": 5.505546368658543, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 1.656675435602665, \"mean_train_loss\": 3.9233425855636597, \"mean_val_loss\": 5.580018021166325, \"n\": 1, \"std_val_loss\": 0.0}, 
{\"dropout\": 0.8, \"mean_generalization_gap\": 1.114737719297409, \"mean_train_loss\": 4.725207552313805, \"mean_val_loss\": 5.839945271611214, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.48853396624326706, \"mean_train_loss\": 6.3553285002708435, \"mean_val_loss\": 6.843862466514111, \"n\": 1, \"std_val_loss\": 0.0}]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
}
|
| 110 |
]
|
|
|
|
| 19 |
"best_nonzero_dropout": 0.14,
|
| 20 |
"best_nonzero_val_loss": 4.427024222910404,
|
| 21 |
"zero_minus_best_nonzero": 0.19816528260707855,
|
| 22 |
+
"max_dropout": 0.9,
|
| 23 |
+
"max_dropout_val_loss": 6.725677810609341,
|
| 24 |
+
"max_dropout_minus_best": 2.2986535876989365,
|
| 25 |
"has_nonzero_optimum": true,
|
| 26 |
"meets_target_dropout": true,
|
| 27 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 1.2505649104714394, \"mean_train_loss\": 3.3746245950460434, \"mean_val_loss\": 4.625189505517483, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 1.0588822290301323, \"mean_train_loss\": 3.4837590381503105, \"mean_val_loss\": 4.542641267180443, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.9152051359415054, \"mean_train_loss\": 3.57197318226099, \"mean_val_loss\": 4.487178318202496, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.8277053758502007, \"mean_train_loss\": 3.6279477402567863, \"mean_val_loss\": 4.455653116106987, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.74983249604702, \"mean_train_loss\": 3.686768524348736, \"mean_val_loss\": 4.436601020395756, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.6508964970707893, \"mean_train_loss\": 3.776127725839615, \"mean_val_loss\": 4.427024222910404, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.5422321110963821, \"mean_train_loss\": 3.9085786044597626, \"mean_val_loss\": 4.450810715556145, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.4548688307404518, \"mean_train_loss\": 4.04004543274641, \"mean_val_loss\": 4.494914263486862, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.390671081840992, \"mean_train_loss\": 4.1651866137981415, \"mean_val_loss\": 4.5558576956391335, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.3157801032066345, \"mean_train_loss\": 4.337527960538864, \"mean_val_loss\": 4.653308063745499, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.26150281727313995, \"mean_train_loss\": 4.549529016017914, \"mean_val_loss\": 4.811031833291054, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.2040063962340355, \"mean_train_loss\": 4.847206577658653, \"mean_val_loss\": 5.051212973892689, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.14004798233509064, \"mean_train_loss\": 5.27906721830368, \"mean_val_loss\": 5.419115200638771, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.028456024825572968, \"mean_train_loss\": 6.697221785783768, \"mean_val_loss\": 6.725677810609341, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"run_mode": "screen_static",
|
| 31 |
+
"token_limit": 2000000,
|
| 32 |
+
"model_name": "L12_H8_D320",
|
| 33 |
+
"n_layer": 12,
|
| 34 |
+
"n_head": 8,
|
| 35 |
+
"n_embd": 320,
|
| 36 |
+
"parameters": 17367040,
|
| 37 |
+
"n": 1,
|
| 38 |
+
"best_dropout": 0.14,
|
| 39 |
+
"best_val_loss": 4.508757032454014,
|
| 40 |
+
"best_val_std": 0.0,
|
| 41 |
+
"plateau_start_dropout": 0.05,
|
| 42 |
+
"plateau_end_dropout": 0.14,
|
| 43 |
+
"plateau_delta": 0.01,
|
| 44 |
+
"zero_dropout_val_loss": 4.59531170129776,
|
| 45 |
+
"zero_minus_best": 0.08655466884374619,
|
| 46 |
+
"best_nonzero_dropout": 0.14,
|
| 47 |
+
"best_nonzero_val_loss": 4.508757032454014,
|
| 48 |
+
"zero_minus_best_nonzero": 0.08655466884374619,
|
| 49 |
+
"max_dropout": 0.9,
|
| 50 |
+
"max_dropout_val_loss": 6.741106614470482,
|
| 51 |
+
"max_dropout_minus_best": 2.232349582016468,
|
| 52 |
+
"has_nonzero_optimum": true,
|
| 53 |
+
"meets_target_dropout": true,
|
| 54 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 0.8792819976806641, \"mean_train_loss\": 3.716029703617096, \"mean_val_loss\": 4.59531170129776, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 0.7727920487523079, \"mean_train_loss\": 3.7766708433628082, \"mean_val_loss\": 4.549462892115116, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.658709242939949, \"mean_train_loss\": 3.8593605384230614, \"mean_val_loss\": 4.51806978136301, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.5811423286795616, \"mean_train_loss\": 3.9309239983558655, \"mean_val_loss\": 4.512066327035427, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.5585346668958664, \"mean_train_loss\": 3.9626980274915695, \"mean_val_loss\": 4.521232694387436, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.486968994140625, \"mean_train_loss\": 4.021788038313389, \"mean_val_loss\": 4.508757032454014, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.4391852915287018, \"mean_train_loss\": 4.107810087502003, \"mean_val_loss\": 4.5469953790307045, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.35860010236501694, \"mean_train_loss\": 4.248827308416367, \"mean_val_loss\": 4.6074274107813835, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.30667147785425186, \"mean_train_loss\": 4.388647809624672, \"mean_val_loss\": 4.695319287478924, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.2494005709886551, \"mean_train_loss\": 4.564960986375809, \"mean_val_loss\": 4.814361557364464, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.21227668970823288, \"mean_train_loss\": 4.756197020411491, \"mean_val_loss\": 4.968473710119724, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.17023956030607224, \"mean_train_loss\": 5.009785428643227, \"mean_val_loss\": 5.180024988949299, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.11836093664169312, \"mean_train_loss\": 5.402336552739143, \"mean_val_loss\": 5.5206974893808365, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.020013347268104553, \"mean_train_loss\": 6.721093267202377, \"mean_val_loss\": 6.741106614470482, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"run_mode": "screen_static",
|
|
|
|
| 78 |
"max_dropout_minus_best": 1.9762733802199364,
|
| 79 |
"has_nonzero_optimum": true,
|
| 80 |
"meets_target_dropout": true,
|
| 81 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 3.4047557078301907, \"mean_train_loss\": 2.2476234324276447, \"mean_val_loss\": 5.652379140257835, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 2.873790219426155, \"mean_train_loss\": 2.4646411538124084, \"mean_val_loss\": 5.3384313732385635, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 2.4113941341638565, \"mean_train_loss\": 2.7036017999053, \"mean_val_loss\": 5.114995934069157, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 2.0462540313601494, \"mean_train_loss\": 2.8963891118764877, \"mean_val_loss\": 4.942643143236637, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 1.8506078496575356, \"mean_train_loss\": 2.992223806679249, \"mean_val_loss\": 4.842831656336784, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 1.5568379163742065, \"mean_train_loss\": 3.1947626248002052, \"mean_val_loss\": 4.751600541174412, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 1.282993070781231, \"mean_train_loss\": 3.3926073163747787, \"mean_val_loss\": 4.67560038715601, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 1.0215996354818344, \"mean_train_loss\": 3.6294664666056633, \"mean_val_loss\": 4.651066102087498, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.8495340198278427, \"mean_train_loss\": 3.8496963307261467, \"mean_val_loss\": 4.699230350553989, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.7054777815937996, \"mean_train_loss\": 4.063626378774643, \"mean_val_loss\": 4.7691041603684425, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.57613954693079, \"mean_train_loss\": 4.328054502606392, \"mean_val_loss\": 4.904194049537182, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.45122554898262024, \"mean_train_loss\": 4.668859913945198, \"mean_val_loss\": 5.120085462927818, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.3151302859187126, \"mean_train_loss\": 5.189247876405716, \"mean_val_loss\": 5.504378162324429, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.11999441683292389, \"mean_train_loss\": 6.50734506547451, \"mean_val_loss\": 6.627339482307434, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"run_mode": "screen_static",
|
| 85 |
+
"token_limit": 1000000,
|
| 86 |
+
"model_name": "L12_H8_D320",
|
| 87 |
+
"n_layer": 12,
|
| 88 |
+
"n_head": 8,
|
| 89 |
+
"n_embd": 320,
|
| 90 |
+
"parameters": 17367040,
|
| 91 |
+
"n": 1,
|
| 92 |
+
"best_dropout": 0.2,
|
| 93 |
+
"best_val_loss": 4.687077932059765,
|
| 94 |
+
"best_val_std": 0.0,
|
| 95 |
+
"plateau_start_dropout": 0.2,
|
| 96 |
+
"plateau_end_dropout": 0.2,
|
| 97 |
+
"plateau_delta": 0.01,
|
| 98 |
+
"zero_dropout_val_loss": 5.327972687780857,
|
| 99 |
+
"zero_minus_best": 0.6408947557210922,
|
| 100 |
+
"best_nonzero_dropout": 0.2,
|
| 101 |
+
"best_nonzero_val_loss": 4.687077932059765,
|
| 102 |
+
"zero_minus_best_nonzero": 0.6408947557210922,
|
| 103 |
+
"max_dropout": 0.9,
|
| 104 |
+
"max_dropout_val_loss": 6.74848935008049,
|
| 105 |
+
"max_dropout_minus_best": 2.0614114180207253,
|
| 106 |
+
"has_nonzero_optimum": true,
|
| 107 |
+
"meets_target_dropout": true,
|
| 108 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 2.441593125462532, \"mean_train_loss\": 2.886379562318325, \"mean_val_loss\": 5.327972687780857, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 2.035951793193817, \"mean_train_loss\": 3.0628975853323936, \"mean_val_loss\": 5.098849378526211, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 1.634356364607811, \"mean_train_loss\": 3.268335275352001, \"mean_val_loss\": 4.902691639959812, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 1.3981640711426735, \"mean_train_loss\": 3.3916852474212646, \"mean_val_loss\": 4.789849318563938, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 1.285424992442131, \"mean_train_loss\": 3.4663245379924774, \"mean_val_loss\": 4.7517495304346085, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 1.1256954967975616, \"mean_train_loss\": 3.591317318379879, \"mean_val_loss\": 4.717012815177441, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.9711186662316322, \"mean_train_loss\": 3.7159592658281326, \"mean_val_loss\": 4.687077932059765, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.7806104272603989, \"mean_train_loss\": 3.9408339336514473, \"mean_val_loss\": 4.721444360911846, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.6547741517424583, \"mean_train_loss\": 4.1257737800478935, \"mean_val_loss\": 4.780547931790352, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.557686097919941, \"mean_train_loss\": 4.341994017362595, \"mean_val_loss\": 4.8996801152825356, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.47620102018117905, \"mean_train_loss\": 4.583218589425087, \"mean_val_loss\": 5.059419609606266, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.39166851341724396, \"mean_train_loss\": 4.866017505526543, \"mean_val_loss\": 5.257686018943787, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.29053743183612823, \"mean_train_loss\": 5.279122695326805, \"mean_val_loss\": 5.569660127162933, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.11938704550266266, \"mean_train_loss\": 6.6291023045778275, \"mean_val_loss\": 6.74848935008049, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"run_mode": "screen_static",
|
| 112 |
+
"token_limit": 1000000,
|
| 113 |
+
"model_name": "L8_H8_D256",
|
| 114 |
+
"n_layer": 8,
|
| 115 |
+
"n_head": 8,
|
| 116 |
+
"n_embd": 256,
|
| 117 |
+
"parameters": 8388608,
|
| 118 |
+
"n": 1,
|
| 119 |
+
"best_dropout": 0.14,
|
| 120 |
+
"best_val_loss": 4.776347942650318,
|
| 121 |
+
"best_val_std": 0.0,
|
| 122 |
+
"plateau_start_dropout": 0.14,
|
| 123 |
+
"plateau_end_dropout": 0.14,
|
| 124 |
+
"plateau_delta": 0.01,
|
| 125 |
+
"zero_dropout_val_loss": 5.020985089242458,
|
| 126 |
+
"zero_minus_best": 0.2446371465921402,
|
| 127 |
+
"best_nonzero_dropout": 0.14,
|
| 128 |
+
"best_nonzero_val_loss": 4.776347942650318,
|
| 129 |
+
"zero_minus_best_nonzero": 0.2446371465921402,
|
| 130 |
+
"max_dropout": 0.9,
|
| 131 |
+
"max_dropout_val_loss": 7.103986039757729,
|
| 132 |
+
"max_dropout_minus_best": 2.3276380971074104,
|
| 133 |
+
"has_nonzero_optimum": true,
|
| 134 |
+
"meets_target_dropout": true,
|
| 135 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 1.4288483187556267, \"mean_train_loss\": 3.5921367704868317, \"mean_val_loss\": 5.020985089242458, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 1.2022551372647285, \"mean_train_loss\": 3.692635416984558, \"mean_val_loss\": 4.894890554249287, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 1.03511231392622, \"mean_train_loss\": 3.774454675614834, \"mean_val_loss\": 4.809566989541054, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.941984124481678, \"mean_train_loss\": 3.8572388663887978, \"mean_val_loss\": 4.799222990870476, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.8684338107705116, \"mean_train_loss\": 3.9261686205863953, \"mean_val_loss\": 4.794602431356907, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.7863189205527306, \"mean_train_loss\": 3.9900290220975876, \"mean_val_loss\": 4.776347942650318, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.6707855090498924, \"mean_train_loss\": 4.130261890590191, \"mean_val_loss\": 4.801047399640083, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.5717819258570671, \"mean_train_loss\": 4.293951943516731, \"mean_val_loss\": 4.865733869373798, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.4866805672645569, \"mean_train_loss\": 4.475886330008507, \"mean_val_loss\": 4.962566897273064, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.42112673074007034, \"mean_train_loss\": 4.666878193616867, \"mean_val_loss\": 5.088004924356937, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.3723558411002159, \"mean_train_loss\": 4.864694103598595, \"mean_val_loss\": 5.237049944698811, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.3116709440946579, \"mean_train_loss\": 5.129916787147522, \"mean_val_loss\": 5.44158773124218, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.23021624982357025, \"mean_train_loss\": 5.578491851687431, \"mean_val_loss\": 5.808708101511002, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.10863403975963593, \"mean_train_loss\": 6.995351999998093, \"mean_val_loss\": 7.103986039757729, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"run_mode": "screen_static",
|
| 139 |
+
"token_limit": 500000,
|
| 140 |
+
"model_name": "L12_H8_D320",
|
| 141 |
+
"n_layer": 12,
|
| 142 |
+
"n_head": 8,
|
| 143 |
+
"n_embd": 320,
|
| 144 |
+
"parameters": 17367040,
|
| 145 |
+
"n": 1,
|
| 146 |
+
"best_dropout": 0.4,
|
| 147 |
+
"best_val_loss": 4.979132980108261,
|
| 148 |
+
"best_val_std": 0.0,
|
| 149 |
+
"plateau_start_dropout": 0.3,
|
| 150 |
+
"plateau_end_dropout": 0.4,
|
| 151 |
+
"plateau_delta": 0.01,
|
| 152 |
+
"zero_dropout_val_loss": 6.882166460156441,
|
| 153 |
+
"zero_minus_best": 1.9030334800481796,
|
| 154 |
+
"best_nonzero_dropout": 0.4,
|
| 155 |
+
"best_nonzero_val_loss": 4.979132980108261,
|
| 156 |
+
"zero_minus_best_nonzero": 1.9030334800481796,
|
| 157 |
+
"max_dropout": 0.9,
|
| 158 |
+
"max_dropout_val_loss": 6.766090348362923,
|
| 159 |
+
"max_dropout_minus_best": 1.7869573682546616,
|
| 160 |
+
"has_nonzero_optimum": true,
|
| 161 |
+
"meets_target_dropout": true,
|
| 162 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 5.444314293563366, \"mean_train_loss\": 1.4378521665930748, \"mean_val_loss\": 6.882166460156441, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 4.635581977665424, \"mean_train_loss\": 1.7486785426735878, \"mean_val_loss\": 6.384260520339012, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 3.839024405926466, \"mean_train_loss\": 2.0567418597638607, \"mean_val_loss\": 5.895766265690327, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 3.225618854165077, \"mean_train_loss\": 2.371535360813141, \"mean_val_loss\": 5.597154214978218, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 2.958456054329872, \"mean_train_loss\": 2.4893965795636177, \"mean_val_loss\": 5.44785263389349, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 2.512647695839405, \"mean_train_loss\": 2.7468534484505653, \"mean_val_loss\": 5.25950114428997, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 2.0600279718637466, \"mean_train_loss\": 3.0360172167420387, \"mean_val_loss\": 5.096045188605785, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 1.5742274522781372, \"mean_train_loss\": 3.40982835739851, \"mean_val_loss\": 4.984055809676647, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 1.2433720752596855, \"mean_train_loss\": 3.7357609048485756, \"mean_val_loss\": 4.979132980108261, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 1.0286902412772179, \"mean_train_loss\": 4.029533125460148, \"mean_val_loss\": 5.058223366737366, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.8609125763177872, \"mean_train_loss\": 4.322844922542572, \"mean_val_loss\": 5.183757498860359, \"n\": 1, \"std_val_loss\": 0.0}, 
{\"dropout\": 0.7, \"mean_generalization_gap\": 0.703565426170826, \"mean_train_loss\": 4.680620342493057, \"mean_val_loss\": 5.384185768663883, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.530634418129921, \"mean_train_loss\": 5.157206907868385, \"mean_val_loss\": 5.687841325998306, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.27382446825504303, \"mean_train_loss\": 6.49226588010788, \"mean_val_loss\": 6.766090348362923, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 163 |
},
|
| 164 |
{
|
| 165 |
"run_mode": "screen_static",
|
|
|
|
| 186 |
"max_dropout_minus_best": 1.638900026679039,
|
| 187 |
"has_nonzero_optimum": true,
|
| 188 |
"meets_target_dropout": true,
|
| 189 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 6.568858440965414, \"mean_train_loss\": 0.7200149409472942, \"mean_val_loss\": 7.288873381912708, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 5.983665386214852, \"mean_train_loss\": 0.9117673244327307, \"mean_val_loss\": 6.895432710647583, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 5.264719225466251, \"mean_train_loss\": 1.2327042073011398, \"mean_val_loss\": 6.497423432767391, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 4.569968264549971, \"mean_train_loss\": 1.5379231162369251, \"mean_val_loss\": 6.107891380786896, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 4.221366349607706, \"mean_train_loss\": 1.6818840466439724, \"mean_val_loss\": 5.9032503962516785, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 3.635788831859827, \"mean_train_loss\": 1.9904268346726894, \"mean_val_loss\": 5.6262156665325165, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 2.9111227244138718, \"mean_train_loss\": 2.416015110909939, \"mean_val_loss\": 5.327137835323811, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 2.17129834741354, \"mean_train_loss\": 2.8952616825699806, \"mean_val_loss\": 5.0665600299835205, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 1.7016963437199593, \"mean_train_loss\": 3.2796813175082207, \"mean_val_loss\": 4.98137766122818, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 1.3544480353593826, \"mean_train_loss\": 3.629131570458412, \"mean_val_loss\": 4.983579605817795, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 1.0571283996105194, \"mean_train_loss\": 4.0092809200286865, \"mean_val_loss\": 5.066409319639206, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.8311320170760155, \"mean_train_loss\": 4.435714960098267, \"mean_val_loss\": 5.266846977174282, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.5717534348368645, \"mean_train_loss\": 5.014922052621841, \"mean_val_loss\": 5.586675487458706, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.25871309638023376, \"mean_train_loss\": 6.361564591526985, \"mean_val_loss\": 6.620277687907219, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"run_mode": "screen_static",
|
| 193 |
+
"token_limit": 500000,
|
| 194 |
+
"model_name": "L8_H8_D256",
|
| 195 |
+
"n_layer": 8,
|
| 196 |
+
"n_head": 8,
|
| 197 |
+
"n_embd": 256,
|
| 198 |
+
"parameters": 8388608,
|
| 199 |
+
"n": 1,
|
| 200 |
+
"best_dropout": 0.2,
|
| 201 |
+
"best_val_loss": 5.02164863795042,
|
| 202 |
+
"best_val_std": 0.0,
|
| 203 |
+
"plateau_start_dropout": 0.2,
|
| 204 |
+
"plateau_end_dropout": 0.2,
|
| 205 |
+
"plateau_delta": 0.01,
|
| 206 |
+
"zero_dropout_val_loss": 6.079639628529549,
|
| 207 |
+
"zero_minus_best": 1.0579909905791283,
|
| 208 |
+
"best_nonzero_dropout": 0.2,
|
| 209 |
+
"best_nonzero_val_loss": 5.02164863795042,
|
| 210 |
+
"zero_minus_best_nonzero": 1.0579909905791283,
|
| 211 |
+
"max_dropout": 0.9,
|
| 212 |
+
"max_dropout_val_loss": 7.112152203917503,
|
| 213 |
+
"max_dropout_minus_best": 2.090503565967083,
|
| 214 |
+
"has_nonzero_optimum": true,
|
| 215 |
+
"meets_target_dropout": true,
|
| 216 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 3.502076916396618, \"mean_train_loss\": 2.5775627121329308, \"mean_val_loss\": 6.079639628529549, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 2.8574068769812584, \"mean_train_loss\": 2.8127700313925743, \"mean_val_loss\": 5.670176908373833, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 2.324491947889328, \"mean_train_loss\": 3.0387847647070885, \"mean_val_loss\": 5.3632767125964165, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 1.9923998713493347, \"mean_train_loss\": 3.2260689064860344, \"mean_val_loss\": 5.218468777835369, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 1.8499257937073708, \"mean_train_loss\": 3.3107700124382973, \"mean_val_loss\": 5.160695806145668, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 1.5897220373153687, \"mean_train_loss\": 3.4774143397808075, \"mean_val_loss\": 5.067136377096176, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 1.3237614631652832, \"mean_train_loss\": 3.697887174785137, \"mean_val_loss\": 5.02164863795042, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 1.0655959695577621, \"mean_train_loss\": 3.9667534679174423, \"mean_val_loss\": 5.0323494374752045, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.8976302519440651, \"mean_train_loss\": 4.194036483764648, \"mean_val_loss\": 5.0916667357087135, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.7811932787299156, \"mean_train_loss\": 4.428740322589874, \"mean_val_loss\": 5.20993360131979, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.6703063324093819, \"mean_train_loss\": 4.666903376579285, \"mean_val_loss\": 5.3372097089886665, \"n\": 1, 
\"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.5750636905431747, \"mean_train_loss\": 4.97511188685894, \"mean_val_loss\": 5.550175577402115, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.4504067897796631, \"mean_train_loss\": 5.43462572991848, \"mean_val_loss\": 5.885032519698143, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.24193444848060608, \"mean_train_loss\": 6.870217755436897, \"mean_val_loss\": 7.112152203917503, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"run_mode": "screen_static",
|
| 220 |
+
"token_limit": 250000,
|
| 221 |
+
"model_name": "L8_H8_D256",
|
| 222 |
+
"n_layer": 8,
|
| 223 |
+
"n_head": 8,
|
| 224 |
+
"n_embd": 256,
|
| 225 |
+
"parameters": 8388608,
|
| 226 |
+
"n": 1,
|
| 227 |
+
"best_dropout": 0.4,
|
| 228 |
+
"best_val_loss": 5.417454726994038,
|
| 229 |
+
"best_val_std": 0.0,
|
| 230 |
+
"plateau_start_dropout": 0.4,
|
| 231 |
+
"plateau_end_dropout": 0.4,
|
| 232 |
+
"plateau_delta": 0.01,
|
| 233 |
+
"zero_dropout_val_loss": 7.917540371417999,
|
| 234 |
+
"zero_minus_best": 2.5000856444239616,
|
| 235 |
+
"best_nonzero_dropout": 0.4,
|
| 236 |
+
"best_nonzero_val_loss": 5.417454726994038,
|
| 237 |
+
"zero_minus_best_nonzero": 2.5000856444239616,
|
| 238 |
+
"max_dropout": 0.9,
|
| 239 |
+
"max_dropout_val_loss": 7.160110808908939,
|
| 240 |
+
"max_dropout_minus_best": 1.7426560819149017,
|
| 241 |
+
"has_nonzero_optimum": true,
|
| 242 |
+
"meets_target_dropout": true,
|
| 243 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 6.9651469346135855, \"mean_train_loss\": 0.9523934368044138, \"mean_val_loss\": 7.917540371417999, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 5.999898966401815, \"mean_train_loss\": 1.2368882782757282, \"mean_val_loss\": 7.236787244677544, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 4.966615006327629, \"mean_train_loss\": 1.6339460164308548, \"mean_val_loss\": 6.600561022758484, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 4.347427327185869, \"mean_train_loss\": 1.9089897610247135, \"mean_val_loss\": 6.256417088210583, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 4.005591854453087, \"mean_train_loss\": 2.0858395472168922, \"mean_val_loss\": 6.091431401669979, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 3.420706979930401, \"mean_train_loss\": 2.409040831029415, \"mean_val_loss\": 5.829747810959816, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 2.8160821720957756, \"mean_train_loss\": 2.780476927757263, \"mean_val_loss\": 5.596559099853039, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 2.1707596853375435, \"mean_train_loss\": 3.2711103558540344, \"mean_val_loss\": 5.441870041191578, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 1.7763118371367455, \"mean_train_loss\": 3.641142889857292, \"mean_val_loss\": 5.417454726994038, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 1.472730539739132, \"mean_train_loss\": 3.9926967695355415, \"mean_val_loss\": 5.4654273092746735, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 1.2663473039865494, \"mean_train_loss\": 4.320340022444725, \"mean_val_loss\": 5.586687326431274, \"n\": 1, \"std_val_loss\": 
0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 1.0747108310461044, \"mean_train_loss\": 4.705530673265457, \"mean_val_loss\": 5.780241504311562, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.8436890691518784, \"mean_train_loss\": 5.24668562412262, \"mean_val_loss\": 6.090374693274498, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.44749947637319565, \"mean_train_loss\": 6.712611332535744, \"mean_val_loss\": 7.160110808908939, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 244 |
+
},
|
| 245 |
+
{
|
| 246 |
+
"run_mode": "screen_static",
|
| 247 |
+
"token_limit": 250000,
|
| 248 |
+
"model_name": "L12_H8_D320",
|
| 249 |
+
"n_layer": 12,
|
| 250 |
+
"n_head": 8,
|
| 251 |
+
"n_embd": 320,
|
| 252 |
+
"parameters": 17367040,
|
| 253 |
+
"n": 1,
|
| 254 |
+
"best_dropout": 0.5,
|
| 255 |
+
"best_val_loss": 5.438360869884491,
|
| 256 |
+
"best_val_std": 0.0,
|
| 257 |
+
"plateau_start_dropout": 0.5,
|
| 258 |
+
"plateau_end_dropout": 0.5,
|
| 259 |
+
"plateau_delta": 0.01,
|
| 260 |
+
"zero_dropout_val_loss": 8.15590063482523,
|
| 261 |
+
"zero_minus_best": 2.7175397649407387,
|
| 262 |
+
"best_nonzero_dropout": 0.5,
|
| 263 |
+
"best_nonzero_val_loss": 5.438360869884491,
|
| 264 |
+
"zero_minus_best_nonzero": 2.7175397649407387,
|
| 265 |
+
"max_dropout": 0.9,
|
| 266 |
+
"max_dropout_val_loss": 6.905720971524715,
|
| 267 |
+
"max_dropout_minus_best": 1.4673601016402245,
|
| 268 |
+
"has_nonzero_optimum": true,
|
| 269 |
+
"meets_target_dropout": true,
|
| 270 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 7.7617882722988725, \"mean_train_loss\": 0.3941123625263572, \"mean_val_loss\": 8.15590063482523, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 7.574556526727974, \"mean_train_loss\": 0.4179071066901088, \"mean_val_loss\": 7.992463633418083, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 6.955689197406173, \"mean_train_loss\": 0.5652239341288805, \"mean_val_loss\": 7.520913131535053, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 6.330640483647585, \"mean_train_loss\": 0.7794149778783321, \"mean_val_loss\": 7.110055461525917, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 6.015923976898193, \"mean_train_loss\": 0.9396447688341141, \"mean_val_loss\": 6.955568745732307, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 5.296522863209248, \"mean_train_loss\": 1.2607302814722061, \"mean_val_loss\": 6.557253144681454, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 4.479952562600374, \"mean_train_loss\": 1.6681972332298756, \"mean_val_loss\": 6.14814979583025, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 3.3666396513581276, \"mean_train_loss\": 2.34628177434206, \"mean_val_loss\": 5.712921425700188, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 2.6368888467550278, \"mean_train_loss\": 2.881612576544285, \"mean_val_loss\": 5.518501423299313, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 2.06633959710598, \"mean_train_loss\": 3.372021272778511, \"mean_val_loss\": 5.438360869884491, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 1.676231786608696, \"mean_train_loss\": 3.8244777768850327, \"mean_val_loss\": 5.500709563493729, \"n\": 1, \"std_val_loss\": 0.0}, 
{\"dropout\": 0.7, \"mean_generalization_gap\": 1.338156834244728, \"mean_train_loss\": 4.315010622143745, \"mean_val_loss\": 5.6531674563884735, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 1.018489956855774, \"mean_train_loss\": 4.907547950744629, \"mean_val_loss\": 5.926037907600403, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.5023725405335426, \"mean_train_loss\": 6.403348430991173, \"mean_val_loss\": 6.905720971524715, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 271 |
},
|
| 272 |
{
|
| 273 |
"run_mode": "screen_static",
|
|
|
|
| 294 |
"max_dropout_minus_best": 1.338316097855568,
|
| 295 |
"has_nonzero_optimum": true,
|
| 296 |
"meets_target_dropout": true,
|
| 297 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 7.9225469790399075, \"mean_train_loss\": 0.2792285941541195, \"mean_val_loss\": 8.201775573194027, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 7.81083921296522, \"mean_train_loss\": 0.2605770383961499, \"mean_val_loss\": 8.07141625136137, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 7.690946782939136, \"mean_train_loss\": 0.26725416351109743, \"mean_val_loss\": 7.9582009464502335, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 7.46396254748106, \"mean_train_loss\": 0.29682786762714386, \"mean_val_loss\": 7.760790415108204, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 7.249795647338033, \"mean_train_loss\": 0.338621674105525, \"mean_val_loss\": 7.588417321443558, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 6.785148191265762, \"mean_train_loss\": 0.49970753211528063, \"mean_val_loss\": 7.2848557233810425, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 6.015826283022761, \"mean_train_loss\": 0.8224823493510485, \"mean_val_loss\": 6.83830863237381, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 4.851862885057926, \"mean_train_loss\": 1.391046978533268, \"mean_val_loss\": 6.242909863591194, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 3.791083261370659, \"mean_train_loss\": 2.053295038640499, \"mean_val_loss\": 5.844378300011158, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 2.913642890751362, \"mean_train_loss\": 2.678894154727459, \"mean_val_loss\": 5.592537045478821, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 2.1870318800210953, \"mean_train_loss\": 3.3185144886374474, \"mean_val_loss\": 5.505546368658543, \"n\": 1, \"std_val_loss\": 0.0}, 
{\"dropout\": 0.7, \"mean_generalization_gap\": 1.656675435602665, \"mean_train_loss\": 3.9233425855636597, \"mean_val_loss\": 5.580018021166325, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 1.114737719297409, \"mean_train_loss\": 4.725207552313805, \"mean_val_loss\": 5.839945271611214, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.48853396624326706, \"mean_train_loss\": 6.3553285002708435, \"mean_val_loss\": 6.843862466514111, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"run_mode": "screen_static",
|
| 301 |
+
"token_limit": 4000000,
|
| 302 |
+
"model_name": "L16_H8_D384",
|
| 303 |
+
"n_layer": 16,
|
| 304 |
+
"n_head": 8,
|
| 305 |
+
"n_embd": 384,
|
| 306 |
+
"parameters": 31457280,
|
| 307 |
+
"n": 1,
|
| 308 |
+
"best_dropout": 0.02,
|
| 309 |
+
"best_val_loss": 4.294691443443298,
|
| 310 |
+
"best_val_std": 0.0,
|
| 311 |
+
"plateau_start_dropout": 0.02,
|
| 312 |
+
"plateau_end_dropout": 0.08,
|
| 313 |
+
"plateau_delta": 0.01,
|
| 314 |
+
"zero_dropout_val_loss": 4.324729532003403,
|
| 315 |
+
"zero_minus_best": 0.03003808856010437,
|
| 316 |
+
"best_nonzero_dropout": 0.02,
|
| 317 |
+
"best_nonzero_val_loss": 4.294691443443298,
|
| 318 |
+
"zero_minus_best_nonzero": 0.03003808856010437,
|
| 319 |
+
"max_dropout": 0.9,
|
| 320 |
+
"max_dropout_val_loss": 6.623879760503769,
|
| 321 |
+
"max_dropout_minus_best": 2.3291883170604706,
|
| 322 |
+
"has_nonzero_optimum": true,
|
| 323 |
+
"meets_target_dropout": false,
|
| 324 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 0.4830755889415741, \"mean_train_loss\": 3.8416539430618286, \"mean_val_loss\": 4.324729532003403, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 0.44003017246723175, \"mean_train_loss\": 3.8546612709760666, \"mean_val_loss\": 4.294691443443298, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.40492552518844604, \"mean_train_loss\": 3.910273350775242, \"mean_val_loss\": 4.315198875963688, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.3686877265572548, \"mean_train_loss\": 3.9314780607819557, \"mean_val_loss\": 4.3001657873392105, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.3556562587618828, \"mean_train_loss\": 3.9545229598879814, \"mean_val_loss\": 4.310179218649864, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.31573040783405304, \"mean_train_loss\": 4.008677661418915, \"mean_val_loss\": 4.324408069252968, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.2667204812169075, \"mean_train_loss\": 4.085749976336956, \"mean_val_loss\": 4.3524704575538635, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.22910351306200027, \"mean_train_loss\": 4.181676417589188, \"mean_val_loss\": 4.410779930651188, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.20371749252080917, \"mean_train_loss\": 4.293805286288261, \"mean_val_loss\": 4.497522778809071, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.17503493279218674, \"mean_train_loss\": 4.431871071457863, \"mean_val_loss\": 4.60690600425005, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.13883397728204727, \"mean_train_loss\": 4.635675564408302, \"mean_val_loss\": 4.77450954169035, \"n\": 1, 
\"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.11617618799209595, \"mean_train_loss\": 4.896953746676445, \"mean_val_loss\": 5.013129934668541, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.06960213929414749, \"mean_train_loss\": 5.347471252083778, \"mean_val_loss\": 5.417073391377926, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.020191222429275513, \"mean_train_loss\": 6.603688538074493, \"mean_val_loss\": 6.623879760503769, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"run_mode": "screen_static",
|
| 328 |
+
"token_limit": 4000000,
|
| 329 |
+
"model_name": "L12_H8_D320",
|
| 330 |
+
"n_layer": 12,
|
| 331 |
+
"n_head": 8,
|
| 332 |
+
"n_embd": 320,
|
| 333 |
+
"parameters": 17367040,
|
| 334 |
+
"n": 1,
|
| 335 |
+
"best_dropout": 0.02,
|
| 336 |
+
"best_val_loss": 4.387514792382717,
|
| 337 |
+
"best_val_std": 0.0,
|
| 338 |
+
"plateau_start_dropout": 0.02,
|
| 339 |
+
"plateau_end_dropout": 0.02,
|
| 340 |
+
"plateau_delta": 0.01,
|
| 341 |
+
"zero_dropout_val_loss": 4.399908438324928,
|
| 342 |
+
"zero_minus_best": 0.012393645942211151,
|
| 343 |
+
"best_nonzero_dropout": 0.02,
|
| 344 |
+
"best_nonzero_val_loss": 4.387514792382717,
|
| 345 |
+
"zero_minus_best_nonzero": 0.012393645942211151,
|
| 346 |
+
"max_dropout": 0.9,
|
| 347 |
+
"max_dropout_val_loss": 6.7735652178525925,
|
| 348 |
+
"max_dropout_minus_best": 2.3860504254698753,
|
| 349 |
+
"has_nonzero_optimum": true,
|
| 350 |
+
"meets_target_dropout": false,
|
| 351 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 0.3676489070057869, \"mean_train_loss\": 4.032259531319141, \"mean_val_loss\": 4.399908438324928, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 0.35747382044792175, \"mean_train_loss\": 4.030040971934795, \"mean_val_loss\": 4.387514792382717, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.3134579360485077, \"mean_train_loss\": 4.092799432575703, \"mean_val_loss\": 4.40625736862421, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.28922586888074875, \"mean_train_loss\": 4.108970053493977, \"mean_val_loss\": 4.398195922374725, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.27910205721855164, \"mean_train_loss\": 4.135565012693405, \"mean_val_loss\": 4.414667069911957, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.25001612305641174, \"mean_train_loss\": 4.194389328360558, \"mean_val_loss\": 4.444405451416969, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.23343528807163239, \"mean_train_loss\": 4.247993364930153, \"mean_val_loss\": 4.481428653001785, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.19782067835330963, \"mean_train_loss\": 4.347917139530182, \"mean_val_loss\": 4.5457378178834915, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.16299572587013245, \"mean_train_loss\": 4.486644446849823, \"mean_val_loss\": 4.6496401727199554, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.13463184982538223, \"mean_train_loss\": 4.6344869285821915, \"mean_val_loss\": 4.769118778407574, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.12295009940862656, \"mean_train_loss\": 4.809540346264839, \"mean_val_loss\": 4.932490445673466, \"n\": 1, 
\"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.09925898164510727, \"mean_train_loss\": 5.056431487202644, \"mean_val_loss\": 5.155690468847752, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.05897616595029831, \"mean_train_loss\": 5.429492920637131, \"mean_val_loss\": 5.488469086587429, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.01472686231136322, \"mean_train_loss\": 6.758838355541229, \"mean_val_loss\": 6.7735652178525925, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 352 |
+
},
|
| 353 |
+
{
|
| 354 |
+
"run_mode": "screen_static",
|
| 355 |
+
"token_limit": 2000000,
|
| 356 |
+
"model_name": "L8_H8_D256",
|
| 357 |
+
"n_layer": 8,
|
| 358 |
+
"n_head": 8,
|
| 359 |
+
"n_embd": 256,
|
| 360 |
+
"parameters": 8388608,
|
| 361 |
+
"n": 1,
|
| 362 |
+
"best_dropout": 0.08,
|
| 363 |
+
"best_val_loss": 4.623203128576279,
|
| 364 |
+
"best_val_std": 0.0,
|
| 365 |
+
"plateau_start_dropout": 0.05,
|
| 366 |
+
"plateau_end_dropout": 0.08,
|
| 367 |
+
"plateau_delta": 0.01,
|
| 368 |
+
"zero_dropout_val_loss": 4.6497810408473015,
|
| 369 |
+
"zero_minus_best": 0.026577912271022797,
|
| 370 |
+
"best_nonzero_dropout": 0.08,
|
| 371 |
+
"best_nonzero_val_loss": 4.623203128576279,
|
| 372 |
+
"zero_minus_best_nonzero": 0.026577912271022797,
|
| 373 |
+
"max_dropout": 0.9,
|
| 374 |
+
"max_dropout_val_loss": 7.041225396096706,
|
| 375 |
+
"max_dropout_minus_best": 2.4180222675204277,
|
| 376 |
+
"has_nonzero_optimum": true,
|
| 377 |
+
"meets_target_dropout": false,
|
| 378 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 0.5635415837168694, \"mean_train_loss\": 4.086239457130432, \"mean_val_loss\": 4.6497810408473015, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 0.5019041448831558, \"mean_train_loss\": 4.138374790549278, \"mean_val_loss\": 4.640278935432434, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.43557654321193695, \"mean_train_loss\": 4.196284607052803, \"mean_val_loss\": 4.63186115026474, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.40735629945993423, \"mean_train_loss\": 4.2158468291163445, \"mean_val_loss\": 4.623203128576279, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.3851764276623726, \"mean_train_loss\": 4.267480067908764, \"mean_val_loss\": 4.6526564955711365, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.36042413860559464, \"mean_train_loss\": 4.291930258274078, \"mean_val_loss\": 4.652354396879673, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.3055316135287285, \"mean_train_loss\": 4.382655680179596, \"mean_val_loss\": 4.688187293708324, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.26447462290525436, \"mean_train_loss\": 4.514920085668564, \"mean_val_loss\": 4.779394708573818, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.22775156050920486, \"mean_train_loss\": 4.651856943964958, \"mean_val_loss\": 4.879608504474163, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.18726412951946259, \"mean_train_loss\": 4.822874888777733, \"mean_val_loss\": 5.010139018297195, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.16301864385604858, \"mean_train_loss\": 4.99927744269371, \"mean_val_loss\": 5.162296086549759, \"n\": 1, 
\"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.1327260509133339, \"mean_train_loss\": 5.243539854884148, \"mean_val_loss\": 5.3762659057974815, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.09502238035202026, \"mean_train_loss\": 5.64663989841938, \"mean_val_loss\": 5.7416622787714005, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.0319184884428978, \"mean_train_loss\": 7.009306907653809, \"mean_val_loss\": 7.041225396096706, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"run_mode": "screen_static",
|
| 382 |
+
"token_limit": 4000000,
|
| 383 |
+
"model_name": "L8_H8_D256",
|
| 384 |
+
"n_layer": 8,
|
| 385 |
+
"n_head": 8,
|
| 386 |
+
"n_embd": 256,
|
| 387 |
+
"parameters": 8388608,
|
| 388 |
+
"n": 1,
|
| 389 |
+
"best_dropout": 0.0,
|
| 390 |
+
"best_val_loss": 4.513561494648457,
|
| 391 |
+
"best_val_std": 0.0,
|
| 392 |
+
"plateau_start_dropout": 0.0,
|
| 393 |
+
"plateau_end_dropout": 0.0,
|
| 394 |
+
"plateau_delta": 0.01,
|
| 395 |
+
"zero_dropout_val_loss": 4.513561494648457,
|
| 396 |
+
"zero_minus_best": 0.0,
|
| 397 |
+
"best_nonzero_dropout": 0.02,
|
| 398 |
+
"best_nonzero_val_loss": 4.528690077364445,
|
| 399 |
+
"zero_minus_best_nonzero": -0.01512858271598816,
|
| 400 |
+
"max_dropout": 0.9,
|
| 401 |
+
"max_dropout_val_loss": 7.056577272713184,
|
| 402 |
+
"max_dropout_minus_best": 2.543015778064728,
|
| 403 |
+
"has_nonzero_optimum": false,
|
| 404 |
+
"meets_target_dropout": false,
|
| 405 |
+
"curve_json": "[{\"dropout\": 0.0, \"mean_generalization_gap\": 0.2620946392416954, \"mean_train_loss\": 4.251466855406761, \"mean_val_loss\": 4.513561494648457, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.02, \"mean_generalization_gap\": 0.2527272030711174, \"mean_train_loss\": 4.275962874293327, \"mean_val_loss\": 4.528690077364445, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.05, \"mean_generalization_gap\": 0.23059415817260742, \"mean_train_loss\": 4.309461995959282, \"mean_val_loss\": 4.540056154131889, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.08, \"mean_generalization_gap\": 0.21540386974811554, \"mean_train_loss\": 4.354661911725998, \"mean_val_loss\": 4.5700657814741135, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.1, \"mean_generalization_gap\": 0.1957206204533577, \"mean_train_loss\": 4.382582053542137, \"mean_val_loss\": 4.578302673995495, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.14, \"mean_generalization_gap\": 0.19393974542617798, \"mean_train_loss\": 4.412918761372566, \"mean_val_loss\": 4.606858506798744, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.2, \"mean_generalization_gap\": 0.16235049068927765, \"mean_train_loss\": 4.493421167135239, \"mean_val_loss\": 4.655771657824516, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.3, \"mean_generalization_gap\": 0.15203186124563217, \"mean_train_loss\": 4.599567919969559, \"mean_val_loss\": 4.751599781215191, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.4, \"mean_generalization_gap\": 0.12918514013290405, \"mean_train_loss\": 4.726367101073265, \"mean_val_loss\": 4.855552241206169, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.5, \"mean_generalization_gap\": 0.10444032400846481, \"mean_train_loss\": 4.873788967728615, \"mean_val_loss\": 4.97822929173708, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.6, \"mean_generalization_gap\": 0.09351418912410736, \"mean_train_loss\": 5.051660537719727, \"mean_val_loss\": 5.145174726843834, \"n\": 1, 
\"std_val_loss\": 0.0}, {\"dropout\": 0.7, \"mean_generalization_gap\": 0.0773598924279213, \"mean_train_loss\": 5.283298075199127, \"mean_val_loss\": 5.3606579676270485, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.8, \"mean_generalization_gap\": 0.04376060515642166, \"mean_train_loss\": 5.67519947886467, \"mean_val_loss\": 5.7189600840210915, \"n\": 1, \"std_val_loss\": 0.0}, {\"dropout\": 0.9, \"mean_generalization_gap\": 0.027009807527065277, \"mean_train_loss\": 7.029567465186119, \"mean_val_loss\": 7.056577272713184, \"n\": 1, \"std_val_loss\": 0.0}]"
|
| 406 |
}
|
| 407 |
]
|
runs/screen_static/20260525-133008/summary.csv
CHANGED
|
@@ -1,7 +1,78 @@
|
|
| 1 |
run_mode,condition,condition_kind,stage,token_limit,model_name,n_layer,n_head,n_embd,parameters,dropout_initial,dropout_final,dropout_schedule,n,mean_train_eval_loss,std_train_eval_loss,mean_val_eval_loss,std_val_eval_loss,mean_generalization_gap,std_generalization_gap
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
screen_static,static_dropout_0,static,,250000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,0.2792285941541195,0.0,8.201775573194027,0.0,7.9225469790399075,0.0
|
| 3 |
screen_static,static_dropout_0.02,static,,250000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,0.2605770383961499,0.0,8.07141625136137,0.0,7.81083921296522,0.0
|
| 4 |
screen_static,static_dropout_0.05,static,,250000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,0.26725416351109743,0.0,7.9582009464502335,0.0,7.690946782939136,0.0
|
|
|
|
| 5 |
screen_static,static_dropout_0.1,static,,250000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,0.338621674105525,0.0,7.588417321443558,0.0,7.249795647338033,0.0
|
| 6 |
screen_static,static_dropout_0.14,static,,250000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,0.49970753211528063,0.0,7.2848557233810425,0.0,6.785148191265762,0.0
|
| 7 |
screen_static,static_dropout_0.2,static,,250000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,0.8224823493510485,0.0,6.83830863237381,0.0,6.015826283022761,0.0
|
|
@@ -15,6 +86,7 @@ screen_static,static_dropout_0.9,static,,250000,L16_H8_D384,16,8,384,31457280,0.
|
|
| 15 |
screen_static,static_dropout_0,static,,500000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,0.7200149409472942,0.0,7.288873381912708,0.0,6.568858440965414,0.0
|
| 16 |
screen_static,static_dropout_0.02,static,,500000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,0.9117673244327307,0.0,6.895432710647583,0.0,5.983665386214852,0.0
|
| 17 |
screen_static,static_dropout_0.05,static,,500000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,1.2327042073011398,0.0,6.497423432767391,0.0,5.264719225466251,0.0
|
|
|
|
| 18 |
screen_static,static_dropout_0.1,static,,500000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,1.6818840466439724,0.0,5.9032503962516785,0.0,4.221366349607706,0.0
|
| 19 |
screen_static,static_dropout_0.14,static,,500000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,1.9904268346726894,0.0,5.6262156665325165,0.0,3.635788831859827,0.0
|
| 20 |
screen_static,static_dropout_0.2,static,,500000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,2.416015110909939,0.0,5.327137835323811,0.0,2.9111227244138718,0.0
|
|
@@ -28,6 +100,7 @@ screen_static,static_dropout_0.9,static,,500000,L16_H8_D384,16,8,384,31457280,0.
|
|
| 28 |
screen_static,static_dropout_0,static,,1000000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,2.2476234324276447,0.0,5.652379140257835,0.0,3.4047557078301907,0.0
|
| 29 |
screen_static,static_dropout_0.02,static,,1000000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,2.4646411538124084,0.0,5.3384313732385635,0.0,2.873790219426155,0.0
|
| 30 |
screen_static,static_dropout_0.05,static,,1000000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,2.7036017999053,0.0,5.114995934069157,0.0,2.4113941341638565,0.0
|
|
|
|
| 31 |
screen_static,static_dropout_0.1,static,,1000000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,2.992223806679249,0.0,4.842831656336784,0.0,1.8506078496575356,0.0
|
| 32 |
screen_static,static_dropout_0.14,static,,1000000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,3.1947626248002052,0.0,4.751600541174412,0.0,1.5568379163742065,0.0
|
| 33 |
screen_static,static_dropout_0.2,static,,1000000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,3.3926073163747787,0.0,4.67560038715601,0.0,1.282993070781231,0.0
|
|
@@ -41,7 +114,98 @@ screen_static,static_dropout_0.9,static,,1000000,L16_H8_D384,16,8,384,31457280,0
|
|
| 41 |
screen_static,static_dropout_0,static,,2000000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,3.3746245950460434,0.0,4.625189505517483,0.0,1.2505649104714394,0.0
|
| 42 |
screen_static,static_dropout_0.02,static,,2000000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,3.4837590381503105,0.0,4.542641267180443,0.0,1.0588822290301323,0.0
|
| 43 |
screen_static,static_dropout_0.05,static,,2000000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,3.57197318226099,0.0,4.487178318202496,0.0,0.9152051359415054,0.0
|
|
|
|
| 44 |
screen_static,static_dropout_0.1,static,,2000000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,3.686768524348736,0.0,4.436601020395756,0.0,0.74983249604702,0.0
|
| 45 |
screen_static,static_dropout_0.14,static,,2000000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,3.776127725839615,0.0,4.427024222910404,0.0,0.6508964970707893,0.0
|
| 46 |
screen_static,static_dropout_0.2,static,,2000000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,3.9085786044597626,0.0,4.450810715556145,0.0,0.5422321110963821,0.0
|
| 47 |
screen_static,static_dropout_0.3,static,,2000000,L16_H8_D384,16,8,384,31457280,0.3,0.3,constant,1,4.04004543274641,0.0,4.494914263486862,0.0,0.4548688307404518,0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
run_mode,condition,condition_kind,stage,token_limit,model_name,n_layer,n_head,n_embd,parameters,dropout_initial,dropout_final,dropout_schedule,n,mean_train_eval_loss,std_train_eval_loss,mean_val_eval_loss,std_val_eval_loss,mean_generalization_gap,std_generalization_gap
|
| 2 |
+
screen_static,static_dropout_0,static,,250000,L12_H8_D320,12,8,320,17367040,0.0,0.0,constant,1,0.3941123625263572,0.0,8.15590063482523,0.0,7.7617882722988725,0.0
|
| 3 |
+
screen_static,static_dropout_0.02,static,,250000,L12_H8_D320,12,8,320,17367040,0.02,0.02,constant,1,0.4179071066901088,0.0,7.992463633418083,0.0,7.574556526727974,0.0
|
| 4 |
+
screen_static,static_dropout_0.05,static,,250000,L12_H8_D320,12,8,320,17367040,0.05,0.05,constant,1,0.5652239341288805,0.0,7.520913131535053,0.0,6.955689197406173,0.0
|
| 5 |
+
screen_static,static_dropout_0.08,static,,250000,L12_H8_D320,12,8,320,17367040,0.08,0.08,constant,1,0.7794149778783321,0.0,7.110055461525917,0.0,6.330640483647585,0.0
|
| 6 |
+
screen_static,static_dropout_0.1,static,,250000,L12_H8_D320,12,8,320,17367040,0.1,0.1,constant,1,0.9396447688341141,0.0,6.955568745732307,0.0,6.015923976898193,0.0
|
| 7 |
+
screen_static,static_dropout_0.14,static,,250000,L12_H8_D320,12,8,320,17367040,0.14,0.14,constant,1,1.2607302814722061,0.0,6.557253144681454,0.0,5.296522863209248,0.0
|
| 8 |
+
screen_static,static_dropout_0.2,static,,250000,L12_H8_D320,12,8,320,17367040,0.2,0.2,constant,1,1.6681972332298756,0.0,6.14814979583025,0.0,4.479952562600374,0.0
|
| 9 |
+
screen_static,static_dropout_0.3,static,,250000,L12_H8_D320,12,8,320,17367040,0.3,0.3,constant,1,2.34628177434206,0.0,5.712921425700188,0.0,3.3666396513581276,0.0
|
| 10 |
+
screen_static,static_dropout_0.4,static,,250000,L12_H8_D320,12,8,320,17367040,0.4,0.4,constant,1,2.881612576544285,0.0,5.518501423299313,0.0,2.6368888467550278,0.0
|
| 11 |
+
screen_static,static_dropout_0.5,static,,250000,L12_H8_D320,12,8,320,17367040,0.5,0.5,constant,1,3.372021272778511,0.0,5.438360869884491,0.0,2.06633959710598,0.0
|
| 12 |
+
screen_static,static_dropout_0.6,static,,250000,L12_H8_D320,12,8,320,17367040,0.6,0.6,constant,1,3.8244777768850327,0.0,5.500709563493729,0.0,1.676231786608696,0.0
|
| 13 |
+
screen_static,static_dropout_0.7,static,,250000,L12_H8_D320,12,8,320,17367040,0.7,0.7,constant,1,4.315010622143745,0.0,5.6531674563884735,0.0,1.338156834244728,0.0
|
| 14 |
+
screen_static,static_dropout_0.8,static,,250000,L12_H8_D320,12,8,320,17367040,0.8,0.8,constant,1,4.907547950744629,0.0,5.926037907600403,0.0,1.018489956855774,0.0
|
| 15 |
+
screen_static,static_dropout_0.9,static,,250000,L12_H8_D320,12,8,320,17367040,0.9,0.9,constant,1,6.403348430991173,0.0,6.905720971524715,0.0,0.5023725405335426,0.0
|
| 16 |
+
screen_static,static_dropout_0,static,,500000,L12_H8_D320,12,8,320,17367040,0.0,0.0,constant,1,1.4378521665930748,0.0,6.882166460156441,0.0,5.444314293563366,0.0
|
| 17 |
+
screen_static,static_dropout_0.02,static,,500000,L12_H8_D320,12,8,320,17367040,0.02,0.02,constant,1,1.7486785426735878,0.0,6.384260520339012,0.0,4.635581977665424,0.0
|
| 18 |
+
screen_static,static_dropout_0.05,static,,500000,L12_H8_D320,12,8,320,17367040,0.05,0.05,constant,1,2.0567418597638607,0.0,5.895766265690327,0.0,3.839024405926466,0.0
|
| 19 |
+
screen_static,static_dropout_0.08,static,,500000,L12_H8_D320,12,8,320,17367040,0.08,0.08,constant,1,2.371535360813141,0.0,5.597154214978218,0.0,3.225618854165077,0.0
|
| 20 |
+
screen_static,static_dropout_0.1,static,,500000,L12_H8_D320,12,8,320,17367040,0.1,0.1,constant,1,2.4893965795636177,0.0,5.44785263389349,0.0,2.958456054329872,0.0
|
| 21 |
+
screen_static,static_dropout_0.14,static,,500000,L12_H8_D320,12,8,320,17367040,0.14,0.14,constant,1,2.7468534484505653,0.0,5.25950114428997,0.0,2.512647695839405,0.0
|
| 22 |
+
screen_static,static_dropout_0.2,static,,500000,L12_H8_D320,12,8,320,17367040,0.2,0.2,constant,1,3.0360172167420387,0.0,5.096045188605785,0.0,2.0600279718637466,0.0
|
| 23 |
+
screen_static,static_dropout_0.3,static,,500000,L12_H8_D320,12,8,320,17367040,0.3,0.3,constant,1,3.40982835739851,0.0,4.984055809676647,0.0,1.5742274522781372,0.0
|
| 24 |
+
screen_static,static_dropout_0.4,static,,500000,L12_H8_D320,12,8,320,17367040,0.4,0.4,constant,1,3.7357609048485756,0.0,4.979132980108261,0.0,1.2433720752596855,0.0
|
| 25 |
+
screen_static,static_dropout_0.5,static,,500000,L12_H8_D320,12,8,320,17367040,0.5,0.5,constant,1,4.029533125460148,0.0,5.058223366737366,0.0,1.0286902412772179,0.0
|
| 26 |
+
screen_static,static_dropout_0.6,static,,500000,L12_H8_D320,12,8,320,17367040,0.6,0.6,constant,1,4.322844922542572,0.0,5.183757498860359,0.0,0.8609125763177872,0.0
|
| 27 |
+
screen_static,static_dropout_0.7,static,,500000,L12_H8_D320,12,8,320,17367040,0.7,0.7,constant,1,4.680620342493057,0.0,5.384185768663883,0.0,0.703565426170826,0.0
|
| 28 |
+
screen_static,static_dropout_0.8,static,,500000,L12_H8_D320,12,8,320,17367040,0.8,0.8,constant,1,5.157206907868385,0.0,5.687841325998306,0.0,0.530634418129921,0.0
|
| 29 |
+
screen_static,static_dropout_0.9,static,,500000,L12_H8_D320,12,8,320,17367040,0.9,0.9,constant,1,6.49226588010788,0.0,6.766090348362923,0.0,0.27382446825504303,0.0
|
| 30 |
+
screen_static,static_dropout_0,static,,1000000,L12_H8_D320,12,8,320,17367040,0.0,0.0,constant,1,2.886379562318325,0.0,5.327972687780857,0.0,2.441593125462532,0.0
|
| 31 |
+
screen_static,static_dropout_0.02,static,,1000000,L12_H8_D320,12,8,320,17367040,0.02,0.02,constant,1,3.0628975853323936,0.0,5.098849378526211,0.0,2.035951793193817,0.0
|
| 32 |
+
screen_static,static_dropout_0.05,static,,1000000,L12_H8_D320,12,8,320,17367040,0.05,0.05,constant,1,3.268335275352001,0.0,4.902691639959812,0.0,1.634356364607811,0.0
|
| 33 |
+
screen_static,static_dropout_0.08,static,,1000000,L12_H8_D320,12,8,320,17367040,0.08,0.08,constant,1,3.3916852474212646,0.0,4.789849318563938,0.0,1.3981640711426735,0.0
|
| 34 |
+
screen_static,static_dropout_0.1,static,,1000000,L12_H8_D320,12,8,320,17367040,0.1,0.1,constant,1,3.4663245379924774,0.0,4.7517495304346085,0.0,1.285424992442131,0.0
|
| 35 |
+
screen_static,static_dropout_0.14,static,,1000000,L12_H8_D320,12,8,320,17367040,0.14,0.14,constant,1,3.591317318379879,0.0,4.717012815177441,0.0,1.1256954967975616,0.0
|
| 36 |
+
screen_static,static_dropout_0.2,static,,1000000,L12_H8_D320,12,8,320,17367040,0.2,0.2,constant,1,3.7159592658281326,0.0,4.687077932059765,0.0,0.9711186662316322,0.0
|
| 37 |
+
screen_static,static_dropout_0.3,static,,1000000,L12_H8_D320,12,8,320,17367040,0.3,0.3,constant,1,3.9408339336514473,0.0,4.721444360911846,0.0,0.7806104272603989,0.0
|
| 38 |
+
screen_static,static_dropout_0.4,static,,1000000,L12_H8_D320,12,8,320,17367040,0.4,0.4,constant,1,4.1257737800478935,0.0,4.780547931790352,0.0,0.6547741517424583,0.0
|
| 39 |
+
screen_static,static_dropout_0.5,static,,1000000,L12_H8_D320,12,8,320,17367040,0.5,0.5,constant,1,4.341994017362595,0.0,4.8996801152825356,0.0,0.557686097919941,0.0
|
| 40 |
+
screen_static,static_dropout_0.6,static,,1000000,L12_H8_D320,12,8,320,17367040,0.6,0.6,constant,1,4.583218589425087,0.0,5.059419609606266,0.0,0.47620102018117905,0.0
|
| 41 |
+
screen_static,static_dropout_0.7,static,,1000000,L12_H8_D320,12,8,320,17367040,0.7,0.7,constant,1,4.866017505526543,0.0,5.257686018943787,0.0,0.39166851341724396,0.0
|
| 42 |
+
screen_static,static_dropout_0.8,static,,1000000,L12_H8_D320,12,8,320,17367040,0.8,0.8,constant,1,5.279122695326805,0.0,5.569660127162933,0.0,0.29053743183612823,0.0
|
| 43 |
+
screen_static,static_dropout_0.9,static,,1000000,L12_H8_D320,12,8,320,17367040,0.9,0.9,constant,1,6.6291023045778275,0.0,6.74848935008049,0.0,0.11938704550266266,0.0
|
| 44 |
+
screen_static,static_dropout_0,static,,2000000,L12_H8_D320,12,8,320,17367040,0.0,0.0,constant,1,3.716029703617096,0.0,4.59531170129776,0.0,0.8792819976806641,0.0
|
| 45 |
+
screen_static,static_dropout_0.02,static,,2000000,L12_H8_D320,12,8,320,17367040,0.02,0.02,constant,1,3.7766708433628082,0.0,4.549462892115116,0.0,0.7727920487523079,0.0
|
| 46 |
+
screen_static,static_dropout_0.05,static,,2000000,L12_H8_D320,12,8,320,17367040,0.05,0.05,constant,1,3.8593605384230614,0.0,4.51806978136301,0.0,0.658709242939949,0.0
|
| 47 |
+
screen_static,static_dropout_0.08,static,,2000000,L12_H8_D320,12,8,320,17367040,0.08,0.08,constant,1,3.9309239983558655,0.0,4.512066327035427,0.0,0.5811423286795616,0.0
|
| 48 |
+
screen_static,static_dropout_0.1,static,,2000000,L12_H8_D320,12,8,320,17367040,0.1,0.1,constant,1,3.9626980274915695,0.0,4.521232694387436,0.0,0.5585346668958664,0.0
|
| 49 |
+
screen_static,static_dropout_0.14,static,,2000000,L12_H8_D320,12,8,320,17367040,0.14,0.14,constant,1,4.021788038313389,0.0,4.508757032454014,0.0,0.486968994140625,0.0
|
| 50 |
+
screen_static,static_dropout_0.2,static,,2000000,L12_H8_D320,12,8,320,17367040,0.2,0.2,constant,1,4.107810087502003,0.0,4.5469953790307045,0.0,0.4391852915287018,0.0
|
| 51 |
+
screen_static,static_dropout_0.3,static,,2000000,L12_H8_D320,12,8,320,17367040,0.3,0.3,constant,1,4.248827308416367,0.0,4.6074274107813835,0.0,0.35860010236501694,0.0
|
| 52 |
+
screen_static,static_dropout_0.4,static,,2000000,L12_H8_D320,12,8,320,17367040,0.4,0.4,constant,1,4.388647809624672,0.0,4.695319287478924,0.0,0.30667147785425186,0.0
|
| 53 |
+
screen_static,static_dropout_0.5,static,,2000000,L12_H8_D320,12,8,320,17367040,0.5,0.5,constant,1,4.564960986375809,0.0,4.814361557364464,0.0,0.2494005709886551,0.0
|
| 54 |
+
screen_static,static_dropout_0.6,static,,2000000,L12_H8_D320,12,8,320,17367040,0.6,0.6,constant,1,4.756197020411491,0.0,4.968473710119724,0.0,0.21227668970823288,0.0
|
| 55 |
+
screen_static,static_dropout_0.7,static,,2000000,L12_H8_D320,12,8,320,17367040,0.7,0.7,constant,1,5.009785428643227,0.0,5.180024988949299,0.0,0.17023956030607224,0.0
|
| 56 |
+
screen_static,static_dropout_0.8,static,,2000000,L12_H8_D320,12,8,320,17367040,0.8,0.8,constant,1,5.402336552739143,0.0,5.5206974893808365,0.0,0.11836093664169312,0.0
|
| 57 |
+
screen_static,static_dropout_0.9,static,,2000000,L12_H8_D320,12,8,320,17367040,0.9,0.9,constant,1,6.721093267202377,0.0,6.741106614470482,0.0,0.020013347268104553,0.0
|
| 58 |
+
screen_static,static_dropout_0,static,,4000000,L12_H8_D320,12,8,320,17367040,0.0,0.0,constant,1,4.032259531319141,0.0,4.399908438324928,0.0,0.3676489070057869,0.0
|
| 59 |
+
screen_static,static_dropout_0.02,static,,4000000,L12_H8_D320,12,8,320,17367040,0.02,0.02,constant,1,4.030040971934795,0.0,4.387514792382717,0.0,0.35747382044792175,0.0
|
| 60 |
+
screen_static,static_dropout_0.05,static,,4000000,L12_H8_D320,12,8,320,17367040,0.05,0.05,constant,1,4.092799432575703,0.0,4.40625736862421,0.0,0.3134579360485077,0.0
|
| 61 |
+
screen_static,static_dropout_0.08,static,,4000000,L12_H8_D320,12,8,320,17367040,0.08,0.08,constant,1,4.108970053493977,0.0,4.398195922374725,0.0,0.28922586888074875,0.0
|
| 62 |
+
screen_static,static_dropout_0.1,static,,4000000,L12_H8_D320,12,8,320,17367040,0.1,0.1,constant,1,4.135565012693405,0.0,4.414667069911957,0.0,0.27910205721855164,0.0
|
| 63 |
+
screen_static,static_dropout_0.14,static,,4000000,L12_H8_D320,12,8,320,17367040,0.14,0.14,constant,1,4.194389328360558,0.0,4.444405451416969,0.0,0.25001612305641174,0.0
|
| 64 |
+
screen_static,static_dropout_0.2,static,,4000000,L12_H8_D320,12,8,320,17367040,0.2,0.2,constant,1,4.247993364930153,0.0,4.481428653001785,0.0,0.23343528807163239,0.0
|
| 65 |
+
screen_static,static_dropout_0.3,static,,4000000,L12_H8_D320,12,8,320,17367040,0.3,0.3,constant,1,4.347917139530182,0.0,4.5457378178834915,0.0,0.19782067835330963,0.0
|
| 66 |
+
screen_static,static_dropout_0.4,static,,4000000,L12_H8_D320,12,8,320,17367040,0.4,0.4,constant,1,4.486644446849823,0.0,4.6496401727199554,0.0,0.16299572587013245,0.0
|
| 67 |
+
screen_static,static_dropout_0.5,static,,4000000,L12_H8_D320,12,8,320,17367040,0.5,0.5,constant,1,4.6344869285821915,0.0,4.769118778407574,0.0,0.13463184982538223,0.0
|
| 68 |
+
screen_static,static_dropout_0.6,static,,4000000,L12_H8_D320,12,8,320,17367040,0.6,0.6,constant,1,4.809540346264839,0.0,4.932490445673466,0.0,0.12295009940862656,0.0
|
| 69 |
+
screen_static,static_dropout_0.7,static,,4000000,L12_H8_D320,12,8,320,17367040,0.7,0.7,constant,1,5.056431487202644,0.0,5.155690468847752,0.0,0.09925898164510727,0.0
|
| 70 |
+
screen_static,static_dropout_0.8,static,,4000000,L12_H8_D320,12,8,320,17367040,0.8,0.8,constant,1,5.429492920637131,0.0,5.488469086587429,0.0,0.05897616595029831,0.0
|
| 71 |
+
screen_static,static_dropout_0.9,static,,4000000,L12_H8_D320,12,8,320,17367040,0.9,0.9,constant,1,6.758838355541229,0.0,6.7735652178525925,0.0,0.01472686231136322,0.0
|
| 72 |
screen_static,static_dropout_0,static,,250000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,0.2792285941541195,0.0,8.201775573194027,0.0,7.9225469790399075,0.0
|
| 73 |
screen_static,static_dropout_0.02,static,,250000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,0.2605770383961499,0.0,8.07141625136137,0.0,7.81083921296522,0.0
|
| 74 |
screen_static,static_dropout_0.05,static,,250000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,0.26725416351109743,0.0,7.9582009464502335,0.0,7.690946782939136,0.0
|
| 75 |
+
screen_static,static_dropout_0.08,static,,250000,L16_H8_D384,16,8,384,31457280,0.08,0.08,constant,1,0.29682786762714386,0.0,7.760790415108204,0.0,7.46396254748106,0.0
|
| 76 |
screen_static,static_dropout_0.1,static,,250000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,0.338621674105525,0.0,7.588417321443558,0.0,7.249795647338033,0.0
|
| 77 |
screen_static,static_dropout_0.14,static,,250000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,0.49970753211528063,0.0,7.2848557233810425,0.0,6.785148191265762,0.0
|
| 78 |
screen_static,static_dropout_0.2,static,,250000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,0.8224823493510485,0.0,6.83830863237381,0.0,6.015826283022761,0.0
|
|
|
|
| 86 |
screen_static,static_dropout_0,static,,500000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,0.7200149409472942,0.0,7.288873381912708,0.0,6.568858440965414,0.0
|
| 87 |
screen_static,static_dropout_0.02,static,,500000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,0.9117673244327307,0.0,6.895432710647583,0.0,5.983665386214852,0.0
|
| 88 |
screen_static,static_dropout_0.05,static,,500000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,1.2327042073011398,0.0,6.497423432767391,0.0,5.264719225466251,0.0
|
| 89 |
+
screen_static,static_dropout_0.08,static,,500000,L16_H8_D384,16,8,384,31457280,0.08,0.08,constant,1,1.5379231162369251,0.0,6.107891380786896,0.0,4.569968264549971,0.0
|
| 90 |
screen_static,static_dropout_0.1,static,,500000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,1.6818840466439724,0.0,5.9032503962516785,0.0,4.221366349607706,0.0
|
| 91 |
screen_static,static_dropout_0.14,static,,500000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,1.9904268346726894,0.0,5.6262156665325165,0.0,3.635788831859827,0.0
|
| 92 |
screen_static,static_dropout_0.2,static,,500000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,2.416015110909939,0.0,5.327137835323811,0.0,2.9111227244138718,0.0
|
|
|
|
| 100 |
screen_static,static_dropout_0,static,,1000000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,2.2476234324276447,0.0,5.652379140257835,0.0,3.4047557078301907,0.0
|
| 101 |
screen_static,static_dropout_0.02,static,,1000000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,2.4646411538124084,0.0,5.3384313732385635,0.0,2.873790219426155,0.0
|
| 102 |
screen_static,static_dropout_0.05,static,,1000000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,2.7036017999053,0.0,5.114995934069157,0.0,2.4113941341638565,0.0
|
| 103 |
+
screen_static,static_dropout_0.08,static,,1000000,L16_H8_D384,16,8,384,31457280,0.08,0.08,constant,1,2.8963891118764877,0.0,4.942643143236637,0.0,2.0462540313601494,0.0
|
| 104 |
screen_static,static_dropout_0.1,static,,1000000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,2.992223806679249,0.0,4.842831656336784,0.0,1.8506078496575356,0.0
|
| 105 |
screen_static,static_dropout_0.14,static,,1000000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,3.1947626248002052,0.0,4.751600541174412,0.0,1.5568379163742065,0.0
|
| 106 |
screen_static,static_dropout_0.2,static,,1000000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,3.3926073163747787,0.0,4.67560038715601,0.0,1.282993070781231,0.0
|
|
|
|
| 114 |
screen_static,static_dropout_0,static,,2000000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,3.3746245950460434,0.0,4.625189505517483,0.0,1.2505649104714394,0.0
|
| 115 |
screen_static,static_dropout_0.02,static,,2000000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,3.4837590381503105,0.0,4.542641267180443,0.0,1.0588822290301323,0.0
|
| 116 |
screen_static,static_dropout_0.05,static,,2000000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,3.57197318226099,0.0,4.487178318202496,0.0,0.9152051359415054,0.0
|
| 117 |
+
screen_static,static_dropout_0.08,static,,2000000,L16_H8_D384,16,8,384,31457280,0.08,0.08,constant,1,3.6279477402567863,0.0,4.455653116106987,0.0,0.8277053758502007,0.0
|
| 118 |
screen_static,static_dropout_0.1,static,,2000000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,3.686768524348736,0.0,4.436601020395756,0.0,0.74983249604702,0.0
|
| 119 |
screen_static,static_dropout_0.14,static,,2000000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,3.776127725839615,0.0,4.427024222910404,0.0,0.6508964970707893,0.0
|
| 120 |
screen_static,static_dropout_0.2,static,,2000000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,3.9085786044597626,0.0,4.450810715556145,0.0,0.5422321110963821,0.0
|
| 121 |
screen_static,static_dropout_0.3,static,,2000000,L16_H8_D384,16,8,384,31457280,0.3,0.3,constant,1,4.04004543274641,0.0,4.494914263486862,0.0,0.4548688307404518,0.0
|
| 122 |
+
screen_static,static_dropout_0.4,static,,2000000,L16_H8_D384,16,8,384,31457280,0.4,0.4,constant,1,4.1651866137981415,0.0,4.5558576956391335,0.0,0.390671081840992,0.0
|
| 123 |
+
screen_static,static_dropout_0.5,static,,2000000,L16_H8_D384,16,8,384,31457280,0.5,0.5,constant,1,4.337527960538864,0.0,4.653308063745499,0.0,0.3157801032066345,0.0
|
| 124 |
+
screen_static,static_dropout_0.6,static,,2000000,L16_H8_D384,16,8,384,31457280,0.6,0.6,constant,1,4.549529016017914,0.0,4.811031833291054,0.0,0.26150281727313995,0.0
|
| 125 |
+
screen_static,static_dropout_0.7,static,,2000000,L16_H8_D384,16,8,384,31457280,0.7,0.7,constant,1,4.847206577658653,0.0,5.051212973892689,0.0,0.2040063962340355,0.0
|
| 126 |
+
screen_static,static_dropout_0.8,static,,2000000,L16_H8_D384,16,8,384,31457280,0.8,0.8,constant,1,5.27906721830368,0.0,5.419115200638771,0.0,0.14004798233509064,0.0
|
| 127 |
+
screen_static,static_dropout_0.9,static,,2000000,L16_H8_D384,16,8,384,31457280,0.9,0.9,constant,1,6.697221785783768,0.0,6.725677810609341,0.0,0.028456024825572968,0.0
|
| 128 |
+
screen_static,static_dropout_0,static,,4000000,L16_H8_D384,16,8,384,31457280,0.0,0.0,constant,1,3.8416539430618286,0.0,4.324729532003403,0.0,0.4830755889415741,0.0
|
| 129 |
+
screen_static,static_dropout_0.02,static,,4000000,L16_H8_D384,16,8,384,31457280,0.02,0.02,constant,1,3.8546612709760666,0.0,4.294691443443298,0.0,0.44003017246723175,0.0
|
| 130 |
+
screen_static,static_dropout_0.05,static,,4000000,L16_H8_D384,16,8,384,31457280,0.05,0.05,constant,1,3.910273350775242,0.0,4.315198875963688,0.0,0.40492552518844604,0.0
|
| 131 |
+
screen_static,static_dropout_0.08,static,,4000000,L16_H8_D384,16,8,384,31457280,0.08,0.08,constant,1,3.9314780607819557,0.0,4.3001657873392105,0.0,0.3686877265572548,0.0
|
| 132 |
+
screen_static,static_dropout_0.1,static,,4000000,L16_H8_D384,16,8,384,31457280,0.1,0.1,constant,1,3.9545229598879814,0.0,4.310179218649864,0.0,0.3556562587618828,0.0
|
| 133 |
+
screen_static,static_dropout_0.14,static,,4000000,L16_H8_D384,16,8,384,31457280,0.14,0.14,constant,1,4.008677661418915,0.0,4.324408069252968,0.0,0.31573040783405304,0.0
|
| 134 |
+
screen_static,static_dropout_0.2,static,,4000000,L16_H8_D384,16,8,384,31457280,0.2,0.2,constant,1,4.085749976336956,0.0,4.3524704575538635,0.0,0.2667204812169075,0.0
|
| 135 |
+
screen_static,static_dropout_0.3,static,,4000000,L16_H8_D384,16,8,384,31457280,0.3,0.3,constant,1,4.181676417589188,0.0,4.410779930651188,0.0,0.22910351306200027,0.0
|
| 136 |
+
screen_static,static_dropout_0.4,static,,4000000,L16_H8_D384,16,8,384,31457280,0.4,0.4,constant,1,4.293805286288261,0.0,4.497522778809071,0.0,0.20371749252080917,0.0
|
| 137 |
+
screen_static,static_dropout_0.5,static,,4000000,L16_H8_D384,16,8,384,31457280,0.5,0.5,constant,1,4.431871071457863,0.0,4.60690600425005,0.0,0.17503493279218674,0.0
|
| 138 |
+
screen_static,static_dropout_0.6,static,,4000000,L16_H8_D384,16,8,384,31457280,0.6,0.6,constant,1,4.635675564408302,0.0,4.77450954169035,0.0,0.13883397728204727,0.0
|
| 139 |
+
screen_static,static_dropout_0.7,static,,4000000,L16_H8_D384,16,8,384,31457280,0.7,0.7,constant,1,4.896953746676445,0.0,5.013129934668541,0.0,0.11617618799209595,0.0
|
| 140 |
+
screen_static,static_dropout_0.8,static,,4000000,L16_H8_D384,16,8,384,31457280,0.8,0.8,constant,1,5.347471252083778,0.0,5.417073391377926,0.0,0.06960213929414749,0.0
|
| 141 |
+
screen_static,static_dropout_0.9,static,,4000000,L16_H8_D384,16,8,384,31457280,0.9,0.9,constant,1,6.603688538074493,0.0,6.623879760503769,0.0,0.020191222429275513,0.0
|
| 142 |
+
screen_static,static_dropout_0,static,,250000,L8_H8_D256,8,8,256,8388608,0.0,0.0,constant,1,0.9523934368044138,0.0,7.917540371417999,0.0,6.9651469346135855,0.0
|
| 143 |
+
screen_static,static_dropout_0.02,static,,250000,L8_H8_D256,8,8,256,8388608,0.02,0.02,constant,1,1.2368882782757282,0.0,7.236787244677544,0.0,5.999898966401815,0.0
|
| 144 |
+
screen_static,static_dropout_0.05,static,,250000,L8_H8_D256,8,8,256,8388608,0.05,0.05,constant,1,1.6339460164308548,0.0,6.600561022758484,0.0,4.966615006327629,0.0
|
| 145 |
+
screen_static,static_dropout_0.08,static,,250000,L8_H8_D256,8,8,256,8388608,0.08,0.08,constant,1,1.9089897610247135,0.0,6.256417088210583,0.0,4.347427327185869,0.0
|
| 146 |
+
screen_static,static_dropout_0.1,static,,250000,L8_H8_D256,8,8,256,8388608,0.1,0.1,constant,1,2.0858395472168922,0.0,6.091431401669979,0.0,4.005591854453087,0.0
|
| 147 |
+
screen_static,static_dropout_0.14,static,,250000,L8_H8_D256,8,8,256,8388608,0.14,0.14,constant,1,2.409040831029415,0.0,5.829747810959816,0.0,3.420706979930401,0.0
|
| 148 |
+
screen_static,static_dropout_0.2,static,,250000,L8_H8_D256,8,8,256,8388608,0.2,0.2,constant,1,2.780476927757263,0.0,5.596559099853039,0.0,2.8160821720957756,0.0
|
| 149 |
+
screen_static,static_dropout_0.3,static,,250000,L8_H8_D256,8,8,256,8388608,0.3,0.3,constant,1,3.2711103558540344,0.0,5.441870041191578,0.0,2.1707596853375435,0.0
|
| 150 |
+
screen_static,static_dropout_0.4,static,,250000,L8_H8_D256,8,8,256,8388608,0.4,0.4,constant,1,3.641142889857292,0.0,5.417454726994038,0.0,1.7763118371367455,0.0
|
| 151 |
+
screen_static,static_dropout_0.5,static,,250000,L8_H8_D256,8,8,256,8388608,0.5,0.5,constant,1,3.9926967695355415,0.0,5.4654273092746735,0.0,1.472730539739132,0.0
|
| 152 |
+
screen_static,static_dropout_0.6,static,,250000,L8_H8_D256,8,8,256,8388608,0.6,0.6,constant,1,4.320340022444725,0.0,5.586687326431274,0.0,1.2663473039865494,0.0
|
| 153 |
+
screen_static,static_dropout_0.7,static,,250000,L8_H8_D256,8,8,256,8388608,0.7,0.7,constant,1,4.705530673265457,0.0,5.780241504311562,0.0,1.0747108310461044,0.0
|
| 154 |
+
screen_static,static_dropout_0.8,static,,250000,L8_H8_D256,8,8,256,8388608,0.8,0.8,constant,1,5.24668562412262,0.0,6.090374693274498,0.0,0.8436890691518784,0.0
|
| 155 |
+
screen_static,static_dropout_0.9,static,,250000,L8_H8_D256,8,8,256,8388608,0.9,0.9,constant,1,6.712611332535744,0.0,7.160110808908939,0.0,0.44749947637319565,0.0
|
| 156 |
+
screen_static,static_dropout_0,static,,500000,L8_H8_D256,8,8,256,8388608,0.0,0.0,constant,1,2.5775627121329308,0.0,6.079639628529549,0.0,3.502076916396618,0.0
|
| 157 |
+
screen_static,static_dropout_0.02,static,,500000,L8_H8_D256,8,8,256,8388608,0.02,0.02,constant,1,2.8127700313925743,0.0,5.670176908373833,0.0,2.8574068769812584,0.0
|
| 158 |
+
screen_static,static_dropout_0.05,static,,500000,L8_H8_D256,8,8,256,8388608,0.05,0.05,constant,1,3.0387847647070885,0.0,5.3632767125964165,0.0,2.324491947889328,0.0
|
| 159 |
+
screen_static,static_dropout_0.08,static,,500000,L8_H8_D256,8,8,256,8388608,0.08,0.08,constant,1,3.2260689064860344,0.0,5.218468777835369,0.0,1.9923998713493347,0.0
|
| 160 |
+
screen_static,static_dropout_0.1,static,,500000,L8_H8_D256,8,8,256,8388608,0.1,0.1,constant,1,3.3107700124382973,0.0,5.160695806145668,0.0,1.8499257937073708,0.0
|
| 161 |
+
screen_static,static_dropout_0.14,static,,500000,L8_H8_D256,8,8,256,8388608,0.14,0.14,constant,1,3.4774143397808075,0.0,5.067136377096176,0.0,1.5897220373153687,0.0
|
| 162 |
+
screen_static,static_dropout_0.2,static,,500000,L8_H8_D256,8,8,256,8388608,0.2,0.2,constant,1,3.697887174785137,0.0,5.02164863795042,0.0,1.3237614631652832,0.0
|
| 163 |
+
screen_static,static_dropout_0.3,static,,500000,L8_H8_D256,8,8,256,8388608,0.3,0.3,constant,1,3.9667534679174423,0.0,5.0323494374752045,0.0,1.0655959695577621,0.0
|
| 164 |
+
screen_static,static_dropout_0.4,static,,500000,L8_H8_D256,8,8,256,8388608,0.4,0.4,constant,1,4.194036483764648,0.0,5.0916667357087135,0.0,0.8976302519440651,0.0
|
| 165 |
+
screen_static,static_dropout_0.5,static,,500000,L8_H8_D256,8,8,256,8388608,0.5,0.5,constant,1,4.428740322589874,0.0,5.20993360131979,0.0,0.7811932787299156,0.0
|
| 166 |
+
screen_static,static_dropout_0.6,static,,500000,L8_H8_D256,8,8,256,8388608,0.6,0.6,constant,1,4.666903376579285,0.0,5.3372097089886665,0.0,0.6703063324093819,0.0
|
| 167 |
+
screen_static,static_dropout_0.7,static,,500000,L8_H8_D256,8,8,256,8388608,0.7,0.7,constant,1,4.97511188685894,0.0,5.550175577402115,0.0,0.5750636905431747,0.0
|
| 168 |
+
screen_static,static_dropout_0.8,static,,500000,L8_H8_D256,8,8,256,8388608,0.8,0.8,constant,1,5.43462572991848,0.0,5.885032519698143,0.0,0.4504067897796631,0.0
|
| 169 |
+
screen_static,static_dropout_0.9,static,,500000,L8_H8_D256,8,8,256,8388608,0.9,0.9,constant,1,6.870217755436897,0.0,7.112152203917503,0.0,0.24193444848060608,0.0
|
| 170 |
+
screen_static,static_dropout_0,static,,1000000,L8_H8_D256,8,8,256,8388608,0.0,0.0,constant,1,3.5921367704868317,0.0,5.020985089242458,0.0,1.4288483187556267,0.0
|
| 171 |
+
screen_static,static_dropout_0.02,static,,1000000,L8_H8_D256,8,8,256,8388608,0.02,0.02,constant,1,3.692635416984558,0.0,4.894890554249287,0.0,1.2022551372647285,0.0
|
| 172 |
+
screen_static,static_dropout_0.05,static,,1000000,L8_H8_D256,8,8,256,8388608,0.05,0.05,constant,1,3.774454675614834,0.0,4.809566989541054,0.0,1.03511231392622,0.0
|
| 173 |
+
screen_static,static_dropout_0.08,static,,1000000,L8_H8_D256,8,8,256,8388608,0.08,0.08,constant,1,3.8572388663887978,0.0,4.799222990870476,0.0,0.941984124481678,0.0
|
| 174 |
+
screen_static,static_dropout_0.1,static,,1000000,L8_H8_D256,8,8,256,8388608,0.1,0.1,constant,1,3.9261686205863953,0.0,4.794602431356907,0.0,0.8684338107705116,0.0
|
| 175 |
+
screen_static,static_dropout_0.14,static,,1000000,L8_H8_D256,8,8,256,8388608,0.14,0.14,constant,1,3.9900290220975876,0.0,4.776347942650318,0.0,0.7863189205527306,0.0
|
| 176 |
+
screen_static,static_dropout_0.2,static,,1000000,L8_H8_D256,8,8,256,8388608,0.2,0.2,constant,1,4.130261890590191,0.0,4.801047399640083,0.0,0.6707855090498924,0.0
|
| 177 |
+
screen_static,static_dropout_0.3,static,,1000000,L8_H8_D256,8,8,256,8388608,0.3,0.3,constant,1,4.293951943516731,0.0,4.865733869373798,0.0,0.5717819258570671,0.0
|
| 178 |
+
screen_static,static_dropout_0.4,static,,1000000,L8_H8_D256,8,8,256,8388608,0.4,0.4,constant,1,4.475886330008507,0.0,4.962566897273064,0.0,0.4866805672645569,0.0
|
| 179 |
+
screen_static,static_dropout_0.5,static,,1000000,L8_H8_D256,8,8,256,8388608,0.5,0.5,constant,1,4.666878193616867,0.0,5.088004924356937,0.0,0.42112673074007034,0.0
|
| 180 |
+
screen_static,static_dropout_0.6,static,,1000000,L8_H8_D256,8,8,256,8388608,0.6,0.6,constant,1,4.864694103598595,0.0,5.237049944698811,0.0,0.3723558411002159,0.0
|
| 181 |
+
screen_static,static_dropout_0.7,static,,1000000,L8_H8_D256,8,8,256,8388608,0.7,0.7,constant,1,5.129916787147522,0.0,5.44158773124218,0.0,0.3116709440946579,0.0
|
| 182 |
+
screen_static,static_dropout_0.8,static,,1000000,L8_H8_D256,8,8,256,8388608,0.8,0.8,constant,1,5.578491851687431,0.0,5.808708101511002,0.0,0.23021624982357025,0.0
|
| 183 |
+
screen_static,static_dropout_0.9,static,,1000000,L8_H8_D256,8,8,256,8388608,0.9,0.9,constant,1,6.995351999998093,0.0,7.103986039757729,0.0,0.10863403975963593,0.0
|
| 184 |
+
screen_static,static_dropout_0,static,,2000000,L8_H8_D256,8,8,256,8388608,0.0,0.0,constant,1,4.086239457130432,0.0,4.6497810408473015,0.0,0.5635415837168694,0.0
|
| 185 |
+
screen_static,static_dropout_0.02,static,,2000000,L8_H8_D256,8,8,256,8388608,0.02,0.02,constant,1,4.138374790549278,0.0,4.640278935432434,0.0,0.5019041448831558,0.0
|
| 186 |
+
screen_static,static_dropout_0.05,static,,2000000,L8_H8_D256,8,8,256,8388608,0.05,0.05,constant,1,4.196284607052803,0.0,4.63186115026474,0.0,0.43557654321193695,0.0
|
| 187 |
+
screen_static,static_dropout_0.08,static,,2000000,L8_H8_D256,8,8,256,8388608,0.08,0.08,constant,1,4.2158468291163445,0.0,4.623203128576279,0.0,0.40735629945993423,0.0
|
| 188 |
+
screen_static,static_dropout_0.1,static,,2000000,L8_H8_D256,8,8,256,8388608,0.1,0.1,constant,1,4.267480067908764,0.0,4.6526564955711365,0.0,0.3851764276623726,0.0
|
| 189 |
+
screen_static,static_dropout_0.14,static,,2000000,L8_H8_D256,8,8,256,8388608,0.14,0.14,constant,1,4.291930258274078,0.0,4.652354396879673,0.0,0.36042413860559464,0.0
|
| 190 |
+
screen_static,static_dropout_0.2,static,,2000000,L8_H8_D256,8,8,256,8388608,0.2,0.2,constant,1,4.382655680179596,0.0,4.688187293708324,0.0,0.3055316135287285,0.0
|
| 191 |
+
screen_static,static_dropout_0.3,static,,2000000,L8_H8_D256,8,8,256,8388608,0.3,0.3,constant,1,4.514920085668564,0.0,4.779394708573818,0.0,0.26447462290525436,0.0
|
| 192 |
+
screen_static,static_dropout_0.4,static,,2000000,L8_H8_D256,8,8,256,8388608,0.4,0.4,constant,1,4.651856943964958,0.0,4.879608504474163,0.0,0.22775156050920486,0.0
|
| 193 |
+
screen_static,static_dropout_0.5,static,,2000000,L8_H8_D256,8,8,256,8388608,0.5,0.5,constant,1,4.822874888777733,0.0,5.010139018297195,0.0,0.18726412951946259,0.0
|
| 194 |
+
screen_static,static_dropout_0.6,static,,2000000,L8_H8_D256,8,8,256,8388608,0.6,0.6,constant,1,4.99927744269371,0.0,5.162296086549759,0.0,0.16301864385604858,0.0
|
| 195 |
+
screen_static,static_dropout_0.7,static,,2000000,L8_H8_D256,8,8,256,8388608,0.7,0.7,constant,1,5.243539854884148,0.0,5.3762659057974815,0.0,0.1327260509133339,0.0
|
| 196 |
+
screen_static,static_dropout_0.8,static,,2000000,L8_H8_D256,8,8,256,8388608,0.8,0.8,constant,1,5.64663989841938,0.0,5.7416622787714005,0.0,0.09502238035202026,0.0
|
| 197 |
+
screen_static,static_dropout_0.9,static,,2000000,L8_H8_D256,8,8,256,8388608,0.9,0.9,constant,1,7.009306907653809,0.0,7.041225396096706,0.0,0.0319184884428978,0.0
|
| 198 |
+
screen_static,static_dropout_0,static,,4000000,L8_H8_D256,8,8,256,8388608,0.0,0.0,constant,1,4.251466855406761,0.0,4.513561494648457,0.0,0.2620946392416954,0.0
|
| 199 |
+
screen_static,static_dropout_0.02,static,,4000000,L8_H8_D256,8,8,256,8388608,0.02,0.02,constant,1,4.275962874293327,0.0,4.528690077364445,0.0,0.2527272030711174,0.0
|
| 200 |
+
screen_static,static_dropout_0.05,static,,4000000,L8_H8_D256,8,8,256,8388608,0.05,0.05,constant,1,4.309461995959282,0.0,4.540056154131889,0.0,0.23059415817260742,0.0
|
| 201 |
+
screen_static,static_dropout_0.08,static,,4000000,L8_H8_D256,8,8,256,8388608,0.08,0.08,constant,1,4.354661911725998,0.0,4.5700657814741135,0.0,0.21540386974811554,0.0
|
| 202 |
+
screen_static,static_dropout_0.1,static,,4000000,L8_H8_D256,8,8,256,8388608,0.1,0.1,constant,1,4.382582053542137,0.0,4.578302673995495,0.0,0.1957206204533577,0.0
|
| 203 |
+
screen_static,static_dropout_0.14,static,,4000000,L8_H8_D256,8,8,256,8388608,0.14,0.14,constant,1,4.412918761372566,0.0,4.606858506798744,0.0,0.19393974542617798,0.0
|
| 204 |
+
screen_static,static_dropout_0.2,static,,4000000,L8_H8_D256,8,8,256,8388608,0.2,0.2,constant,1,4.493421167135239,0.0,4.655771657824516,0.0,0.16235049068927765,0.0
|
| 205 |
+
screen_static,static_dropout_0.3,static,,4000000,L8_H8_D256,8,8,256,8388608,0.3,0.3,constant,1,4.599567919969559,0.0,4.751599781215191,0.0,0.15203186124563217,0.0
|
| 206 |
+
screen_static,static_dropout_0.4,static,,4000000,L8_H8_D256,8,8,256,8388608,0.4,0.4,constant,1,4.726367101073265,0.0,4.855552241206169,0.0,0.12918514013290405,0.0
|
| 207 |
+
screen_static,static_dropout_0.5,static,,4000000,L8_H8_D256,8,8,256,8388608,0.5,0.5,constant,1,4.873788967728615,0.0,4.97822929173708,0.0,0.10444032400846481,0.0
|
| 208 |
+
screen_static,static_dropout_0.6,static,,4000000,L8_H8_D256,8,8,256,8388608,0.6,0.6,constant,1,5.051660537719727,0.0,5.145174726843834,0.0,0.09351418912410736,0.0
|
| 209 |
+
screen_static,static_dropout_0.7,static,,4000000,L8_H8_D256,8,8,256,8388608,0.7,0.7,constant,1,5.283298075199127,0.0,5.3606579676270485,0.0,0.0773598924279213,0.0
|
| 210 |
+
screen_static,static_dropout_0.8,static,,4000000,L8_H8_D256,8,8,256,8388608,0.8,0.8,constant,1,5.67519947886467,0.0,5.7189600840210915,0.0,0.04376060515642166,0.0
|
| 211 |
+
screen_static,static_dropout_0.9,static,,4000000,L8_H8_D256,8,8,256,8388608,0.9,0.9,constant,1,7.029567465186119,0.0,7.056577272713184,0.0,0.027009807527065277,0.0
|
runs/screen_static/20260525-133008/summary.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
runs/screen_static/20260525-133008/trace.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/dropout_decay/experiment.py
CHANGED
|
@@ -570,6 +570,47 @@ def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
|
|
| 570 |
writer.writerows(rows)
|
| 571 |
|
| 572 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
def build_model_selection(summary: list[dict], args: argparse.Namespace) -> list[dict]:
|
| 574 |
groups: dict[tuple, list[dict]] = defaultdict(list)
|
| 575 |
for row in summary:
|
|
@@ -663,82 +704,165 @@ def build_model_selection(summary: list[dict], args: argparse.Namespace) -> list
|
|
| 663 |
def write_screen_markdown_summary(output_dir: Path, rows: list[dict]) -> None:
|
| 664 |
if not rows:
|
| 665 |
return
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
)
|
| 675 |
-
by_prefix: dict[int, list[dict]] = defaultdict(list)
|
| 676 |
for row in rows:
|
| 677 |
if row["run_mode"] == "screen_static" and row["condition_kind"] == "static":
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
|
| 682 |
-
first = rows[0]
|
| 683 |
lines = [
|
| 684 |
"# Static Dropout Screen Summary",
|
| 685 |
"",
|
| 686 |
f"Run directory: `{output_dir}`",
|
| 687 |
"",
|
| 688 |
-
|
| 689 |
-
f"Model: `{first['model_name']}` causal Transformer, "
|
| 690 |
-
f"{int(first['parameters']):,} parameters, {first['n_layer']} layers, "
|
| 691 |
-
f"{first['n_head']} heads, {first['n_embd']} embedding dim, "
|
| 692 |
-
f"block size {first['model_config']['block_size']}, "
|
| 693 |
-
f"vocab size {first['model_config']['vocab_size']}."
|
| 694 |
-
),
|
| 695 |
-
(
|
| 696 |
-
"Training per condition: "
|
| 697 |
-
f"{first['steps']:,} steps x sampled batches = "
|
| 698 |
-
f"{int(first['tokens_seen']):,} sampled tokens. "
|
| 699 |
-
f"Seeds present: {', '.join(str(seed) for seed in sorted({row['seed'] for row in rows}))}."
|
| 700 |
-
),
|
| 701 |
"",
|
| 702 |
-
"
|
| 703 |
-
"",
|
| 704 |
-
"| Prefix tokens | Effective epochs | Best dropout | Val loss | Train eval loss | Gap | Plateau/bracket note |",
|
| 705 |
-
"|---:|---:|---:|---:|---:|---:|---|",
|
| 706 |
]
|
| 707 |
-
for
|
| 708 |
-
best = min(prefix_rows, key=lambda row: row["val_eval_loss"])
|
| 709 |
-
rates = [float(row["dropout_initial"]) for row in prefix_rows]
|
| 710 |
-
eff_epochs = float(best["tokens_seen"]) / prefix
|
| 711 |
-
if best["dropout_initial"] == max(rates):
|
| 712 |
-
note = "not bracketed; best at top of tested grid"
|
| 713 |
-
elif best["dropout_initial"] == min(rates):
|
| 714 |
-
note = "not bracketed; best at bottom of tested grid"
|
| 715 |
-
else:
|
| 716 |
-
note = "bracketed by tested grid"
|
| 717 |
lines.append(
|
| 718 |
"| "
|
| 719 |
-
f"{
|
| 720 |
-
f"{
|
| 721 |
-
f"{
|
|
|
|
| 722 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
|
| 724 |
-
for
|
| 725 |
-
|
|
|
|
|
|
|
| 726 |
lines.extend(
|
| 727 |
[
|
| 728 |
"",
|
| 729 |
-
f"##
|
| 730 |
-
"",
|
| 731 |
-
"| Dropout | Val loss | Train eval loss | Gap | Sampled tokens | Params |",
|
| 732 |
-
"|---:|---:|---:|---:|---:|---:|",
|
| 733 |
]
|
| 734 |
)
|
| 735 |
-
for
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
|
| 743 |
output = "\n".join(lines) + "\n"
|
| 744 |
(output_dir / "RESULT_SUMMARY.md").write_text(output, encoding="utf-8")
|
|
@@ -747,6 +871,126 @@ def write_screen_markdown_summary(output_dir: Path, rows: list[dict]) -> None:
|
|
| 747 |
(docs_path / "screen_static_results.md").write_text(output, encoding="utf-8")
|
| 748 |
|
| 749 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
def static_conditions(dropout_rates: list[float]) -> list[DropoutCondition]:
|
| 751 |
return [
|
| 752 |
DropoutCondition(
|
|
@@ -776,6 +1020,21 @@ def run_fixed_static_sweep(
|
|
| 776 |
rows: list[dict] = []
|
| 777 |
completed_keys = completed_keys or set()
|
| 778 |
conditions = static_conditions(sorted(set(args.dropout_rates)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
for token_limit in token_limits:
|
| 780 |
for model_spec in model_specs:
|
| 781 |
best_val_loss = float("inf")
|
|
@@ -830,6 +1089,7 @@ def run_fixed_static_sweep(
|
|
| 830 |
rows.append(row)
|
| 831 |
condition_rows.append(row)
|
| 832 |
completed_keys.add(metric_key(row))
|
|
|
|
| 833 |
del model, optimizer
|
| 834 |
torch.mps.empty_cache()
|
| 835 |
|
|
@@ -1080,6 +1340,7 @@ def run(args: argparse.Namespace) -> Path:
|
|
| 1080 |
)
|
| 1081 |
write_csv(output_dir / "model_selection.csv", selection, SELECTION_FIELDS)
|
| 1082 |
write_screen_markdown_summary(output_dir, rows)
|
|
|
|
| 1083 |
|
| 1084 |
print(
|
| 1085 |
json.dumps(
|
|
|
|
| 570 |
writer.writerows(rows)
|
| 571 |
|
| 572 |
|
| 573 |
+
def format_duration(seconds: float) -> str:
    """Render a duration given in seconds as a short human-readable string.

    Negative inputs are clamped to zero and fractional seconds are truncated
    by ``int``. Only the two most significant units are shown:

    * ``"1h05m"`` when the duration is at least one hour (seconds are dropped),
    * ``"2m07s"`` when it is at least one minute but under an hour,
    * ``"42s"`` otherwise.
    """
    seconds = max(0, int(seconds))
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}h{minutes:02d}m"
    if minutes:
        return f"{minutes}m{secs:02d}s"
    return f"{secs}s"
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
class ProgressMeter:
    """Print one progress line per finished run, including a running ETA."""

    def __init__(self, total: int):
        """Start the meter.

        Args:
            total: Number of runs expected; negative values are clamped to 0.
        """
        self.total = max(0, total)
        self.done = 0
        # Wall-clock start time; the ETA is derived from time elapsed since here.
        self.started_at = time.time()

    def mark_done(self, row: dict) -> None:
        """Record one finished run and print a single-line status to stdout.

        The ETA is the mean time per finished run (elapsed time since the
        meter was created divided by the number of finished runs) multiplied
        by the number of runs still outstanding.

        Args:
            row: Result record for the finished run. The keys read here are
                ``run_mode``, ``model_name``, ``parameters``, ``token_limit``,
                ``seed``, ``condition``, ``val_eval_loss``,
                ``train_eval_loss``, ``generalization_gap`` and
                ``elapsed_sec``.
        """
        self.done += 1
        elapsed = time.time() - self.started_at
        mean = elapsed / self.done if self.done else 0.0
        # Clamp so that marking more runs than `total` never yields a negative ETA.
        remaining = max(0, self.total - self.done)
        eta = remaining * mean
        print(
            "progress "
            f"{self.done}/{self.total} "
            f"eta={format_duration(eta)} "
            f"mode={row['run_mode']} "
            f"model={row['model_name']} "
            f"params={int(row['parameters']):,} "
            f"prefix={int(row['token_limit']):,} "
            f"seed={row['seed']} "
            f"condition={row['condition']} "
            f"val={row['val_eval_loss']:.4f} "
            f"train={row['train_eval_loss']:.4f} "
            f"gap={row['generalization_gap']:.4f} "
            f"elapsed={format_duration(float(row['elapsed_sec']))}",
            flush=True,
        )
|
| 612 |
+
|
| 613 |
+
|
| 614 |
def build_model_selection(summary: list[dict], args: argparse.Namespace) -> list[dict]:
|
| 615 |
groups: dict[tuple, list[dict]] = defaultdict(list)
|
| 616 |
for row in summary:
|
|
|
|
| 704 |
def write_screen_markdown_summary(output_dir: Path, rows: list[dict]) -> None:
|
| 705 |
if not rows:
|
| 706 |
return
|
| 707 |
+
static_rows = [
|
| 708 |
+
row
|
| 709 |
+
for row in rows
|
| 710 |
+
if row["run_mode"] == "screen_static" and row["condition_kind"] == "static"
|
| 711 |
+
]
|
| 712 |
+
if not static_rows:
|
| 713 |
+
return
|
| 714 |
+
|
| 715 |
+
by_model_prefix_rate: dict[tuple[str, int, float], list[dict]] = defaultdict(list)
|
|
|
|
| 716 |
for row in rows:
|
| 717 |
if row["run_mode"] == "screen_static" and row["condition_kind"] == "static":
|
| 718 |
+
by_model_prefix_rate[
|
| 719 |
+
(
|
| 720 |
+
row["model_name"],
|
| 721 |
+
int(row["token_limit"]),
|
| 722 |
+
float(row["dropout_initial"]),
|
| 723 |
+
)
|
| 724 |
+
].append(row)
|
| 725 |
+
|
| 726 |
+
aggregates: list[dict] = []
|
| 727 |
+
for (model_name, prefix, dropout), group_rows in by_model_prefix_rate.items():
|
| 728 |
+
first = group_rows[0]
|
| 729 |
+
val_losses = [float(row["val_eval_loss"]) for row in group_rows]
|
| 730 |
+
train_losses = [float(row["train_eval_loss"]) for row in group_rows]
|
| 731 |
+
gaps = [float(row["generalization_gap"]) for row in group_rows]
|
| 732 |
+
aggregates.append(
|
| 733 |
+
{
|
| 734 |
+
"model_name": model_name,
|
| 735 |
+
"token_limit": prefix,
|
| 736 |
+
"dropout_initial": dropout,
|
| 737 |
+
"n": len(group_rows),
|
| 738 |
+
"mean_val_eval_loss": statistics.fmean(val_losses),
|
| 739 |
+
"std_val_eval_loss": statistics.stdev(val_losses)
|
| 740 |
+
if len(val_losses) > 1
|
| 741 |
+
else 0.0,
|
| 742 |
+
"mean_train_eval_loss": statistics.fmean(train_losses),
|
| 743 |
+
"std_train_eval_loss": statistics.stdev(train_losses)
|
| 744 |
+
if len(train_losses) > 1
|
| 745 |
+
else 0.0,
|
| 746 |
+
"mean_generalization_gap": statistics.fmean(gaps),
|
| 747 |
+
"std_generalization_gap": statistics.stdev(gaps)
|
| 748 |
+
if len(gaps) > 1
|
| 749 |
+
else 0.0,
|
| 750 |
+
"parameters": int(first["parameters"]),
|
| 751 |
+
"n_layer": int(first["n_layer"]),
|
| 752 |
+
"n_head": int(first["n_head"]),
|
| 753 |
+
"n_embd": int(first["n_embd"]),
|
| 754 |
+
"block_size": int(first["model_config"]["block_size"]),
|
| 755 |
+
"vocab_size": int(first["model_config"]["vocab_size"]),
|
| 756 |
+
"tokens_seen": int(first["tokens_seen"]),
|
| 757 |
+
"seeds": sorted({int(row["seed"]) for row in group_rows}),
|
| 758 |
+
}
|
| 759 |
+
)
|
| 760 |
+
|
| 761 |
+
by_model: dict[str, list[dict]] = defaultdict(list)
|
| 762 |
+
for row in aggregates:
|
| 763 |
+
by_model[row["model_name"]].append(row)
|
| 764 |
+
|
| 765 |
+
model_rows = []
|
| 766 |
+
for model_name, model_group in by_model.items():
|
| 767 |
+
first = model_group[0]
|
| 768 |
+
seeds = sorted({seed for row in model_group for seed in row["seeds"]})
|
| 769 |
+
model_rows.append(
|
| 770 |
+
{
|
| 771 |
+
"model_name": model_name,
|
| 772 |
+
"parameters": first["parameters"],
|
| 773 |
+
"n_layer": first["n_layer"],
|
| 774 |
+
"n_head": first["n_head"],
|
| 775 |
+
"n_embd": first["n_embd"],
|
| 776 |
+
"block_size": first["block_size"],
|
| 777 |
+
"vocab_size": first["vocab_size"],
|
| 778 |
+
"seeds": seeds,
|
| 779 |
+
}
|
| 780 |
+
)
|
| 781 |
|
|
|
|
| 782 |
lines = [
|
| 783 |
"# Static Dropout Screen Summary",
|
| 784 |
"",
|
| 785 |
f"Run directory: `{output_dir}`",
|
| 786 |
"",
|
| 787 |
+
"## Models",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
"",
|
| 789 |
+
"| Model | Params | Layers | Heads | Embedding | Block | Vocab | Seeds |",
|
| 790 |
+
"|---|---:|---:|---:|---:|---:|---:|---|",
|
|
|
|
|
|
|
| 791 |
]
|
| 792 |
+
for model in sorted(model_rows, key=lambda item: item["parameters"]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
lines.append(
|
| 794 |
"| "
|
| 795 |
+
f"`{model['model_name']}` | {model['parameters']:,} | "
|
| 796 |
+
f"{model['n_layer']} | {model['n_head']} | {model['n_embd']} | "
|
| 797 |
+
f"{model['block_size']} | {model['vocab_size']} | "
|
| 798 |
+
f"{', '.join(str(seed) for seed in model['seeds'])} |"
|
| 799 |
)
|
| 800 |
+
lines.extend(
|
| 801 |
+
[
|
| 802 |
+
"",
|
| 803 |
+
"## Best Dropout By Model And Prefix",
|
| 804 |
+
"",
|
| 805 |
+
"| Model | Prefix tokens | Effective epochs | Best dropout | Mean val loss | Val std | Mean train loss | Mean gap | Plateau/bracket note |",
|
| 806 |
+
"|---|---:|---:|---:|---:|---:|---:|---:|---|",
|
| 807 |
+
]
|
| 808 |
+
)
|
| 809 |
+
for model_name, model_group in sorted(by_model.items()):
|
| 810 |
+
by_prefix: dict[int, list[dict]] = defaultdict(list)
|
| 811 |
+
for row in model_group:
|
| 812 |
+
by_prefix[int(row["token_limit"])].append(row)
|
| 813 |
+
for prefix, prefix_rows in sorted(by_prefix.items()):
|
| 814 |
+
best = min(prefix_rows, key=lambda row: row["mean_val_eval_loss"])
|
| 815 |
+
rates = [float(row["dropout_initial"]) for row in prefix_rows]
|
| 816 |
+
eff_epochs = float(best["tokens_seen"]) / prefix
|
| 817 |
+
if best["dropout_initial"] == max(rates):
|
| 818 |
+
note = "not bracketed; best at top of tested grid"
|
| 819 |
+
elif best["dropout_initial"] == min(rates):
|
| 820 |
+
note = "not bracketed; best at bottom of tested grid"
|
| 821 |
+
else:
|
| 822 |
+
note = "bracketed by tested grid"
|
| 823 |
+
lines.append(
|
| 824 |
+
"| "
|
| 825 |
+
f"`{model_name}` | {prefix:,} | {eff_epochs:.2f} | "
|
| 826 |
+
f"{best['dropout_initial']:.2f} | "
|
| 827 |
+
f"{best['mean_val_eval_loss']:.4f} | "
|
| 828 |
+
f"{best['std_val_eval_loss']:.4f} | "
|
| 829 |
+
f"{best['mean_train_eval_loss']:.4f} | "
|
| 830 |
+
f"{best['mean_generalization_gap']:.4f} | {note} |"
|
| 831 |
+
)
|
| 832 |
|
| 833 |
+
for model_name, model_group in sorted(by_model.items()):
|
| 834 |
+
by_prefix = defaultdict(list)
|
| 835 |
+
for row in model_group:
|
| 836 |
+
by_prefix[int(row["token_limit"])].append(row)
|
| 837 |
lines.extend(
|
| 838 |
[
|
| 839 |
"",
|
| 840 |
+
f"## Model `{model_name}`",
|
|
|
|
|
|
|
|
|
|
| 841 |
]
|
| 842 |
)
|
| 843 |
+
for prefix, prefix_rows in sorted(by_prefix.items()):
|
| 844 |
+
eff_epochs = float(prefix_rows[0]["tokens_seen"]) / prefix
|
| 845 |
+
lines.extend(
|
| 846 |
+
[
|
| 847 |
+
"",
|
| 848 |
+
f"### Prefix {prefix:,} Tokens ({eff_epochs:.2f} Effective Epochs)",
|
| 849 |
+
"",
|
| 850 |
+
"| Dropout | N | Mean val loss | Val std | Mean train loss | Train std | Mean gap | Gap std | Sampled tokens | Params |",
|
| 851 |
+
"|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
|
| 852 |
+
]
|
| 853 |
)
|
| 854 |
+
for row in sorted(prefix_rows, key=lambda item: item["dropout_initial"]):
|
| 855 |
+
lines.append(
|
| 856 |
+
"| "
|
| 857 |
+
f"{row['dropout_initial']:.2f} | {row['n']} | "
|
| 858 |
+
f"{row['mean_val_eval_loss']:.4f} | "
|
| 859 |
+
f"{row['std_val_eval_loss']:.4f} | "
|
| 860 |
+
f"{row['mean_train_eval_loss']:.4f} | "
|
| 861 |
+
f"{row['std_train_eval_loss']:.4f} | "
|
| 862 |
+
f"{row['mean_generalization_gap']:.4f} | "
|
| 863 |
+
f"{row['std_generalization_gap']:.4f} | "
|
| 864 |
+
f"{int(row['tokens_seen']):,} | {int(row['parameters']):,} |"
|
| 865 |
+
)
|
| 866 |
|
| 867 |
output = "\n".join(lines) + "\n"
|
| 868 |
(output_dir / "RESULT_SUMMARY.md").write_text(output, encoding="utf-8")
|
|
|
|
| 871 |
(docs_path / "screen_static_results.md").write_text(output, encoding="utf-8")
|
| 872 |
|
| 873 |
|
| 874 |
+
def svg_escape(value: object) -> str:
    """Return ``str(value)`` with XML-special characters escaped.

    Replaces ``&``, ``<``, ``>`` and ``"`` with their predefined XML entities
    so the result can be embedded in SVG text content or in a double-quoted
    attribute value.
    """
    return (
        str(value)
        # Ampersand must be handled first; otherwise the "&" introduced by
        # the later entity replacements would itself be escaped again.
        .replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )
|
| 882 |
+
|
| 883 |
+
|
| 884 |
+
def write_dropout_curve_svg(output_dir: Path, summary: list[dict]) -> None:
    """Render validation loss against dropout as a grid of small SVG panels.

    Only summary rows whose ``run_mode`` is ``"screen_static"`` and whose
    ``condition_kind`` is ``"static"`` are plotted. The grid has one row per
    model (ordered by parameter count) and one column per ``token_limit``.
    Each panel uses its own y-range, and the point with the lowest
    ``mean_val_eval_loss`` is drawn larger and filled. The x-axis spans
    dropout 0 to 0.9.

    Args:
        output_dir: Directory that receives ``dropout_curves.svg``.
        summary: Aggregated result rows (dicts) to plot.

    Returns:
        None. Nothing is written when no row passes the filter.
    """
    static_rows = [
        entry
        for entry in summary
        if entry["run_mode"] == "screen_static" and entry["condition_kind"] == "static"
    ]
    if not static_rows:
        return

    # One pass collects the per-panel curves, the parameter count per model
    # and the set of distinct token limits.
    curves_by_panel: dict[tuple[str, int], list[dict]] = defaultdict(list)
    params_by_model: dict[str, int] = {}
    token_limits: set[int] = set()
    for entry in static_rows:
        name = entry["model_name"]
        limit = int(entry["token_limit"])
        curves_by_panel[(name, limit)].append(entry)
        params_by_model[name] = int(entry["parameters"])
        token_limits.add(limit)

    models = sorted(params_by_model, key=params_by_model.get)
    prefixes = sorted(token_limits)

    # Layout constants, in SVG user units.
    cell_w, cell_h = 230, 170
    left_margin = 58
    chart_w, chart_h = 142, 94
    gap_x, gap_y = 18, 38
    width = left_margin + len(prefixes) * cell_w + gap_x
    height = 70 + len(models) * (cell_h + gap_y)
    palette = ["#1f77b4", "#d62728", "#2ca02c", "#9467bd", "#ff7f0e"]

    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        "<style>",
        "text{font-family:Arial,Helvetica,sans-serif;fill:#111827}",
        ".small{font-size:10px}.label{font-size:11px}.title{font-size:15px;font-weight:700}",
        ".axis{stroke:#374151;stroke-width:1}.grid{stroke:#e5e7eb;stroke-width:1}.line{fill:none;stroke-width:2}",
        "</style>",
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        '<text x="24" y="28" class="title">Static dropout law: validation loss vs dropout</text>',
        '<text x="24" y="48" class="label">Each panel uses its own y-scale. Points are one-seed means unless N > 1.</text>',
    ]

    # Column headers, centred over each plot area.
    svg.extend(
        f'<text x="{left_margin + col * cell_w + chart_w / 2:.1f}" y="70" text-anchor="middle" class="label">{limit:,} prefix tokens</text>'
        for col, limit in enumerate(prefixes)
    )

    for row_idx, name in enumerate(models):
        top = 92 + row_idx * (cell_h + gap_y)
        bottom = top + chart_h
        label_y = top + 48
        stroke = palette[row_idx % len(palette)]
        # Row label: model name and parameter count, rotated to read upward.
        svg.append(
            f'<text x="24" y="{label_y}" class="label" transform="rotate(-90 24 {label_y})">'
            f'{svg_escape(name)} ({params_by_model[name] / 1_000_000:.1f}M)</text>'
        )

        for col, limit in enumerate(prefixes):
            series = sorted(
                curves_by_panel.get((name, limit), []),
                key=lambda entry: float(entry["dropout_initial"]),
            )
            if not series:
                continue

            left_edge = left_margin + col * cell_w
            right_edge = left_edge + chart_w

            losses = [float(entry["mean_val_eval_loss"]) for entry in series]
            lowest, highest = min(losses), max(losses)
            # The minimum padding keeps the y-range non-zero for flat curves.
            pad = max(0.02, (highest - lowest) * 0.08)
            y_min, y_max = lowest - pad, highest + pad
            winner = min(series, key=lambda entry: float(entry["mean_val_eval_loss"]))

            # Pixel position of every point; reused by the polyline and markers.
            coords = [
                (
                    left_edge + (float(entry["dropout_initial"]) / 0.9) * chart_w,
                    top + chart_h - ((loss - y_min) / (y_max - y_min)) * chart_h,
                )
                for entry, loss in zip(series, losses)
            ]

            svg.extend(
                [
                    f'<line x1="{left_edge:.1f}" y1="{top:.1f}" x2="{left_edge:.1f}" y2="{bottom:.1f}" class="axis"/>',
                    f'<line x1="{left_edge:.1f}" y1="{bottom:.1f}" x2="{right_edge:.1f}" y2="{bottom:.1f}" class="axis"/>',
                    f'<line x1="{left_edge:.1f}" y1="{top:.1f}" x2="{right_edge:.1f}" y2="{top:.1f}" class="grid"/>',
                    f'<text x="{left_edge:.1f}" y="{top - 6:.1f}" class="small">{y_max:.2f}</text>',
                    f'<text x="{left_edge:.1f}" y="{bottom + 13:.1f}" class="small">{y_min:.2f}</text>',
                    f'<text x="{left_edge:.1f}" y="{bottom + 28:.1f}" class="small">0</text>',
                    f'<text x="{right_edge:.1f}" y="{bottom + 28:.1f}" text-anchor="end" class="small">0.9</text>',
                ]
            )

            polyline = " ".join(f"{cx:.1f},{cy:.1f}" for cx, cy in coords)
            svg.append(f'<polyline points="{polyline}" class="line" stroke="{stroke}"/>')

            for entry, (cx, cy) in zip(series, coords):
                if entry is winner:
                    radius, fill = 4, "#111827"
                else:
                    radius, fill = 2.7, "#ffffff"
                svg.append(
                    f'<circle cx="{cx:.1f}" cy="{cy:.1f}" r="{radius}" fill="{fill}" stroke="{stroke}" stroke-width="1.5"/>'
                )

            # Annotation to the right of the plot naming the best dropout.
            svg.append(
                f'<text x="{right_edge + 8:.1f}" y="{top + 14:.1f}" class="small">'
                f'best p={float(winner["dropout_initial"]):.2f}</text>'
            )
            svg.append(
                f'<text x="{right_edge + 8:.1f}" y="{top + 28:.1f}" class="small">'
                f'loss={float(winner["mean_val_eval_loss"]):.3f}</text>'
            )

    svg.append("</svg>")
    (output_dir / "dropout_curves.svg").write_text("\n".join(svg), encoding="utf-8")
|
| 992 |
+
|
| 993 |
+
|
| 994 |
def static_conditions(dropout_rates: list[float]) -> list[DropoutCondition]:
|
| 995 |
return [
|
| 996 |
DropoutCondition(
|
|
|
|
| 1020 |
rows: list[dict] = []
|
| 1021 |
completed_keys = completed_keys or set()
|
| 1022 |
conditions = static_conditions(sorted(set(args.dropout_rates)))
|
| 1023 |
+
planned = 0
|
| 1024 |
+
for token_limit in token_limits:
|
| 1025 |
+
for model_spec in model_specs:
|
| 1026 |
+
for condition in conditions:
|
| 1027 |
+
for seed in seeds:
|
| 1028 |
+
key = planned_metric_key(
|
| 1029 |
+
mode=args.mode,
|
| 1030 |
+
condition=condition,
|
| 1031 |
+
model_spec=model_spec,
|
| 1032 |
+
seed=seed,
|
| 1033 |
+
token_limit=token_limit,
|
| 1034 |
+
)
|
| 1035 |
+
if key not in completed_keys:
|
| 1036 |
+
planned += 1
|
| 1037 |
+
progress = ProgressMeter(planned)
|
| 1038 |
for token_limit in token_limits:
|
| 1039 |
for model_spec in model_specs:
|
| 1040 |
best_val_loss = float("inf")
|
|
|
|
| 1089 |
rows.append(row)
|
| 1090 |
condition_rows.append(row)
|
| 1091 |
completed_keys.add(metric_key(row))
|
| 1092 |
+
progress.mark_done(row)
|
| 1093 |
del model, optimizer
|
| 1094 |
torch.mps.empty_cache()
|
| 1095 |
|
|
|
|
| 1340 |
)
|
| 1341 |
write_csv(output_dir / "model_selection.csv", selection, SELECTION_FIELDS)
|
| 1342 |
write_screen_markdown_summary(output_dir, rows)
|
| 1343 |
+
write_dropout_curve_svg(output_dir, summary)
|
| 1344 |
|
| 1345 |
print(
|
| 1346 |
json.dumps(
|