Upload stage_3_mtetahwr/meta_001123.json with huggingface_hub
Browse files
stage_3_mtetahwr/meta_001123.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"step": 1123,
|
| 3 |
"val_bpb": 2.2258310665885443,
|
| 4 |
"model_config": {
|
| 5 |
-
"run": "rva_6x256/
|
| 6 |
"wandb_group": "rva_6x256",
|
| 7 |
"seed": 42,
|
| 8 |
"device_type": "cuda",
|
|
@@ -62,7 +62,7 @@
|
|
| 62 |
"warmup_ratio": 0.05,
|
| 63 |
"warmdown_ratio": 0.4,
|
| 64 |
"final_lr_frac": 0.0,
|
| 65 |
-
"resume_from_step":
|
| 66 |
"eval_every": 250,
|
| 67 |
"eval_tokens": 10485760,
|
| 68 |
"core_metric_every": -1,
|
|
@@ -74,14 +74,15 @@
|
|
| 74 |
"profile_step": 2,
|
| 75 |
"profile_micro_step": 0,
|
| 76 |
"memory_history_max_entries": 10000,
|
| 77 |
-
"model_tag": "rva_6x256/
|
|
|
|
| 78 |
"n_layer": 6,
|
| 79 |
"n_head": 16,
|
| 80 |
"n_kv_head": 16,
|
| 81 |
"n_embd": 256
|
| 82 |
},
|
| 83 |
"user_config": {
|
| 84 |
-
"run": "rva_6x256/
|
| 85 |
"wandb_group": "rva_6x256",
|
| 86 |
"seed": 42,
|
| 87 |
"device_type": "cuda",
|
|
@@ -141,7 +142,7 @@
|
|
| 141 |
"warmup_ratio": 0.05,
|
| 142 |
"warmdown_ratio": 0.4,
|
| 143 |
"final_lr_frac": 0.0,
|
| 144 |
-
"resume_from_step":
|
| 145 |
"eval_every": 250,
|
| 146 |
"eval_tokens": 10485760,
|
| 147 |
"core_metric_every": -1,
|
|
@@ -153,13 +154,15 @@
|
|
| 153 |
"profile_step": 2,
|
| 154 |
"profile_micro_step": 0,
|
| 155 |
"memory_history_max_entries": 10000,
|
| 156 |
-
"model_tag": "rva_6x256/
|
|
|
|
| 157 |
},
|
|
|
|
| 158 |
"device_batch_size": 8,
|
| 159 |
"sequence_len": 1024,
|
| 160 |
"dataloader_state_dict": {
|
| 161 |
"pq_idx": 2,
|
| 162 |
-
"rg_idx":
|
| 163 |
},
|
| 164 |
"loop_state": {
|
| 165 |
"min_val_bpb": 2.2258310665885443,
|
|
|
|
| 2 |
"step": 1123,
|
| 3 |
"val_bpb": 2.2258310665885443,
|
| 4 |
"model_config": {
|
| 5 |
+
"run": "rva_6x256/stage_3_mtetahwr",
|
| 6 |
"wandb_group": "rva_6x256",
|
| 7 |
"seed": 42,
|
| 8 |
"device_type": "cuda",
|
|
|
|
| 62 |
"warmup_ratio": 0.05,
|
| 63 |
"warmdown_ratio": 0.4,
|
| 64 |
"final_lr_frac": 0.0,
|
| 65 |
+
"resume_from_step": 1123,
|
| 66 |
"eval_every": 250,
|
| 67 |
"eval_tokens": 10485760,
|
| 68 |
"core_metric_every": -1,
|
|
|
|
| 74 |
"profile_step": 2,
|
| 75 |
"profile_micro_step": 0,
|
| 76 |
"memory_history_max_entries": 10000,
|
| 77 |
+
"model_tag": "rva_6x256/stage_3_mtetahwr",
|
| 78 |
+
"stage": 3,
|
| 79 |
"n_layer": 6,
|
| 80 |
"n_head": 16,
|
| 81 |
"n_kv_head": 16,
|
| 82 |
"n_embd": 256
|
| 83 |
},
|
| 84 |
"user_config": {
|
| 85 |
+
"run": "rva_6x256/stage_3_mtetahwr",
|
| 86 |
"wandb_group": "rva_6x256",
|
| 87 |
"seed": 42,
|
| 88 |
"device_type": "cuda",
|
|
|
|
| 142 |
"warmup_ratio": 0.05,
|
| 143 |
"warmdown_ratio": 0.4,
|
| 144 |
"final_lr_frac": 0.0,
|
| 145 |
+
"resume_from_step": 1123,
|
| 146 |
"eval_every": 250,
|
| 147 |
"eval_tokens": 10485760,
|
| 148 |
"core_metric_every": -1,
|
|
|
|
| 154 |
"profile_step": 2,
|
| 155 |
"profile_micro_step": 0,
|
| 156 |
"memory_history_max_entries": 10000,
|
| 157 |
+
"model_tag": "rva_6x256/stage_3_mtetahwr",
|
| 158 |
+
"stage": 3
|
| 159 |
},
|
| 160 |
+
"stage": 3,
|
| 161 |
"device_batch_size": 8,
|
| 162 |
"sequence_len": 1024,
|
| 163 |
"dataloader_state_dict": {
|
| 164 |
"pq_idx": 2,
|
| 165 |
+
"rg_idx": 20
|
| 166 |
},
|
| 167 |
"loop_state": {
|
| 168 |
"min_val_bpb": 2.2258310665885443,
|