Upload folder using huggingface_hub
Browse files- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr0.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr100.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr125.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr150.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr175.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr200.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr25.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr50.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr75.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_trainer_state.json +742 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr0.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr100.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr125.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr150.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr175.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr200.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr25.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr50.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr75.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_trainer_state.json +742 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr0.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr100.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr125.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr150.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr175.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr200.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr25.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr50.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr75.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_trainer_state.json +742 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr0.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr100.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr125.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr150.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr175.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr200.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr25.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr50.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr75.pth +3 -0
- client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_trainer_state.json +742 -0
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9c87175d6670ea0a7e93a81d8836225b3c726b3daa4fbfe87fefa157cea616a
|
| 3 |
+
size 389170122
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b0102e813ed9d28a49d25fcc64b847bda863108e19f3ff5681263b527d6a413
|
| 3 |
+
size 389172166
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d8efd7b883c051ce014f750e26cb5cb90a1e5fb3319b48953a83b49f23c5546
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr125.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1eaa483fa6f46352a8b7d5f44c5c792f501ea238c1ce4fd396993be54dd2ef58
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr150.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcb63eb11795c0a75df6ba716dec5a301d613230b740101ba1422ad03a8d3455
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr175.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:128b4087fc1df81576986c9f476fa1d82ff7268f03b0fe5cc6e193a5079c8f6b
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:72c214c649182fabc10bd7248537f81c5df845d5b4cb6592ba47f932f0576bc3
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr25.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd3b132d03a92166634fa1e31f278cb279365653994dcb731d5e955b870c8272
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr50.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bf804f790082b1e20fc3ee8d7fe51388c1fb419aee382d3e8052b939525aaf0
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr75.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce18d753cf3cd6d1fc131bfbdf829243456103ba84069d3fb80747cac5beb3af
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_trainer_state.json
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 1.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 201,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.009950248756218905,
|
| 13 |
+
"grad_norm": 1.316588044166565,
|
| 14 |
+
"learning_rate": 1e-05,
|
| 15 |
+
"loss": 1.7881,
|
| 16 |
+
"step": 2
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"epoch": 0.01990049751243781,
|
| 20 |
+
"grad_norm": 1.1037206649780273,
|
| 21 |
+
"learning_rate": 1e-05,
|
| 22 |
+
"loss": 1.7363,
|
| 23 |
+
"step": 4
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"epoch": 0.029850746268656716,
|
| 27 |
+
"grad_norm": 0.7940536141395569,
|
| 28 |
+
"learning_rate": 1e-05,
|
| 29 |
+
"loss": 1.707,
|
| 30 |
+
"step": 6
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.03980099502487562,
|
| 34 |
+
"grad_norm": 0.8492249250411987,
|
| 35 |
+
"learning_rate": 1e-05,
|
| 36 |
+
"loss": 1.6855,
|
| 37 |
+
"step": 8
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.04975124378109453,
|
| 41 |
+
"grad_norm": 1.1886231899261475,
|
| 42 |
+
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 1.7168,
|
| 44 |
+
"step": 10
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 0.05970149253731343,
|
| 48 |
+
"grad_norm": 1.4966411590576172,
|
| 49 |
+
"learning_rate": 1e-05,
|
| 50 |
+
"loss": 2.1992,
|
| 51 |
+
"step": 12
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 0.06965174129353234,
|
| 55 |
+
"grad_norm": 1.4271999597549438,
|
| 56 |
+
"learning_rate": 1e-05,
|
| 57 |
+
"loss": 1.7031,
|
| 58 |
+
"step": 14
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.07960199004975124,
|
| 62 |
+
"grad_norm": 1.644882321357727,
|
| 63 |
+
"learning_rate": 1e-05,
|
| 64 |
+
"loss": 2.1426,
|
| 65 |
+
"step": 16
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 0.08955223880597014,
|
| 69 |
+
"grad_norm": 1.7688827514648438,
|
| 70 |
+
"learning_rate": 1e-05,
|
| 71 |
+
"loss": 2.1768,
|
| 72 |
+
"step": 18
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 0.09950248756218906,
|
| 76 |
+
"grad_norm": 0.8430641889572144,
|
| 77 |
+
"learning_rate": 1e-05,
|
| 78 |
+
"loss": 1.9033,
|
| 79 |
+
"step": 20
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.10945273631840796,
|
| 83 |
+
"grad_norm": 1.0195722579956055,
|
| 84 |
+
"learning_rate": 1e-05,
|
| 85 |
+
"loss": 1.7842,
|
| 86 |
+
"step": 22
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"epoch": 0.11940298507462686,
|
| 90 |
+
"grad_norm": 0.7112452387809753,
|
| 91 |
+
"learning_rate": 1e-05,
|
| 92 |
+
"loss": 1.8604,
|
| 93 |
+
"step": 24
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 0.12935323383084577,
|
| 97 |
+
"grad_norm": 1.6654636859893799,
|
| 98 |
+
"learning_rate": 1e-05,
|
| 99 |
+
"loss": 1.5967,
|
| 100 |
+
"step": 26
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.13930348258706468,
|
| 104 |
+
"grad_norm": 0.5782097578048706,
|
| 105 |
+
"learning_rate": 1e-05,
|
| 106 |
+
"loss": 1.7471,
|
| 107 |
+
"step": 28
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 0.14925373134328357,
|
| 111 |
+
"grad_norm": 1.0965440273284912,
|
| 112 |
+
"learning_rate": 1e-05,
|
| 113 |
+
"loss": 1.6221,
|
| 114 |
+
"step": 30
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.15920398009950248,
|
| 118 |
+
"grad_norm": 1.0025074481964111,
|
| 119 |
+
"learning_rate": 1e-05,
|
| 120 |
+
"loss": 1.9658,
|
| 121 |
+
"step": 32
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 0.1691542288557214,
|
| 125 |
+
"grad_norm": 1.0948214530944824,
|
| 126 |
+
"learning_rate": 1e-05,
|
| 127 |
+
"loss": 2.0059,
|
| 128 |
+
"step": 34
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 0.1791044776119403,
|
| 132 |
+
"grad_norm": 1.1663291454315186,
|
| 133 |
+
"learning_rate": 1e-05,
|
| 134 |
+
"loss": 1.8125,
|
| 135 |
+
"step": 36
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 0.1890547263681592,
|
| 139 |
+
"grad_norm": 0.6280285120010376,
|
| 140 |
+
"learning_rate": 1e-05,
|
| 141 |
+
"loss": 1.833,
|
| 142 |
+
"step": 38
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 0.19900497512437812,
|
| 146 |
+
"grad_norm": 0.7364129424095154,
|
| 147 |
+
"learning_rate": 1e-05,
|
| 148 |
+
"loss": 1.6533,
|
| 149 |
+
"step": 40
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"epoch": 0.208955223880597,
|
| 153 |
+
"grad_norm": 1.1327072381973267,
|
| 154 |
+
"learning_rate": 1e-05,
|
| 155 |
+
"loss": 1.9336,
|
| 156 |
+
"step": 42
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"epoch": 0.21890547263681592,
|
| 160 |
+
"grad_norm": 0.7770842909812927,
|
| 161 |
+
"learning_rate": 1e-05,
|
| 162 |
+
"loss": 1.7471,
|
| 163 |
+
"step": 44
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 0.22885572139303484,
|
| 167 |
+
"grad_norm": 0.7920796871185303,
|
| 168 |
+
"learning_rate": 1e-05,
|
| 169 |
+
"loss": 1.9375,
|
| 170 |
+
"step": 46
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.23880597014925373,
|
| 174 |
+
"grad_norm": 0.8180975914001465,
|
| 175 |
+
"learning_rate": 1e-05,
|
| 176 |
+
"loss": 1.999,
|
| 177 |
+
"step": 48
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 0.24875621890547264,
|
| 181 |
+
"grad_norm": 0.9668822884559631,
|
| 182 |
+
"learning_rate": 1e-05,
|
| 183 |
+
"loss": 1.8721,
|
| 184 |
+
"step": 50
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"epoch": 0.25870646766169153,
|
| 188 |
+
"grad_norm": 0.6620003581047058,
|
| 189 |
+
"learning_rate": 1e-05,
|
| 190 |
+
"loss": 1.6611,
|
| 191 |
+
"step": 52
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"epoch": 0.26865671641791045,
|
| 195 |
+
"grad_norm": 1.0094668865203857,
|
| 196 |
+
"learning_rate": 1e-05,
|
| 197 |
+
"loss": 2.0352,
|
| 198 |
+
"step": 54
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"epoch": 0.27860696517412936,
|
| 202 |
+
"grad_norm": 0.8333507776260376,
|
| 203 |
+
"learning_rate": 1e-05,
|
| 204 |
+
"loss": 1.9297,
|
| 205 |
+
"step": 56
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"epoch": 0.2885572139303483,
|
| 209 |
+
"grad_norm": 0.6568053364753723,
|
| 210 |
+
"learning_rate": 1e-05,
|
| 211 |
+
"loss": 1.8447,
|
| 212 |
+
"step": 58
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"epoch": 0.29850746268656714,
|
| 216 |
+
"grad_norm": 1.129006028175354,
|
| 217 |
+
"learning_rate": 1e-05,
|
| 218 |
+
"loss": 1.8936,
|
| 219 |
+
"step": 60
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"epoch": 0.30845771144278605,
|
| 223 |
+
"grad_norm": 0.7393130660057068,
|
| 224 |
+
"learning_rate": 1e-05,
|
| 225 |
+
"loss": 1.9336,
|
| 226 |
+
"step": 62
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.31840796019900497,
|
| 230 |
+
"grad_norm": 0.4612615704536438,
|
| 231 |
+
"learning_rate": 1e-05,
|
| 232 |
+
"loss": 1.9365,
|
| 233 |
+
"step": 64
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"epoch": 0.3283582089552239,
|
| 237 |
+
"grad_norm": 0.6561993360519409,
|
| 238 |
+
"learning_rate": 1e-05,
|
| 239 |
+
"loss": 1.8389,
|
| 240 |
+
"step": 66
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"epoch": 0.3383084577114428,
|
| 244 |
+
"grad_norm": 1.0325121879577637,
|
| 245 |
+
"learning_rate": 1e-05,
|
| 246 |
+
"loss": 1.8486,
|
| 247 |
+
"step": 68
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"epoch": 0.3482587064676617,
|
| 251 |
+
"grad_norm": 0.7401711344718933,
|
| 252 |
+
"learning_rate": 1e-05,
|
| 253 |
+
"loss": 1.8662,
|
| 254 |
+
"step": 70
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"epoch": 0.3582089552238806,
|
| 258 |
+
"grad_norm": 0.6198751330375671,
|
| 259 |
+
"learning_rate": 1e-05,
|
| 260 |
+
"loss": 1.8506,
|
| 261 |
+
"step": 72
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"epoch": 0.3681592039800995,
|
| 265 |
+
"grad_norm": 0.6299334764480591,
|
| 266 |
+
"learning_rate": 1e-05,
|
| 267 |
+
"loss": 1.8555,
|
| 268 |
+
"step": 74
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"epoch": 0.3781094527363184,
|
| 272 |
+
"grad_norm": 0.8257051706314087,
|
| 273 |
+
"learning_rate": 1e-05,
|
| 274 |
+
"loss": 1.7344,
|
| 275 |
+
"step": 76
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"epoch": 0.3880597014925373,
|
| 279 |
+
"grad_norm": 0.8762025237083435,
|
| 280 |
+
"learning_rate": 1e-05,
|
| 281 |
+
"loss": 1.7891,
|
| 282 |
+
"step": 78
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.39800995024875624,
|
| 286 |
+
"grad_norm": 1.2744340896606445,
|
| 287 |
+
"learning_rate": 1e-05,
|
| 288 |
+
"loss": 1.9102,
|
| 289 |
+
"step": 80
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"epoch": 0.4079601990049751,
|
| 293 |
+
"grad_norm": 0.5431731939315796,
|
| 294 |
+
"learning_rate": 1e-05,
|
| 295 |
+
"loss": 1.7705,
|
| 296 |
+
"step": 82
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"epoch": 0.417910447761194,
|
| 300 |
+
"grad_norm": 0.8810946345329285,
|
| 301 |
+
"learning_rate": 1e-05,
|
| 302 |
+
"loss": 1.6855,
|
| 303 |
+
"step": 84
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"epoch": 0.42786069651741293,
|
| 307 |
+
"grad_norm": 0.8568848967552185,
|
| 308 |
+
"learning_rate": 1e-05,
|
| 309 |
+
"loss": 1.7959,
|
| 310 |
+
"step": 86
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"epoch": 0.43781094527363185,
|
| 314 |
+
"grad_norm": 0.9605632424354553,
|
| 315 |
+
"learning_rate": 1e-05,
|
| 316 |
+
"loss": 1.873,
|
| 317 |
+
"step": 88
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"epoch": 0.44776119402985076,
|
| 321 |
+
"grad_norm": 0.512973964214325,
|
| 322 |
+
"learning_rate": 1e-05,
|
| 323 |
+
"loss": 1.7891,
|
| 324 |
+
"step": 90
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"epoch": 0.4577114427860697,
|
| 328 |
+
"grad_norm": 0.723425567150116,
|
| 329 |
+
"learning_rate": 1e-05,
|
| 330 |
+
"loss": 1.877,
|
| 331 |
+
"step": 92
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"epoch": 0.46766169154228854,
|
| 335 |
+
"grad_norm": 0.5228793025016785,
|
| 336 |
+
"learning_rate": 1e-05,
|
| 337 |
+
"loss": 1.999,
|
| 338 |
+
"step": 94
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.47761194029850745,
|
| 342 |
+
"grad_norm": 0.7799379825592041,
|
| 343 |
+
"learning_rate": 1e-05,
|
| 344 |
+
"loss": 1.751,
|
| 345 |
+
"step": 96
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"epoch": 0.48756218905472637,
|
| 349 |
+
"grad_norm": 1.0080820322036743,
|
| 350 |
+
"learning_rate": 1e-05,
|
| 351 |
+
"loss": 1.877,
|
| 352 |
+
"step": 98
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"epoch": 0.4975124378109453,
|
| 356 |
+
"grad_norm": 0.9821782112121582,
|
| 357 |
+
"learning_rate": 1e-05,
|
| 358 |
+
"loss": 1.8867,
|
| 359 |
+
"step": 100
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"epoch": 0.5074626865671642,
|
| 363 |
+
"grad_norm": 0.5222265720367432,
|
| 364 |
+
"learning_rate": 1e-05,
|
| 365 |
+
"loss": 1.793,
|
| 366 |
+
"step": 102
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"epoch": 0.5174129353233831,
|
| 370 |
+
"grad_norm": 0.5731136798858643,
|
| 371 |
+
"learning_rate": 1e-05,
|
| 372 |
+
"loss": 1.915,
|
| 373 |
+
"step": 104
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"epoch": 0.527363184079602,
|
| 377 |
+
"grad_norm": 0.6745629906654358,
|
| 378 |
+
"learning_rate": 1e-05,
|
| 379 |
+
"loss": 1.7998,
|
| 380 |
+
"step": 106
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"epoch": 0.5373134328358209,
|
| 384 |
+
"grad_norm": 0.7346249222755432,
|
| 385 |
+
"learning_rate": 1e-05,
|
| 386 |
+
"loss": 1.7988,
|
| 387 |
+
"step": 108
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"epoch": 0.5472636815920398,
|
| 391 |
+
"grad_norm": 0.6089544892311096,
|
| 392 |
+
"learning_rate": 1e-05,
|
| 393 |
+
"loss": 1.7949,
|
| 394 |
+
"step": 110
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.5572139303482587,
|
| 398 |
+
"grad_norm": 0.9230899214744568,
|
| 399 |
+
"learning_rate": 1e-05,
|
| 400 |
+
"loss": 1.9463,
|
| 401 |
+
"step": 112
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"epoch": 0.5671641791044776,
|
| 405 |
+
"grad_norm": 0.8394888639450073,
|
| 406 |
+
"learning_rate": 1e-05,
|
| 407 |
+
"loss": 1.9131,
|
| 408 |
+
"step": 114
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"epoch": 0.5771144278606966,
|
| 412 |
+
"grad_norm": 0.603209376335144,
|
| 413 |
+
"learning_rate": 1e-05,
|
| 414 |
+
"loss": 1.8389,
|
| 415 |
+
"step": 116
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"epoch": 0.5870646766169154,
|
| 419 |
+
"grad_norm": 0.6753935813903809,
|
| 420 |
+
"learning_rate": 1e-05,
|
| 421 |
+
"loss": 1.8379,
|
| 422 |
+
"step": 118
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"epoch": 0.5970149253731343,
|
| 426 |
+
"grad_norm": 0.7781857252120972,
|
| 427 |
+
"learning_rate": 1e-05,
|
| 428 |
+
"loss": 1.8662,
|
| 429 |
+
"step": 120
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"epoch": 0.6069651741293532,
|
| 433 |
+
"grad_norm": 0.6543675661087036,
|
| 434 |
+
"learning_rate": 1e-05,
|
| 435 |
+
"loss": 1.8711,
|
| 436 |
+
"step": 122
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"epoch": 0.6169154228855721,
|
| 440 |
+
"grad_norm": 0.7465837001800537,
|
| 441 |
+
"learning_rate": 1e-05,
|
| 442 |
+
"loss": 1.8457,
|
| 443 |
+
"step": 124
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"epoch": 0.6268656716417911,
|
| 447 |
+
"grad_norm": 0.6059397459030151,
|
| 448 |
+
"learning_rate": 1e-05,
|
| 449 |
+
"loss": 1.8184,
|
| 450 |
+
"step": 126
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.6368159203980099,
|
| 454 |
+
"grad_norm": 0.6485504508018494,
|
| 455 |
+
"learning_rate": 1e-05,
|
| 456 |
+
"loss": 1.877,
|
| 457 |
+
"step": 128
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"epoch": 0.6467661691542289,
|
| 461 |
+
"grad_norm": 0.6433750987052917,
|
| 462 |
+
"learning_rate": 1e-05,
|
| 463 |
+
"loss": 1.7803,
|
| 464 |
+
"step": 130
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"epoch": 0.6567164179104478,
|
| 468 |
+
"grad_norm": 0.6054277420043945,
|
| 469 |
+
"learning_rate": 1e-05,
|
| 470 |
+
"loss": 1.8145,
|
| 471 |
+
"step": 132
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"epoch": 0.6666666666666666,
|
| 475 |
+
"grad_norm": 0.9794463515281677,
|
| 476 |
+
"learning_rate": 1e-05,
|
| 477 |
+
"loss": 1.6592,
|
| 478 |
+
"step": 134
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"epoch": 0.6766169154228856,
|
| 482 |
+
"grad_norm": 1.128212332725525,
|
| 483 |
+
"learning_rate": 1e-05,
|
| 484 |
+
"loss": 2.042,
|
| 485 |
+
"step": 136
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"epoch": 0.6865671641791045,
|
| 489 |
+
"grad_norm": 0.848319947719574,
|
| 490 |
+
"learning_rate": 1e-05,
|
| 491 |
+
"loss": 1.9219,
|
| 492 |
+
"step": 138
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"epoch": 0.6965174129353234,
|
| 496 |
+
"grad_norm": 1.446349859237671,
|
| 497 |
+
"learning_rate": 1e-05,
|
| 498 |
+
"loss": 1.9805,
|
| 499 |
+
"step": 140
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"epoch": 0.7064676616915423,
|
| 503 |
+
"grad_norm": 0.8592532277107239,
|
| 504 |
+
"learning_rate": 1e-05,
|
| 505 |
+
"loss": 1.9404,
|
| 506 |
+
"step": 142
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.7164179104477612,
|
| 510 |
+
"grad_norm": 0.7463251948356628,
|
| 511 |
+
"learning_rate": 1e-05,
|
| 512 |
+
"loss": 2.0098,
|
| 513 |
+
"step": 144
|
| 514 |
+
},
|
| 515 |
+
{
|
| 516 |
+
"epoch": 0.7263681592039801,
|
| 517 |
+
"grad_norm": 0.6972345113754272,
|
| 518 |
+
"learning_rate": 1e-05,
|
| 519 |
+
"loss": 1.9043,
|
| 520 |
+
"step": 146
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"epoch": 0.736318407960199,
|
| 524 |
+
"grad_norm": 1.0360370874404907,
|
| 525 |
+
"learning_rate": 1e-05,
|
| 526 |
+
"loss": 1.918,
|
| 527 |
+
"step": 148
|
| 528 |
+
},
|
| 529 |
+
{
|
| 530 |
+
"epoch": 0.746268656716418,
|
| 531 |
+
"grad_norm": 0.7613181471824646,
|
| 532 |
+
"learning_rate": 1e-05,
|
| 533 |
+
"loss": 1.8232,
|
| 534 |
+
"step": 150
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"epoch": 0.7562189054726368,
|
| 538 |
+
"grad_norm": 0.8578123450279236,
|
| 539 |
+
"learning_rate": 1e-05,
|
| 540 |
+
"loss": 1.9209,
|
| 541 |
+
"step": 152
|
| 542 |
+
},
|
| 543 |
+
{
|
| 544 |
+
"epoch": 0.7661691542288557,
|
| 545 |
+
"grad_norm": 0.6234486103057861,
|
| 546 |
+
"learning_rate": 1e-05,
|
| 547 |
+
"loss": 1.8105,
|
| 548 |
+
"step": 154
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"epoch": 0.7761194029850746,
|
| 552 |
+
"grad_norm": 0.8788239359855652,
|
| 553 |
+
"learning_rate": 1e-05,
|
| 554 |
+
"loss": 1.7852,
|
| 555 |
+
"step": 156
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"epoch": 0.7860696517412935,
|
| 559 |
+
"grad_norm": 0.5887688994407654,
|
| 560 |
+
"learning_rate": 1e-05,
|
| 561 |
+
"loss": 1.835,
|
| 562 |
+
"step": 158
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.7960199004975125,
|
| 566 |
+
"grad_norm": 0.5808454155921936,
|
| 567 |
+
"learning_rate": 1e-05,
|
| 568 |
+
"loss": 1.8691,
|
| 569 |
+
"step": 160
|
| 570 |
+
},
|
| 571 |
+
{
|
| 572 |
+
"epoch": 0.8059701492537313,
|
| 573 |
+
"grad_norm": 0.8322702050209045,
|
| 574 |
+
"learning_rate": 1e-05,
|
| 575 |
+
"loss": 1.8652,
|
| 576 |
+
"step": 162
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"epoch": 0.8159203980099502,
|
| 580 |
+
"grad_norm": 0.6851075291633606,
|
| 581 |
+
"learning_rate": 1e-05,
|
| 582 |
+
"loss": 2.043,
|
| 583 |
+
"step": 164
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"epoch": 0.8258706467661692,
|
| 587 |
+
"grad_norm": 0.6591010093688965,
|
| 588 |
+
"learning_rate": 1e-05,
|
| 589 |
+
"loss": 1.8418,
|
| 590 |
+
"step": 166
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"epoch": 0.835820895522388,
|
| 594 |
+
"grad_norm": 0.9328513145446777,
|
| 595 |
+
"learning_rate": 1e-05,
|
| 596 |
+
"loss": 1.8467,
|
| 597 |
+
"step": 168
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"epoch": 0.845771144278607,
|
| 601 |
+
"grad_norm": 0.7491399049758911,
|
| 602 |
+
"learning_rate": 1e-05,
|
| 603 |
+
"loss": 1.7734,
|
| 604 |
+
"step": 170
|
| 605 |
+
},
|
| 606 |
+
{
|
| 607 |
+
"epoch": 0.8557213930348259,
|
| 608 |
+
"grad_norm": 0.6368930339813232,
|
| 609 |
+
"learning_rate": 1e-05,
|
| 610 |
+
"loss": 1.8408,
|
| 611 |
+
"step": 172
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"epoch": 0.8656716417910447,
|
| 615 |
+
"grad_norm": 0.6846456527709961,
|
| 616 |
+
"learning_rate": 1e-05,
|
| 617 |
+
"loss": 1.9053,
|
| 618 |
+
"step": 174
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.8756218905472637,
|
| 622 |
+
"grad_norm": 0.5860757231712341,
|
| 623 |
+
"learning_rate": 1e-05,
|
| 624 |
+
"loss": 1.8438,
|
| 625 |
+
"step": 176
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"epoch": 0.8855721393034826,
|
| 629 |
+
"grad_norm": 0.6338534355163574,
|
| 630 |
+
"learning_rate": 1e-05,
|
| 631 |
+
"loss": 1.8496,
|
| 632 |
+
"step": 178
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"epoch": 0.8955223880597015,
|
| 636 |
+
"grad_norm": 0.5710776448249817,
|
| 637 |
+
"learning_rate": 1e-05,
|
| 638 |
+
"loss": 1.7627,
|
| 639 |
+
"step": 180
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"epoch": 0.9054726368159204,
|
| 643 |
+
"grad_norm": 0.7385186553001404,
|
| 644 |
+
"learning_rate": 1e-05,
|
| 645 |
+
"loss": 1.791,
|
| 646 |
+
"step": 182
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"epoch": 0.9154228855721394,
|
| 650 |
+
"grad_norm": 0.5550143122673035,
|
| 651 |
+
"learning_rate": 1e-05,
|
| 652 |
+
"loss": 1.7607,
|
| 653 |
+
"step": 184
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"epoch": 0.9253731343283582,
|
| 657 |
+
"grad_norm": 0.6846106648445129,
|
| 658 |
+
"learning_rate": 1e-05,
|
| 659 |
+
"loss": 1.8447,
|
| 660 |
+
"step": 186
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"epoch": 0.9353233830845771,
|
| 664 |
+
"grad_norm": 0.43355798721313477,
|
| 665 |
+
"learning_rate": 1e-05,
|
| 666 |
+
"loss": 1.7822,
|
| 667 |
+
"step": 188
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"epoch": 0.945273631840796,
|
| 671 |
+
"grad_norm": 0.6083195209503174,
|
| 672 |
+
"learning_rate": 1e-05,
|
| 673 |
+
"loss": 1.8418,
|
| 674 |
+
"step": 190
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.9552238805970149,
|
| 678 |
+
"grad_norm": 1.3910738229751587,
|
| 679 |
+
"learning_rate": 1e-05,
|
| 680 |
+
"loss": 2.0508,
|
| 681 |
+
"step": 192
|
| 682 |
+
},
|
| 683 |
+
{
|
| 684 |
+
"epoch": 0.9651741293532339,
|
| 685 |
+
"grad_norm": 0.6805091500282288,
|
| 686 |
+
"learning_rate": 1e-05,
|
| 687 |
+
"loss": 1.8906,
|
| 688 |
+
"step": 194
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"epoch": 0.9751243781094527,
|
| 692 |
+
"grad_norm": 0.7249168753623962,
|
| 693 |
+
"learning_rate": 1e-05,
|
| 694 |
+
"loss": 1.9424,
|
| 695 |
+
"step": 196
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"epoch": 0.9850746268656716,
|
| 699 |
+
"grad_norm": 0.6910979747772217,
|
| 700 |
+
"learning_rate": 1e-05,
|
| 701 |
+
"loss": 1.8467,
|
| 702 |
+
"step": 198
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"epoch": 0.9950248756218906,
|
| 706 |
+
"grad_norm": 0.5376845598220825,
|
| 707 |
+
"learning_rate": 1e-05,
|
| 708 |
+
"loss": 1.8105,
|
| 709 |
+
"step": 200
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"epoch": 1.0,
|
| 713 |
+
"step": 201,
|
| 714 |
+
"total_flos": 4.207228707510682e+16,
|
| 715 |
+
"train_loss": 1.8549926150497513,
|
| 716 |
+
"train_runtime": 548.5019,
|
| 717 |
+
"train_samples_per_second": 1.466,
|
| 718 |
+
"train_steps_per_second": 0.366
|
| 719 |
+
}
|
| 720 |
+
],
|
| 721 |
+
"logging_steps": 2,
|
| 722 |
+
"max_steps": 201,
|
| 723 |
+
"num_input_tokens_seen": 0,
|
| 724 |
+
"num_train_epochs": 1,
|
| 725 |
+
"save_steps": 500,
|
| 726 |
+
"stateful_callbacks": {
|
| 727 |
+
"TrainerControl": {
|
| 728 |
+
"args": {
|
| 729 |
+
"should_epoch_stop": false,
|
| 730 |
+
"should_evaluate": false,
|
| 731 |
+
"should_log": false,
|
| 732 |
+
"should_save": false,
|
| 733 |
+
"should_training_stop": false
|
| 734 |
+
},
|
| 735 |
+
"attributes": {}
|
| 736 |
+
}
|
| 737 |
+
},
|
| 738 |
+
"total_flos": 4.207228707510682e+16,
|
| 739 |
+
"train_batch_size": 1,
|
| 740 |
+
"trial_name": null,
|
| 741 |
+
"trial_params": null
|
| 742 |
+
}
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:001f4000001fb736aff464f30f0d782601e31161ce407d88322d15b792880c1a
|
| 3 |
+
size 389170122
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e8b0d276fa4d098986bf6074ca63416ea79b7ec5a916f8e4a01940fc76660ea
|
| 3 |
+
size 389172166
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d437b04572abbed85cb39c65fabeb508959096c75388d5bef11b19e797566c0e
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr125.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8a4da4cc37ea14cc4b5aee2b93783d9e7237d59b56cb34e8efac177e0efb89b
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr150.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10b27d2f9cf8fe79ff703abcfc09de76db592720e6c8c20b3111e702fe7ac719
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr175.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:191069f1dce0ec6078bf81d2ff2e78e66a559f44093cc8003ac4893a73b8e058
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:102805372f12920f3fa2ed2d140065828e2e2d25ab3432d393c954c4ce1d6c98
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr25.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d82e1b14da8bbba34bdd50adbbf25114daf82ba7984c890086c6b7e42fefba2
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr50.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4a1906c160dc30bf726a4f54af8b913ba4043c992dbc5d778a3e5f4cf346c13
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr75.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:069a89736b0f91c14b1703db6b7d7c151a1213b54437d9a99f4972fe03ffab7b
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_trainer_state.json
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 1.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 201,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.009950248756218905,
|
| 13 |
+
"grad_norm": 1.3227640390396118,
|
| 14 |
+
"learning_rate": 1e-05,
|
| 15 |
+
"loss": 1.8506,
|
| 16 |
+
"step": 2
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"epoch": 0.01990049751243781,
|
| 20 |
+
"grad_norm": 1.6141871213912964,
|
| 21 |
+
"learning_rate": 1e-05,
|
| 22 |
+
"loss": 1.8975,
|
| 23 |
+
"step": 4
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"epoch": 0.029850746268656716,
|
| 27 |
+
"grad_norm": 1.6484756469726562,
|
| 28 |
+
"learning_rate": 1e-05,
|
| 29 |
+
"loss": 1.749,
|
| 30 |
+
"step": 6
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.03980099502487562,
|
| 34 |
+
"grad_norm": 1.487459659576416,
|
| 35 |
+
"learning_rate": 1e-05,
|
| 36 |
+
"loss": 1.6318,
|
| 37 |
+
"step": 8
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.04975124378109453,
|
| 41 |
+
"grad_norm": 1.5136044025421143,
|
| 42 |
+
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 1.5537,
|
| 44 |
+
"step": 10
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 0.05970149253731343,
|
| 48 |
+
"grad_norm": 1.2955031394958496,
|
| 49 |
+
"learning_rate": 1e-05,
|
| 50 |
+
"loss": 1.623,
|
| 51 |
+
"step": 12
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 0.06965174129353234,
|
| 55 |
+
"grad_norm": 1.6998140811920166,
|
| 56 |
+
"learning_rate": 1e-05,
|
| 57 |
+
"loss": 2.0244,
|
| 58 |
+
"step": 14
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.07960199004975124,
|
| 62 |
+
"grad_norm": 2.039724111557007,
|
| 63 |
+
"learning_rate": 1e-05,
|
| 64 |
+
"loss": 1.3848,
|
| 65 |
+
"step": 16
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 0.08955223880597014,
|
| 69 |
+
"grad_norm": 3.201810598373413,
|
| 70 |
+
"learning_rate": 1e-05,
|
| 71 |
+
"loss": 1.9473,
|
| 72 |
+
"step": 18
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 0.09950248756218906,
|
| 76 |
+
"grad_norm": 2.301619291305542,
|
| 77 |
+
"learning_rate": 1e-05,
|
| 78 |
+
"loss": 0.9546,
|
| 79 |
+
"step": 20
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.10945273631840796,
|
| 83 |
+
"grad_norm": 3.1304359436035156,
|
| 84 |
+
"learning_rate": 1e-05,
|
| 85 |
+
"loss": 1.5273,
|
| 86 |
+
"step": 22
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"epoch": 0.11940298507462686,
|
| 90 |
+
"grad_norm": 1.8952662944793701,
|
| 91 |
+
"learning_rate": 1e-05,
|
| 92 |
+
"loss": 0.742,
|
| 93 |
+
"step": 24
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 0.12935323383084577,
|
| 97 |
+
"grad_norm": 4.763426780700684,
|
| 98 |
+
"learning_rate": 1e-05,
|
| 99 |
+
"loss": 1.4138,
|
| 100 |
+
"step": 26
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.13930348258706468,
|
| 104 |
+
"grad_norm": 3.3053810596466064,
|
| 105 |
+
"learning_rate": 1e-05,
|
| 106 |
+
"loss": 1.009,
|
| 107 |
+
"step": 28
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 0.14925373134328357,
|
| 111 |
+
"grad_norm": 3.5452332496643066,
|
| 112 |
+
"learning_rate": 1e-05,
|
| 113 |
+
"loss": 1.4029,
|
| 114 |
+
"step": 30
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.15920398009950248,
|
| 118 |
+
"grad_norm": 3.621952533721924,
|
| 119 |
+
"learning_rate": 1e-05,
|
| 120 |
+
"loss": 0.972,
|
| 121 |
+
"step": 32
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 0.1691542288557214,
|
| 125 |
+
"grad_norm": 3.8620715141296387,
|
| 126 |
+
"learning_rate": 1e-05,
|
| 127 |
+
"loss": 1.1833,
|
| 128 |
+
"step": 34
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 0.1791044776119403,
|
| 132 |
+
"grad_norm": 5.8020195960998535,
|
| 133 |
+
"learning_rate": 1e-05,
|
| 134 |
+
"loss": 1.7115,
|
| 135 |
+
"step": 36
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 0.1890547263681592,
|
| 139 |
+
"grad_norm": 3.4086718559265137,
|
| 140 |
+
"learning_rate": 1e-05,
|
| 141 |
+
"loss": 1.1666,
|
| 142 |
+
"step": 38
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 0.19900497512437812,
|
| 146 |
+
"grad_norm": 2.975222587585449,
|
| 147 |
+
"learning_rate": 1e-05,
|
| 148 |
+
"loss": 0.7056,
|
| 149 |
+
"step": 40
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"epoch": 0.208955223880597,
|
| 153 |
+
"grad_norm": 3.689805507659912,
|
| 154 |
+
"learning_rate": 1e-05,
|
| 155 |
+
"loss": 1.5175,
|
| 156 |
+
"step": 42
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"epoch": 0.21890547263681592,
|
| 160 |
+
"grad_norm": 5.3562912940979,
|
| 161 |
+
"learning_rate": 1e-05,
|
| 162 |
+
"loss": 0.6652,
|
| 163 |
+
"step": 44
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 0.22885572139303484,
|
| 167 |
+
"grad_norm": 4.174887657165527,
|
| 168 |
+
"learning_rate": 1e-05,
|
| 169 |
+
"loss": 0.4799,
|
| 170 |
+
"step": 46
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.23880597014925373,
|
| 174 |
+
"grad_norm": 3.432663679122925,
|
| 175 |
+
"learning_rate": 1e-05,
|
| 176 |
+
"loss": 0.7211,
|
| 177 |
+
"step": 48
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 0.24875621890547264,
|
| 181 |
+
"grad_norm": 4.918137073516846,
|
| 182 |
+
"learning_rate": 1e-05,
|
| 183 |
+
"loss": 0.8524,
|
| 184 |
+
"step": 50
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"epoch": 0.25870646766169153,
|
| 188 |
+
"grad_norm": 1.390620470046997,
|
| 189 |
+
"learning_rate": 1e-05,
|
| 190 |
+
"loss": 0.1488,
|
| 191 |
+
"step": 52
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"epoch": 0.26865671641791045,
|
| 195 |
+
"grad_norm": 4.325483322143555,
|
| 196 |
+
"learning_rate": 1e-05,
|
| 197 |
+
"loss": 0.9623,
|
| 198 |
+
"step": 54
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"epoch": 0.27860696517412936,
|
| 202 |
+
"grad_norm": 1.1009166240692139,
|
| 203 |
+
"learning_rate": 1e-05,
|
| 204 |
+
"loss": 0.3015,
|
| 205 |
+
"step": 56
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"epoch": 0.2885572139303483,
|
| 209 |
+
"grad_norm": 5.028674125671387,
|
| 210 |
+
"learning_rate": 1e-05,
|
| 211 |
+
"loss": 0.8217,
|
| 212 |
+
"step": 58
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"epoch": 0.29850746268656714,
|
| 216 |
+
"grad_norm": 6.246382713317871,
|
| 217 |
+
"learning_rate": 1e-05,
|
| 218 |
+
"loss": 0.5652,
|
| 219 |
+
"step": 60
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"epoch": 0.30845771144278605,
|
| 223 |
+
"grad_norm": 3.5103182792663574,
|
| 224 |
+
"learning_rate": 1e-05,
|
| 225 |
+
"loss": 0.4114,
|
| 226 |
+
"step": 62
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.31840796019900497,
|
| 230 |
+
"grad_norm": 5.664974689483643,
|
| 231 |
+
"learning_rate": 1e-05,
|
| 232 |
+
"loss": 0.918,
|
| 233 |
+
"step": 64
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"epoch": 0.3283582089552239,
|
| 237 |
+
"grad_norm": 10.550684928894043,
|
| 238 |
+
"learning_rate": 1e-05,
|
| 239 |
+
"loss": 0.8192,
|
| 240 |
+
"step": 66
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"epoch": 0.3383084577114428,
|
| 244 |
+
"grad_norm": 0.7836717367172241,
|
| 245 |
+
"learning_rate": 1e-05,
|
| 246 |
+
"loss": 0.3381,
|
| 247 |
+
"step": 68
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"epoch": 0.3482587064676617,
|
| 251 |
+
"grad_norm": 1.888235330581665,
|
| 252 |
+
"learning_rate": 1e-05,
|
| 253 |
+
"loss": 0.1491,
|
| 254 |
+
"step": 70
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"epoch": 0.3582089552238806,
|
| 258 |
+
"grad_norm": 7.468411445617676,
|
| 259 |
+
"learning_rate": 1e-05,
|
| 260 |
+
"loss": 0.9326,
|
| 261 |
+
"step": 72
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"epoch": 0.3681592039800995,
|
| 265 |
+
"grad_norm": 8.028440475463867,
|
| 266 |
+
"learning_rate": 1e-05,
|
| 267 |
+
"loss": 1.3224,
|
| 268 |
+
"step": 74
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"epoch": 0.3781094527363184,
|
| 272 |
+
"grad_norm": 10.142037391662598,
|
| 273 |
+
"learning_rate": 1e-05,
|
| 274 |
+
"loss": 0.6093,
|
| 275 |
+
"step": 76
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"epoch": 0.3880597014925373,
|
| 279 |
+
"grad_norm": 4.81419563293457,
|
| 280 |
+
"learning_rate": 1e-05,
|
| 281 |
+
"loss": 0.4134,
|
| 282 |
+
"step": 78
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.39800995024875624,
|
| 286 |
+
"grad_norm": 7.888396739959717,
|
| 287 |
+
"learning_rate": 1e-05,
|
| 288 |
+
"loss": 1.6637,
|
| 289 |
+
"step": 80
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"epoch": 0.4079601990049751,
|
| 293 |
+
"grad_norm": 1.8005106449127197,
|
| 294 |
+
"learning_rate": 1e-05,
|
| 295 |
+
"loss": 1.7748,
|
| 296 |
+
"step": 82
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"epoch": 0.417910447761194,
|
| 300 |
+
"grad_norm": 0.9087793827056885,
|
| 301 |
+
"learning_rate": 1e-05,
|
| 302 |
+
"loss": 0.3785,
|
| 303 |
+
"step": 84
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"epoch": 0.42786069651741293,
|
| 307 |
+
"grad_norm": 4.730865955352783,
|
| 308 |
+
"learning_rate": 1e-05,
|
| 309 |
+
"loss": 0.5779,
|
| 310 |
+
"step": 86
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"epoch": 0.43781094527363185,
|
| 314 |
+
"grad_norm": 8.102535247802734,
|
| 315 |
+
"learning_rate": 1e-05,
|
| 316 |
+
"loss": 0.6772,
|
| 317 |
+
"step": 88
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"epoch": 0.44776119402985076,
|
| 321 |
+
"grad_norm": 6.577178001403809,
|
| 322 |
+
"learning_rate": 1e-05,
|
| 323 |
+
"loss": 1.6124,
|
| 324 |
+
"step": 90
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"epoch": 0.4577114427860697,
|
| 328 |
+
"grad_norm": 1.7844473123550415,
|
| 329 |
+
"learning_rate": 1e-05,
|
| 330 |
+
"loss": 0.4683,
|
| 331 |
+
"step": 92
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"epoch": 0.46766169154228854,
|
| 335 |
+
"grad_norm": 5.1499247550964355,
|
| 336 |
+
"learning_rate": 1e-05,
|
| 337 |
+
"loss": 0.8164,
|
| 338 |
+
"step": 94
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.47761194029850745,
|
| 342 |
+
"grad_norm": 3.6172220706939697,
|
| 343 |
+
"learning_rate": 1e-05,
|
| 344 |
+
"loss": 0.9205,
|
| 345 |
+
"step": 96
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"epoch": 0.48756218905472637,
|
| 349 |
+
"grad_norm": 1.3999346494674683,
|
| 350 |
+
"learning_rate": 1e-05,
|
| 351 |
+
"loss": 0.4349,
|
| 352 |
+
"step": 98
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"epoch": 0.4975124378109453,
|
| 356 |
+
"grad_norm": 4.574583530426025,
|
| 357 |
+
"learning_rate": 1e-05,
|
| 358 |
+
"loss": 0.5055,
|
| 359 |
+
"step": 100
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"epoch": 0.5074626865671642,
|
| 363 |
+
"grad_norm": 3.0469908714294434,
|
| 364 |
+
"learning_rate": 1e-05,
|
| 365 |
+
"loss": 0.6799,
|
| 366 |
+
"step": 102
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"epoch": 0.5174129353233831,
|
| 370 |
+
"grad_norm": 1.137192726135254,
|
| 371 |
+
"learning_rate": 1e-05,
|
| 372 |
+
"loss": 0.2079,
|
| 373 |
+
"step": 104
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"epoch": 0.527363184079602,
|
| 377 |
+
"grad_norm": 8.398505210876465,
|
| 378 |
+
"learning_rate": 1e-05,
|
| 379 |
+
"loss": 0.8173,
|
| 380 |
+
"step": 106
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"epoch": 0.5373134328358209,
|
| 384 |
+
"grad_norm": 4.197858810424805,
|
| 385 |
+
"learning_rate": 1e-05,
|
| 386 |
+
"loss": 0.811,
|
| 387 |
+
"step": 108
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"epoch": 0.5472636815920398,
|
| 391 |
+
"grad_norm": 3.6865429878234863,
|
| 392 |
+
"learning_rate": 1e-05,
|
| 393 |
+
"loss": 0.9762,
|
| 394 |
+
"step": 110
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.5572139303482587,
|
| 398 |
+
"grad_norm": 2.2864203453063965,
|
| 399 |
+
"learning_rate": 1e-05,
|
| 400 |
+
"loss": 0.2215,
|
| 401 |
+
"step": 112
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"epoch": 0.5671641791044776,
|
| 405 |
+
"grad_norm": 0.3491150438785553,
|
| 406 |
+
"learning_rate": 1e-05,
|
| 407 |
+
"loss": 0.1091,
|
| 408 |
+
"step": 114
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"epoch": 0.5771144278606966,
|
| 412 |
+
"grad_norm": 0.8744693398475647,
|
| 413 |
+
"learning_rate": 1e-05,
|
| 414 |
+
"loss": 0.1872,
|
| 415 |
+
"step": 116
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"epoch": 0.5870646766169154,
|
| 419 |
+
"grad_norm": 7.648612976074219,
|
| 420 |
+
"learning_rate": 1e-05,
|
| 421 |
+
"loss": 0.4138,
|
| 422 |
+
"step": 118
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"epoch": 0.5970149253731343,
|
| 426 |
+
"grad_norm": 4.401617050170898,
|
| 427 |
+
"learning_rate": 1e-05,
|
| 428 |
+
"loss": 0.988,
|
| 429 |
+
"step": 120
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"epoch": 0.6069651741293532,
|
| 433 |
+
"grad_norm": 5.498955726623535,
|
| 434 |
+
"learning_rate": 1e-05,
|
| 435 |
+
"loss": 0.2035,
|
| 436 |
+
"step": 122
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"epoch": 0.6169154228855721,
|
| 440 |
+
"grad_norm": 7.8499250411987305,
|
| 441 |
+
"learning_rate": 1e-05,
|
| 442 |
+
"loss": 1.3216,
|
| 443 |
+
"step": 124
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"epoch": 0.6268656716417911,
|
| 447 |
+
"grad_norm": 4.023660182952881,
|
| 448 |
+
"learning_rate": 1e-05,
|
| 449 |
+
"loss": 0.8133,
|
| 450 |
+
"step": 126
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.6368159203980099,
|
| 454 |
+
"grad_norm": 1.412724494934082,
|
| 455 |
+
"learning_rate": 1e-05,
|
| 456 |
+
"loss": 0.3464,
|
| 457 |
+
"step": 128
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"epoch": 0.6467661691542289,
|
| 461 |
+
"grad_norm": 5.523179531097412,
|
| 462 |
+
"learning_rate": 1e-05,
|
| 463 |
+
"loss": 0.6978,
|
| 464 |
+
"step": 130
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"epoch": 0.6567164179104478,
|
| 468 |
+
"grad_norm": 13.196066856384277,
|
| 469 |
+
"learning_rate": 1e-05,
|
| 470 |
+
"loss": 1.3471,
|
| 471 |
+
"step": 132
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"epoch": 0.6666666666666666,
|
| 475 |
+
"grad_norm": 5.424158096313477,
|
| 476 |
+
"learning_rate": 1e-05,
|
| 477 |
+
"loss": 0.6814,
|
| 478 |
+
"step": 134
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"epoch": 0.6766169154228856,
|
| 482 |
+
"grad_norm": 1.4407273530960083,
|
| 483 |
+
"learning_rate": 1e-05,
|
| 484 |
+
"loss": 0.1066,
|
| 485 |
+
"step": 136
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"epoch": 0.6865671641791045,
|
| 489 |
+
"grad_norm": 6.258295059204102,
|
| 490 |
+
"learning_rate": 1e-05,
|
| 491 |
+
"loss": 0.9792,
|
| 492 |
+
"step": 138
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"epoch": 0.6965174129353234,
|
| 496 |
+
"grad_norm": 4.438701152801514,
|
| 497 |
+
"learning_rate": 1e-05,
|
| 498 |
+
"loss": 0.406,
|
| 499 |
+
"step": 140
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"epoch": 0.7064676616915423,
|
| 503 |
+
"grad_norm": 3.083000659942627,
|
| 504 |
+
"learning_rate": 1e-05,
|
| 505 |
+
"loss": 0.915,
|
| 506 |
+
"step": 142
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.7164179104477612,
|
| 510 |
+
"grad_norm": 0.6187798976898193,
|
| 511 |
+
"learning_rate": 1e-05,
|
| 512 |
+
"loss": 0.0508,
|
| 513 |
+
"step": 144
|
| 514 |
+
},
|
| 515 |
+
{
|
| 516 |
+
"epoch": 0.7263681592039801,
|
| 517 |
+
"grad_norm": 0.6553718447685242,
|
| 518 |
+
"learning_rate": 1e-05,
|
| 519 |
+
"loss": 0.114,
|
| 520 |
+
"step": 146
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"epoch": 0.736318407960199,
|
| 524 |
+
"grad_norm": 0.5623739957809448,
|
| 525 |
+
"learning_rate": 1e-05,
|
| 526 |
+
"loss": 0.134,
|
| 527 |
+
"step": 148
|
| 528 |
+
},
|
| 529 |
+
{
|
| 530 |
+
"epoch": 0.746268656716418,
|
| 531 |
+
"grad_norm": 9.245420455932617,
|
| 532 |
+
"learning_rate": 1e-05,
|
| 533 |
+
"loss": 1.0717,
|
| 534 |
+
"step": 150
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"epoch": 0.7562189054726368,
|
| 538 |
+
"grad_norm": 3.2727997303009033,
|
| 539 |
+
"learning_rate": 1e-05,
|
| 540 |
+
"loss": 0.3121,
|
| 541 |
+
"step": 152
|
| 542 |
+
},
|
| 543 |
+
{
|
| 544 |
+
"epoch": 0.7661691542288557,
|
| 545 |
+
"grad_norm": 2.23881459236145,
|
| 546 |
+
"learning_rate": 1e-05,
|
| 547 |
+
"loss": 0.3009,
|
| 548 |
+
"step": 154
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"epoch": 0.7761194029850746,
|
| 552 |
+
"grad_norm": 3.4759159088134766,
|
| 553 |
+
"learning_rate": 1e-05,
|
| 554 |
+
"loss": 0.3648,
|
| 555 |
+
"step": 156
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"epoch": 0.7860696517412935,
|
| 559 |
+
"grad_norm": 3.8757474422454834,
|
| 560 |
+
"learning_rate": 1e-05,
|
| 561 |
+
"loss": 1.1435,
|
| 562 |
+
"step": 158
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.7960199004975125,
|
| 566 |
+
"grad_norm": 2.606724262237549,
|
| 567 |
+
"learning_rate": 1e-05,
|
| 568 |
+
"loss": 0.227,
|
| 569 |
+
"step": 160
|
| 570 |
+
},
|
| 571 |
+
{
|
| 572 |
+
"epoch": 0.8059701492537313,
|
| 573 |
+
"grad_norm": 4.037679672241211,
|
| 574 |
+
"learning_rate": 1e-05,
|
| 575 |
+
"loss": 0.3358,
|
| 576 |
+
"step": 162
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"epoch": 0.8159203980099502,
|
| 580 |
+
"grad_norm": 5.446840286254883,
|
| 581 |
+
"learning_rate": 1e-05,
|
| 582 |
+
"loss": 0.2191,
|
| 583 |
+
"step": 164
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"epoch": 0.8258706467661692,
|
| 587 |
+
"grad_norm": 5.227675437927246,
|
| 588 |
+
"learning_rate": 1e-05,
|
| 589 |
+
"loss": 0.2977,
|
| 590 |
+
"step": 166
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"epoch": 0.835820895522388,
|
| 594 |
+
"grad_norm": 3.955387592315674,
|
| 595 |
+
"learning_rate": 1e-05,
|
| 596 |
+
"loss": 0.2853,
|
| 597 |
+
"step": 168
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"epoch": 0.845771144278607,
|
| 601 |
+
"grad_norm": 3.391467332839966,
|
| 602 |
+
"learning_rate": 1e-05,
|
| 603 |
+
"loss": 1.1612,
|
| 604 |
+
"step": 170
|
| 605 |
+
},
|
| 606 |
+
{
|
| 607 |
+
"epoch": 0.8557213930348259,
|
| 608 |
+
"grad_norm": 3.6372454166412354,
|
| 609 |
+
"learning_rate": 1e-05,
|
| 610 |
+
"loss": 0.2642,
|
| 611 |
+
"step": 172
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"epoch": 0.8656716417910447,
|
| 615 |
+
"grad_norm": 6.628920078277588,
|
| 616 |
+
"learning_rate": 1e-05,
|
| 617 |
+
"loss": 0.7231,
|
| 618 |
+
"step": 174
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.8756218905472637,
|
| 622 |
+
"grad_norm": 0.42457300424575806,
|
| 623 |
+
"learning_rate": 1e-05,
|
| 624 |
+
"loss": 0.0519,
|
| 625 |
+
"step": 176
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"epoch": 0.8855721393034826,
|
| 629 |
+
"grad_norm": 2.6521382331848145,
|
| 630 |
+
"learning_rate": 1e-05,
|
| 631 |
+
"loss": 0.2382,
|
| 632 |
+
"step": 178
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"epoch": 0.8955223880597015,
|
| 636 |
+
"grad_norm": 0.1870870143175125,
|
| 637 |
+
"learning_rate": 1e-05,
|
| 638 |
+
"loss": 0.6789,
|
| 639 |
+
"step": 180
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"epoch": 0.9054726368159204,
|
| 643 |
+
"grad_norm": 0.5534329414367676,
|
| 644 |
+
"learning_rate": 1e-05,
|
| 645 |
+
"loss": 0.0671,
|
| 646 |
+
"step": 182
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"epoch": 0.9154228855721394,
|
| 650 |
+
"grad_norm": 3.863987922668457,
|
| 651 |
+
"learning_rate": 1e-05,
|
| 652 |
+
"loss": 0.273,
|
| 653 |
+
"step": 184
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"epoch": 0.9253731343283582,
|
| 657 |
+
"grad_norm": 0.2802110016345978,
|
| 658 |
+
"learning_rate": 1e-05,
|
| 659 |
+
"loss": 0.0354,
|
| 660 |
+
"step": 186
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"epoch": 0.9353233830845771,
|
| 664 |
+
"grad_norm": 0.616949200630188,
|
| 665 |
+
"learning_rate": 1e-05,
|
| 666 |
+
"loss": 0.446,
|
| 667 |
+
"step": 188
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"epoch": 0.945273631840796,
|
| 671 |
+
"grad_norm": 1.3538764715194702,
|
| 672 |
+
"learning_rate": 1e-05,
|
| 673 |
+
"loss": 0.1169,
|
| 674 |
+
"step": 190
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.9552238805970149,
|
| 678 |
+
"grad_norm": 7.0314836502075195,
|
| 679 |
+
"learning_rate": 1e-05,
|
| 680 |
+
"loss": 1.3759,
|
| 681 |
+
"step": 192
|
| 682 |
+
},
|
| 683 |
+
{
|
| 684 |
+
"epoch": 0.9651741293532339,
|
| 685 |
+
"grad_norm": 5.94874906539917,
|
| 686 |
+
"learning_rate": 1e-05,
|
| 687 |
+
"loss": 0.3079,
|
| 688 |
+
"step": 194
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"epoch": 0.9751243781094527,
|
| 692 |
+
"grad_norm": 1.551829218864441,
|
| 693 |
+
"learning_rate": 1e-05,
|
| 694 |
+
"loss": 0.1686,
|
| 695 |
+
"step": 196
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"epoch": 0.9850746268656716,
|
| 699 |
+
"grad_norm": 4.5909647941589355,
|
| 700 |
+
"learning_rate": 1e-05,
|
| 701 |
+
"loss": 0.5421,
|
| 702 |
+
"step": 198
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"epoch": 0.9950248756218906,
|
| 706 |
+
"grad_norm": 9.215164184570312,
|
| 707 |
+
"learning_rate": 1e-05,
|
| 708 |
+
"loss": 1.2799,
|
| 709 |
+
"step": 200
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"epoch": 1.0,
|
| 713 |
+
"step": 201,
|
| 714 |
+
"total_flos": 7.039446888428339e+16,
|
| 715 |
+
"train_loss": 0.7667810383127697,
|
| 716 |
+
"train_runtime": 927.2228,
|
| 717 |
+
"train_samples_per_second": 0.867,
|
| 718 |
+
"train_steps_per_second": 0.217
|
| 719 |
+
}
|
| 720 |
+
],
|
| 721 |
+
"logging_steps": 2,
|
| 722 |
+
"max_steps": 201,
|
| 723 |
+
"num_input_tokens_seen": 0,
|
| 724 |
+
"num_train_epochs": 1,
|
| 725 |
+
"save_steps": 500,
|
| 726 |
+
"stateful_callbacks": {
|
| 727 |
+
"TrainerControl": {
|
| 728 |
+
"args": {
|
| 729 |
+
"should_epoch_stop": false,
|
| 730 |
+
"should_evaluate": false,
|
| 731 |
+
"should_log": false,
|
| 732 |
+
"should_save": false,
|
| 733 |
+
"should_training_stop": false
|
| 734 |
+
},
|
| 735 |
+
"attributes": {}
|
| 736 |
+
}
|
| 737 |
+
},
|
| 738 |
+
"total_flos": 7.039446888428339e+16,
|
| 739 |
+
"train_batch_size": 1,
|
| 740 |
+
"trial_name": null,
|
| 741 |
+
"trial_params": null
|
| 742 |
+
}
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15df646d9de2827fcb2d37743cc4afd5897a456ad3002625680efb7fa8968c28
|
| 3 |
+
size 389170122
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cfb3b8f77215f71c2c566a8e4a38358dba929d768726552fb07421f2b738dca
|
| 3 |
+
size 389172166
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0149b7f0cf6227341143067492664f94dd64ed1cb7b7a4e43c374acd8e70b13a
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr125.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6481de33461bc2f9b1d094560da45eab8e4df65fb58f5f7f6b69d331133ff23e
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr150.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60f721ef60ea29c44173fee1abab36bf5985328a8f3e6c320bedd9aa00c2ff10
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr175.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1f0974504d6f6686847abaf4785397372caac3bec0839b689e6b0e185c3e4bc
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53102b5eda8437b7d9bd9a3adf847ab340de54b86f0be83c930450508841c5dc
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr25.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60245923cf2fbdb65974d542218733dcec2b57ac6d6ad769d6dd21604bee7851
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr50.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6567e7c366e1050b54f9c259bea308d7550e4a1d4fb3a7a28a92697669d48d8
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr75.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:823bb7bf8c020345951abefe505a2fe9a7eb24005059c7425b27e933652c1f9c
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_trainer_state.json
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 1.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 201,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.009950248756218905,
|
| 13 |
+
"grad_norm": 1.650244116783142,
|
| 14 |
+
"learning_rate": 1e-05,
|
| 15 |
+
"loss": 0.6243,
|
| 16 |
+
"step": 2
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"epoch": 0.01990049751243781,
|
| 20 |
+
"grad_norm": 8.55079460144043,
|
| 21 |
+
"learning_rate": 1e-05,
|
| 22 |
+
"loss": 1.9013,
|
| 23 |
+
"step": 4
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"epoch": 0.029850746268656716,
|
| 27 |
+
"grad_norm": 2.6233367919921875,
|
| 28 |
+
"learning_rate": 1e-05,
|
| 29 |
+
"loss": 0.8224,
|
| 30 |
+
"step": 6
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.03980099502487562,
|
| 34 |
+
"grad_norm": 2.576277732849121,
|
| 35 |
+
"learning_rate": 1e-05,
|
| 36 |
+
"loss": 0.9551,
|
| 37 |
+
"step": 8
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.04975124378109453,
|
| 41 |
+
"grad_norm": 2.1575682163238525,
|
| 42 |
+
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 0.8281,
|
| 44 |
+
"step": 10
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 0.05970149253731343,
|
| 48 |
+
"grad_norm": 2.4942679405212402,
|
| 49 |
+
"learning_rate": 1e-05,
|
| 50 |
+
"loss": 1.2534,
|
| 51 |
+
"step": 12
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 0.06965174129353234,
|
| 55 |
+
"grad_norm": 2.6151747703552246,
|
| 56 |
+
"learning_rate": 1e-05,
|
| 57 |
+
"loss": 1.4263,
|
| 58 |
+
"step": 14
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.07960199004975124,
|
| 62 |
+
"grad_norm": 2.6396424770355225,
|
| 63 |
+
"learning_rate": 1e-05,
|
| 64 |
+
"loss": 1.1638,
|
| 65 |
+
"step": 16
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 0.08955223880597014,
|
| 69 |
+
"grad_norm": 1.374642014503479,
|
| 70 |
+
"learning_rate": 1e-05,
|
| 71 |
+
"loss": 0.7122,
|
| 72 |
+
"step": 18
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 0.09950248756218906,
|
| 76 |
+
"grad_norm": 1.2737079858779907,
|
| 77 |
+
"learning_rate": 1e-05,
|
| 78 |
+
"loss": 0.927,
|
| 79 |
+
"step": 20
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.10945273631840796,
|
| 83 |
+
"grad_norm": 2.1679718494415283,
|
| 84 |
+
"learning_rate": 1e-05,
|
| 85 |
+
"loss": 1.0146,
|
| 86 |
+
"step": 22
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"epoch": 0.11940298507462686,
|
| 90 |
+
"grad_norm": 1.9755343198776245,
|
| 91 |
+
"learning_rate": 1e-05,
|
| 92 |
+
"loss": 1.0562,
|
| 93 |
+
"step": 24
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 0.12935323383084577,
|
| 97 |
+
"grad_norm": 1.8360259532928467,
|
| 98 |
+
"learning_rate": 1e-05,
|
| 99 |
+
"loss": 0.9011,
|
| 100 |
+
"step": 26
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.13930348258706468,
|
| 104 |
+
"grad_norm": 1.3684884309768677,
|
| 105 |
+
"learning_rate": 1e-05,
|
| 106 |
+
"loss": 1.0317,
|
| 107 |
+
"step": 28
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 0.14925373134328357,
|
| 111 |
+
"grad_norm": 1.1041371822357178,
|
| 112 |
+
"learning_rate": 1e-05,
|
| 113 |
+
"loss": 0.7749,
|
| 114 |
+
"step": 30
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.15920398009950248,
|
| 118 |
+
"grad_norm": 1.9447084665298462,
|
| 119 |
+
"learning_rate": 1e-05,
|
| 120 |
+
"loss": 0.916,
|
| 121 |
+
"step": 32
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 0.1691542288557214,
|
| 125 |
+
"grad_norm": 1.2489606142044067,
|
| 126 |
+
"learning_rate": 1e-05,
|
| 127 |
+
"loss": 0.7324,
|
| 128 |
+
"step": 34
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 0.1791044776119403,
|
| 132 |
+
"grad_norm": 1.8743946552276611,
|
| 133 |
+
"learning_rate": 1e-05,
|
| 134 |
+
"loss": 1.1079,
|
| 135 |
+
"step": 36
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 0.1890547263681592,
|
| 139 |
+
"grad_norm": 1.102053165435791,
|
| 140 |
+
"learning_rate": 1e-05,
|
| 141 |
+
"loss": 0.9385,
|
| 142 |
+
"step": 38
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 0.19900497512437812,
|
| 146 |
+
"grad_norm": 0.8476048707962036,
|
| 147 |
+
"learning_rate": 1e-05,
|
| 148 |
+
"loss": 0.8044,
|
| 149 |
+
"step": 40
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"epoch": 0.208955223880597,
|
| 153 |
+
"grad_norm": 0.9640145301818848,
|
| 154 |
+
"learning_rate": 1e-05,
|
| 155 |
+
"loss": 0.8799,
|
| 156 |
+
"step": 42
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"epoch": 0.21890547263681592,
|
| 160 |
+
"grad_norm": 1.381293535232544,
|
| 161 |
+
"learning_rate": 1e-05,
|
| 162 |
+
"loss": 0.7881,
|
| 163 |
+
"step": 44
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 0.22885572139303484,
|
| 167 |
+
"grad_norm": 0.9105871915817261,
|
| 168 |
+
"learning_rate": 1e-05,
|
| 169 |
+
"loss": 1.0356,
|
| 170 |
+
"step": 46
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.23880597014925373,
|
| 174 |
+
"grad_norm": 2.0499324798583984,
|
| 175 |
+
"learning_rate": 1e-05,
|
| 176 |
+
"loss": 1.1104,
|
| 177 |
+
"step": 48
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 0.24875621890547264,
|
| 181 |
+
"grad_norm": 2.4867374897003174,
|
| 182 |
+
"learning_rate": 1e-05,
|
| 183 |
+
"loss": 0.9888,
|
| 184 |
+
"step": 50
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"epoch": 0.25870646766169153,
|
| 188 |
+
"grad_norm": 1.052661418914795,
|
| 189 |
+
"learning_rate": 1e-05,
|
| 190 |
+
"loss": 0.7593,
|
| 191 |
+
"step": 52
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"epoch": 0.26865671641791045,
|
| 195 |
+
"grad_norm": 0.8331828117370605,
|
| 196 |
+
"learning_rate": 1e-05,
|
| 197 |
+
"loss": 0.7891,
|
| 198 |
+
"step": 54
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"epoch": 0.27860696517412936,
|
| 202 |
+
"grad_norm": 1.3015260696411133,
|
| 203 |
+
"learning_rate": 1e-05,
|
| 204 |
+
"loss": 0.8362,
|
| 205 |
+
"step": 56
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"epoch": 0.2885572139303483,
|
| 209 |
+
"grad_norm": 1.1861402988433838,
|
| 210 |
+
"learning_rate": 1e-05,
|
| 211 |
+
"loss": 1.0667,
|
| 212 |
+
"step": 58
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"epoch": 0.29850746268656714,
|
| 216 |
+
"grad_norm": 2.5102596282958984,
|
| 217 |
+
"learning_rate": 1e-05,
|
| 218 |
+
"loss": 1.4307,
|
| 219 |
+
"step": 60
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"epoch": 0.30845771144278605,
|
| 223 |
+
"grad_norm": 1.035914659500122,
|
| 224 |
+
"learning_rate": 1e-05,
|
| 225 |
+
"loss": 0.7061,
|
| 226 |
+
"step": 62
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.31840796019900497,
|
| 230 |
+
"grad_norm": 1.268302321434021,
|
| 231 |
+
"learning_rate": 1e-05,
|
| 232 |
+
"loss": 1.2146,
|
| 233 |
+
"step": 64
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"epoch": 0.3283582089552239,
|
| 237 |
+
"grad_norm": 1.501561164855957,
|
| 238 |
+
"learning_rate": 1e-05,
|
| 239 |
+
"loss": 1.0767,
|
| 240 |
+
"step": 66
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"epoch": 0.3383084577114428,
|
| 244 |
+
"grad_norm": 0.7221049070358276,
|
| 245 |
+
"learning_rate": 1e-05,
|
| 246 |
+
"loss": 0.9985,
|
| 247 |
+
"step": 68
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"epoch": 0.3482587064676617,
|
| 251 |
+
"grad_norm": 0.9676480293273926,
|
| 252 |
+
"learning_rate": 1e-05,
|
| 253 |
+
"loss": 0.9858,
|
| 254 |
+
"step": 70
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"epoch": 0.3582089552238806,
|
| 258 |
+
"grad_norm": 0.8725219368934631,
|
| 259 |
+
"learning_rate": 1e-05,
|
| 260 |
+
"loss": 0.854,
|
| 261 |
+
"step": 72
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"epoch": 0.3681592039800995,
|
| 265 |
+
"grad_norm": 0.7807052731513977,
|
| 266 |
+
"learning_rate": 1e-05,
|
| 267 |
+
"loss": 0.8716,
|
| 268 |
+
"step": 74
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"epoch": 0.3781094527363184,
|
| 272 |
+
"grad_norm": 0.7535572052001953,
|
| 273 |
+
"learning_rate": 1e-05,
|
| 274 |
+
"loss": 0.8459,
|
| 275 |
+
"step": 76
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"epoch": 0.3880597014925373,
|
| 279 |
+
"grad_norm": 1.4078559875488281,
|
| 280 |
+
"learning_rate": 1e-05,
|
| 281 |
+
"loss": 1.105,
|
| 282 |
+
"step": 78
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.39800995024875624,
|
| 286 |
+
"grad_norm": 0.957761287689209,
|
| 287 |
+
"learning_rate": 1e-05,
|
| 288 |
+
"loss": 1.0386,
|
| 289 |
+
"step": 80
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"epoch": 0.4079601990049751,
|
| 293 |
+
"grad_norm": 0.8926840424537659,
|
| 294 |
+
"learning_rate": 1e-05,
|
| 295 |
+
"loss": 0.9438,
|
| 296 |
+
"step": 82
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"epoch": 0.417910447761194,
|
| 300 |
+
"grad_norm": 1.8459022045135498,
|
| 301 |
+
"learning_rate": 1e-05,
|
| 302 |
+
"loss": 0.8696,
|
| 303 |
+
"step": 84
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"epoch": 0.42786069651741293,
|
| 307 |
+
"grad_norm": 1.311964511871338,
|
| 308 |
+
"learning_rate": 1e-05,
|
| 309 |
+
"loss": 0.9351,
|
| 310 |
+
"step": 86
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"epoch": 0.43781094527363185,
|
| 314 |
+
"grad_norm": 1.8599036931991577,
|
| 315 |
+
"learning_rate": 1e-05,
|
| 316 |
+
"loss": 1.1685,
|
| 317 |
+
"step": 88
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"epoch": 0.44776119402985076,
|
| 321 |
+
"grad_norm": 0.9435080289840698,
|
| 322 |
+
"learning_rate": 1e-05,
|
| 323 |
+
"loss": 0.8364,
|
| 324 |
+
"step": 90
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"epoch": 0.4577114427860697,
|
| 328 |
+
"grad_norm": 0.8074705600738525,
|
| 329 |
+
"learning_rate": 1e-05,
|
| 330 |
+
"loss": 1.0356,
|
| 331 |
+
"step": 92
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"epoch": 0.46766169154228854,
|
| 335 |
+
"grad_norm": 0.7916580438613892,
|
| 336 |
+
"learning_rate": 1e-05,
|
| 337 |
+
"loss": 0.8716,
|
| 338 |
+
"step": 94
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.47761194029850745,
|
| 342 |
+
"grad_norm": 1.0159028768539429,
|
| 343 |
+
"learning_rate": 1e-05,
|
| 344 |
+
"loss": 0.9404,
|
| 345 |
+
"step": 96
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"epoch": 0.48756218905472637,
|
| 349 |
+
"grad_norm": 0.6591694355010986,
|
| 350 |
+
"learning_rate": 1e-05,
|
| 351 |
+
"loss": 0.8706,
|
| 352 |
+
"step": 98
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"epoch": 0.4975124378109453,
|
| 356 |
+
"grad_norm": 1.0024625062942505,
|
| 357 |
+
"learning_rate": 1e-05,
|
| 358 |
+
"loss": 0.9551,
|
| 359 |
+
"step": 100
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"epoch": 0.5074626865671642,
|
| 363 |
+
"grad_norm": 1.3378303050994873,
|
| 364 |
+
"learning_rate": 1e-05,
|
| 365 |
+
"loss": 0.8682,
|
| 366 |
+
"step": 102
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"epoch": 0.5174129353233831,
|
| 370 |
+
"grad_norm": 0.9471051096916199,
|
| 371 |
+
"learning_rate": 1e-05,
|
| 372 |
+
"loss": 0.9287,
|
| 373 |
+
"step": 104
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"epoch": 0.527363184079602,
|
| 377 |
+
"grad_norm": 1.0026133060455322,
|
| 378 |
+
"learning_rate": 1e-05,
|
| 379 |
+
"loss": 1.0786,
|
| 380 |
+
"step": 106
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"epoch": 0.5373134328358209,
|
| 384 |
+
"grad_norm": 0.8960136771202087,
|
| 385 |
+
"learning_rate": 1e-05,
|
| 386 |
+
"loss": 1.0117,
|
| 387 |
+
"step": 108
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"epoch": 0.5472636815920398,
|
| 391 |
+
"grad_norm": 0.5560504794120789,
|
| 392 |
+
"learning_rate": 1e-05,
|
| 393 |
+
"loss": 0.8799,
|
| 394 |
+
"step": 110
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.5572139303482587,
|
| 398 |
+
"grad_norm": 1.0694944858551025,
|
| 399 |
+
"learning_rate": 1e-05,
|
| 400 |
+
"loss": 0.9097,
|
| 401 |
+
"step": 112
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"epoch": 0.5671641791044776,
|
| 405 |
+
"grad_norm": 0.8429641127586365,
|
| 406 |
+
"learning_rate": 1e-05,
|
| 407 |
+
"loss": 0.9556,
|
| 408 |
+
"step": 114
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"epoch": 0.5771144278606966,
|
| 412 |
+
"grad_norm": 0.6551101207733154,
|
| 413 |
+
"learning_rate": 1e-05,
|
| 414 |
+
"loss": 0.9912,
|
| 415 |
+
"step": 116
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"epoch": 0.5870646766169154,
|
| 419 |
+
"grad_norm": 1.2814500331878662,
|
| 420 |
+
"learning_rate": 1e-05,
|
| 421 |
+
"loss": 0.938,
|
| 422 |
+
"step": 118
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"epoch": 0.5970149253731343,
|
| 426 |
+
"grad_norm": 0.5971533060073853,
|
| 427 |
+
"learning_rate": 1e-05,
|
| 428 |
+
"loss": 0.8203,
|
| 429 |
+
"step": 120
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"epoch": 0.6069651741293532,
|
| 433 |
+
"grad_norm": 0.6333916783332825,
|
| 434 |
+
"learning_rate": 1e-05,
|
| 435 |
+
"loss": 0.7949,
|
| 436 |
+
"step": 122
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"epoch": 0.6169154228855721,
|
| 440 |
+
"grad_norm": 1.5460799932479858,
|
| 441 |
+
"learning_rate": 1e-05,
|
| 442 |
+
"loss": 1.0107,
|
| 443 |
+
"step": 124
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"epoch": 0.6268656716417911,
|
| 447 |
+
"grad_norm": 0.6799649596214294,
|
| 448 |
+
"learning_rate": 1e-05,
|
| 449 |
+
"loss": 0.9155,
|
| 450 |
+
"step": 126
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.6368159203980099,
|
| 454 |
+
"grad_norm": 0.5778260827064514,
|
| 455 |
+
"learning_rate": 1e-05,
|
| 456 |
+
"loss": 0.9985,
|
| 457 |
+
"step": 128
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"epoch": 0.6467661691542289,
|
| 461 |
+
"grad_norm": 0.7546162605285645,
|
| 462 |
+
"learning_rate": 1e-05,
|
| 463 |
+
"loss": 0.9199,
|
| 464 |
+
"step": 130
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"epoch": 0.6567164179104478,
|
| 468 |
+
"grad_norm": 0.5724232196807861,
|
| 469 |
+
"learning_rate": 1e-05,
|
| 470 |
+
"loss": 0.9399,
|
| 471 |
+
"step": 132
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"epoch": 0.6666666666666666,
|
| 475 |
+
"grad_norm": 1.2401442527770996,
|
| 476 |
+
"learning_rate": 1e-05,
|
| 477 |
+
"loss": 0.8687,
|
| 478 |
+
"step": 134
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"epoch": 0.6766169154228856,
|
| 482 |
+
"grad_norm": 0.8218169212341309,
|
| 483 |
+
"learning_rate": 1e-05,
|
| 484 |
+
"loss": 0.8857,
|
| 485 |
+
"step": 136
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"epoch": 0.6865671641791045,
|
| 489 |
+
"grad_norm": 0.690995991230011,
|
| 490 |
+
"learning_rate": 1e-05,
|
| 491 |
+
"loss": 0.9438,
|
| 492 |
+
"step": 138
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"epoch": 0.6965174129353234,
|
| 496 |
+
"grad_norm": 0.9527719020843506,
|
| 497 |
+
"learning_rate": 1e-05,
|
| 498 |
+
"loss": 1.0239,
|
| 499 |
+
"step": 140
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"epoch": 0.7064676616915423,
|
| 503 |
+
"grad_norm": 0.6030732989311218,
|
| 504 |
+
"learning_rate": 1e-05,
|
| 505 |
+
"loss": 0.9722,
|
| 506 |
+
"step": 142
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.7164179104477612,
|
| 510 |
+
"grad_norm": 0.6105135679244995,
|
| 511 |
+
"learning_rate": 1e-05,
|
| 512 |
+
"loss": 0.8628,
|
| 513 |
+
"step": 144
|
| 514 |
+
},
|
| 515 |
+
{
|
| 516 |
+
"epoch": 0.7263681592039801,
|
| 517 |
+
"grad_norm": 0.7813135981559753,
|
| 518 |
+
"learning_rate": 1e-05,
|
| 519 |
+
"loss": 0.8213,
|
| 520 |
+
"step": 146
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"epoch": 0.736318407960199,
|
| 524 |
+
"grad_norm": 0.5830418467521667,
|
| 525 |
+
"learning_rate": 1e-05,
|
| 526 |
+
"loss": 0.834,
|
| 527 |
+
"step": 148
|
| 528 |
+
},
|
| 529 |
+
{
|
| 530 |
+
"epoch": 0.746268656716418,
|
| 531 |
+
"grad_norm": 1.0577740669250488,
|
| 532 |
+
"learning_rate": 1e-05,
|
| 533 |
+
"loss": 0.9692,
|
| 534 |
+
"step": 150
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"epoch": 0.7562189054726368,
|
| 538 |
+
"grad_norm": 0.813637912273407,
|
| 539 |
+
"learning_rate": 1e-05,
|
| 540 |
+
"loss": 0.8735,
|
| 541 |
+
"step": 152
|
| 542 |
+
},
|
| 543 |
+
{
|
| 544 |
+
"epoch": 0.7661691542288557,
|
| 545 |
+
"grad_norm": 0.5650802254676819,
|
| 546 |
+
"learning_rate": 1e-05,
|
| 547 |
+
"loss": 1.0767,
|
| 548 |
+
"step": 154
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"epoch": 0.7761194029850746,
|
| 552 |
+
"grad_norm": 0.7651078104972839,
|
| 553 |
+
"learning_rate": 1e-05,
|
| 554 |
+
"loss": 0.8862,
|
| 555 |
+
"step": 156
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"epoch": 0.7860696517412935,
|
| 559 |
+
"grad_norm": 0.5638197064399719,
|
| 560 |
+
"learning_rate": 1e-05,
|
| 561 |
+
"loss": 1.0239,
|
| 562 |
+
"step": 158
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.7960199004975125,
|
| 566 |
+
"grad_norm": 0.5717598795890808,
|
| 567 |
+
"learning_rate": 1e-05,
|
| 568 |
+
"loss": 0.9868,
|
| 569 |
+
"step": 160
|
| 570 |
+
},
|
| 571 |
+
{
|
| 572 |
+
"epoch": 0.8059701492537313,
|
| 573 |
+
"grad_norm": 0.9155240058898926,
|
| 574 |
+
"learning_rate": 1e-05,
|
| 575 |
+
"loss": 0.8545,
|
| 576 |
+
"step": 162
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"epoch": 0.8159203980099502,
|
| 580 |
+
"grad_norm": 0.673218309879303,
|
| 581 |
+
"learning_rate": 1e-05,
|
| 582 |
+
"loss": 0.8979,
|
| 583 |
+
"step": 164
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"epoch": 0.8258706467661692,
|
| 587 |
+
"grad_norm": 0.933534562587738,
|
| 588 |
+
"learning_rate": 1e-05,
|
| 589 |
+
"loss": 0.958,
|
| 590 |
+
"step": 166
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"epoch": 0.835820895522388,
|
| 594 |
+
"grad_norm": 0.6906251907348633,
|
| 595 |
+
"learning_rate": 1e-05,
|
| 596 |
+
"loss": 0.8301,
|
| 597 |
+
"step": 168
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"epoch": 0.845771144278607,
|
| 601 |
+
"grad_norm": 0.9870006442070007,
|
| 602 |
+
"learning_rate": 1e-05,
|
| 603 |
+
"loss": 0.8652,
|
| 604 |
+
"step": 170
|
| 605 |
+
},
|
| 606 |
+
{
|
| 607 |
+
"epoch": 0.8557213930348259,
|
| 608 |
+
"grad_norm": 1.019015908241272,
|
| 609 |
+
"learning_rate": 1e-05,
|
| 610 |
+
"loss": 0.9165,
|
| 611 |
+
"step": 172
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"epoch": 0.8656716417910447,
|
| 615 |
+
"grad_norm": 0.997454047203064,
|
| 616 |
+
"learning_rate": 1e-05,
|
| 617 |
+
"loss": 0.8403,
|
| 618 |
+
"step": 174
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.8756218905472637,
|
| 622 |
+
"grad_norm": 1.6273800134658813,
|
| 623 |
+
"learning_rate": 1e-05,
|
| 624 |
+
"loss": 0.957,
|
| 625 |
+
"step": 176
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"epoch": 0.8855721393034826,
|
| 629 |
+
"grad_norm": 0.8904904127120972,
|
| 630 |
+
"learning_rate": 1e-05,
|
| 631 |
+
"loss": 0.8452,
|
| 632 |
+
"step": 178
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"epoch": 0.8955223880597015,
|
| 636 |
+
"grad_norm": 0.7554193139076233,
|
| 637 |
+
"learning_rate": 1e-05,
|
| 638 |
+
"loss": 0.7539,
|
| 639 |
+
"step": 180
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"epoch": 0.9054726368159204,
|
| 643 |
+
"grad_norm": 1.757675051689148,
|
| 644 |
+
"learning_rate": 1e-05,
|
| 645 |
+
"loss": 0.9287,
|
| 646 |
+
"step": 182
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"epoch": 0.9154228855721394,
|
| 650 |
+
"grad_norm": 0.8368033170700073,
|
| 651 |
+
"learning_rate": 1e-05,
|
| 652 |
+
"loss": 0.8506,
|
| 653 |
+
"step": 184
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"epoch": 0.9253731343283582,
|
| 657 |
+
"grad_norm": 0.956574022769928,
|
| 658 |
+
"learning_rate": 1e-05,
|
| 659 |
+
"loss": 0.8433,
|
| 660 |
+
"step": 186
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"epoch": 0.9353233830845771,
|
| 664 |
+
"grad_norm": 1.2842135429382324,
|
| 665 |
+
"learning_rate": 1e-05,
|
| 666 |
+
"loss": 0.8799,
|
| 667 |
+
"step": 188
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"epoch": 0.945273631840796,
|
| 671 |
+
"grad_norm": 1.017176628112793,
|
| 672 |
+
"learning_rate": 1e-05,
|
| 673 |
+
"loss": 0.8638,
|
| 674 |
+
"step": 190
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.9552238805970149,
|
| 678 |
+
"grad_norm": 1.4684029817581177,
|
| 679 |
+
"learning_rate": 1e-05,
|
| 680 |
+
"loss": 0.947,
|
| 681 |
+
"step": 192
|
| 682 |
+
},
|
| 683 |
+
{
|
| 684 |
+
"epoch": 0.9651741293532339,
|
| 685 |
+
"grad_norm": 1.4607092142105103,
|
| 686 |
+
"learning_rate": 1e-05,
|
| 687 |
+
"loss": 0.9966,
|
| 688 |
+
"step": 194
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"epoch": 0.9751243781094527,
|
| 692 |
+
"grad_norm": 1.6244029998779297,
|
| 693 |
+
"learning_rate": 1e-05,
|
| 694 |
+
"loss": 0.6952,
|
| 695 |
+
"step": 196
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"epoch": 0.9850746268656716,
|
| 699 |
+
"grad_norm": 1.253040075302124,
|
| 700 |
+
"learning_rate": 1e-05,
|
| 701 |
+
"loss": 0.9458,
|
| 702 |
+
"step": 198
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"epoch": 0.9950248756218906,
|
| 706 |
+
"grad_norm": 1.4702417850494385,
|
| 707 |
+
"learning_rate": 1e-05,
|
| 708 |
+
"loss": 0.9985,
|
| 709 |
+
"step": 200
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"epoch": 1.0,
|
| 713 |
+
"step": 201,
|
| 714 |
+
"total_flos": 1.9968928570671104e+16,
|
| 715 |
+
"train_loss": 0.9424890926228234,
|
| 716 |
+
"train_runtime": 463.9481,
|
| 717 |
+
"train_samples_per_second": 1.733,
|
| 718 |
+
"train_steps_per_second": 0.433
|
| 719 |
+
}
|
| 720 |
+
],
|
| 721 |
+
"logging_steps": 2,
|
| 722 |
+
"max_steps": 201,
|
| 723 |
+
"num_input_tokens_seen": 0,
|
| 724 |
+
"num_train_epochs": 1,
|
| 725 |
+
"save_steps": 500,
|
| 726 |
+
"stateful_callbacks": {
|
| 727 |
+
"TrainerControl": {
|
| 728 |
+
"args": {
|
| 729 |
+
"should_epoch_stop": false,
|
| 730 |
+
"should_evaluate": false,
|
| 731 |
+
"should_log": false,
|
| 732 |
+
"should_save": false,
|
| 733 |
+
"should_training_stop": false
|
| 734 |
+
},
|
| 735 |
+
"attributes": {}
|
| 736 |
+
}
|
| 737 |
+
},
|
| 738 |
+
"total_flos": 1.9968928570671104e+16,
|
| 739 |
+
"train_batch_size": 1,
|
| 740 |
+
"trial_name": null,
|
| 741 |
+
"trial_params": null
|
| 742 |
+
}
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30cc75502c7f91affcd69bae806c3a2fe927b1221cd916baa5bea77645e25e78
|
| 3 |
+
size 389170122
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a601c0dba0cc0f6c171ddac346957dc7e71b334cb4bbf3956e3bde1916356a6
|
| 3 |
+
size 389172166
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:101636b78258e532fc44f1cee697217979c28b0d326484f53cf6c0abce3c37f7
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr125.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83f89ec80d240b002848de09dac06b77d84b42e1e37182bdb52d999334b81ad8
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr150.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f987a5f3a9d1914fe9208860d108d875a3959e3d2381703a5f24d86b47c9d53
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr175.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6415975ce46cd58092b9080498c7c786dc519f47e055cea660db53da5c6f9111
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7556d4da4dd89b709e46b2a2f07409fd60900bb2f19d815cf84e74c51b1732a
|
| 3 |
+
size 389172958
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr25.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5363c67f1cb193fd3ddc8d555a59035a42e9696642c620732b30151b2ac03ecd
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr50.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16702cf0d38a1e9b1e390bdb1b52c302265fc3095e097bacff50f0802b9c0bef
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr75.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7fe002259d80ce003b73b25fa6f46c45a5e4aae678bce953929a2360c7d0acec
|
| 3 |
+
size 389172562
|
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_trainer_state.json
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 1.0,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 201,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.009950248756218905,
|
| 13 |
+
"grad_norm": 3.4984848499298096,
|
| 14 |
+
"learning_rate": 1e-05,
|
| 15 |
+
"loss": 2.4546,
|
| 16 |
+
"step": 2
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"epoch": 0.01990049751243781,
|
| 20 |
+
"grad_norm": 6.905043125152588,
|
| 21 |
+
"learning_rate": 1e-05,
|
| 22 |
+
"loss": 2.5471,
|
| 23 |
+
"step": 4
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"epoch": 0.029850746268656716,
|
| 27 |
+
"grad_norm": 3.0716166496276855,
|
| 28 |
+
"learning_rate": 1e-05,
|
| 29 |
+
"loss": 1.7067,
|
| 30 |
+
"step": 6
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.03980099502487562,
|
| 34 |
+
"grad_norm": 3.0438055992126465,
|
| 35 |
+
"learning_rate": 1e-05,
|
| 36 |
+
"loss": 1.0265,
|
| 37 |
+
"step": 8
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.04975124378109453,
|
| 41 |
+
"grad_norm": 5.619898319244385,
|
| 42 |
+
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 1.8415,
|
| 44 |
+
"step": 10
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 0.05970149253731343,
|
| 48 |
+
"grad_norm": 5.910048961639404,
|
| 49 |
+
"learning_rate": 1e-05,
|
| 50 |
+
"loss": 2.0034,
|
| 51 |
+
"step": 12
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 0.06965174129353234,
|
| 55 |
+
"grad_norm": 4.535861492156982,
|
| 56 |
+
"learning_rate": 1e-05,
|
| 57 |
+
"loss": 1.6565,
|
| 58 |
+
"step": 14
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.07960199004975124,
|
| 62 |
+
"grad_norm": 3.565920114517212,
|
| 63 |
+
"learning_rate": 1e-05,
|
| 64 |
+
"loss": 1.3917,
|
| 65 |
+
"step": 16
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 0.08955223880597014,
|
| 69 |
+
"grad_norm": 3.367178440093994,
|
| 70 |
+
"learning_rate": 1e-05,
|
| 71 |
+
"loss": 1.2771,
|
| 72 |
+
"step": 18
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 0.09950248756218906,
|
| 76 |
+
"grad_norm": 3.4126410484313965,
|
| 77 |
+
"learning_rate": 1e-05,
|
| 78 |
+
"loss": 1.2509,
|
| 79 |
+
"step": 20
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.10945273631840796,
|
| 83 |
+
"grad_norm": 2.830953598022461,
|
| 84 |
+
"learning_rate": 1e-05,
|
| 85 |
+
"loss": 0.8848,
|
| 86 |
+
"step": 22
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"epoch": 0.11940298507462686,
|
| 90 |
+
"grad_norm": 2.7264418601989746,
|
| 91 |
+
"learning_rate": 1e-05,
|
| 92 |
+
"loss": 1.4029,
|
| 93 |
+
"step": 24
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 0.12935323383084577,
|
| 97 |
+
"grad_norm": 4.596713066101074,
|
| 98 |
+
"learning_rate": 1e-05,
|
| 99 |
+
"loss": 1.8621,
|
| 100 |
+
"step": 26
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.13930348258706468,
|
| 104 |
+
"grad_norm": 3.3301849365234375,
|
| 105 |
+
"learning_rate": 1e-05,
|
| 106 |
+
"loss": 1.1167,
|
| 107 |
+
"step": 28
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 0.14925373134328357,
|
| 111 |
+
"grad_norm": 3.9882688522338867,
|
| 112 |
+
"learning_rate": 1e-05,
|
| 113 |
+
"loss": 1.4297,
|
| 114 |
+
"step": 30
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.15920398009950248,
|
| 118 |
+
"grad_norm": 0.8713480830192566,
|
| 119 |
+
"learning_rate": 1e-05,
|
| 120 |
+
"loss": 0.8509,
|
| 121 |
+
"step": 32
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 0.1691542288557214,
|
| 125 |
+
"grad_norm": 5.365267753601074,
|
| 126 |
+
"learning_rate": 1e-05,
|
| 127 |
+
"loss": 1.2606,
|
| 128 |
+
"step": 34
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 0.1791044776119403,
|
| 132 |
+
"grad_norm": 3.6241588592529297,
|
| 133 |
+
"learning_rate": 1e-05,
|
| 134 |
+
"loss": 1.1967,
|
| 135 |
+
"step": 36
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 0.1890547263681592,
|
| 139 |
+
"grad_norm": 2.176697254180908,
|
| 140 |
+
"learning_rate": 1e-05,
|
| 141 |
+
"loss": 0.8659,
|
| 142 |
+
"step": 38
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 0.19900497512437812,
|
| 146 |
+
"grad_norm": 3.8057022094726562,
|
| 147 |
+
"learning_rate": 1e-05,
|
| 148 |
+
"loss": 1.3048,
|
| 149 |
+
"step": 40
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"epoch": 0.208955223880597,
|
| 153 |
+
"grad_norm": 5.057631015777588,
|
| 154 |
+
"learning_rate": 1e-05,
|
| 155 |
+
"loss": 0.8538,
|
| 156 |
+
"step": 42
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"epoch": 0.21890547263681592,
|
| 160 |
+
"grad_norm": 3.2521402835845947,
|
| 161 |
+
"learning_rate": 1e-05,
|
| 162 |
+
"loss": 0.967,
|
| 163 |
+
"step": 44
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"epoch": 0.22885572139303484,
|
| 167 |
+
"grad_norm": 4.557372570037842,
|
| 168 |
+
"learning_rate": 1e-05,
|
| 169 |
+
"loss": 1.2981,
|
| 170 |
+
"step": 46
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.23880597014925373,
|
| 174 |
+
"grad_norm": 4.477596759796143,
|
| 175 |
+
"learning_rate": 1e-05,
|
| 176 |
+
"loss": 0.5941,
|
| 177 |
+
"step": 48
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"epoch": 0.24875621890547264,
|
| 181 |
+
"grad_norm": 3.518733263015747,
|
| 182 |
+
"learning_rate": 1e-05,
|
| 183 |
+
"loss": 1.6752,
|
| 184 |
+
"step": 50
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"epoch": 0.25870646766169153,
|
| 188 |
+
"grad_norm": 7.677566051483154,
|
| 189 |
+
"learning_rate": 1e-05,
|
| 190 |
+
"loss": 1.4171,
|
| 191 |
+
"step": 52
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"epoch": 0.26865671641791045,
|
| 195 |
+
"grad_norm": 2.7589364051818848,
|
| 196 |
+
"learning_rate": 1e-05,
|
| 197 |
+
"loss": 0.5706,
|
| 198 |
+
"step": 54
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"epoch": 0.27860696517412936,
|
| 202 |
+
"grad_norm": 2.9053287506103516,
|
| 203 |
+
"learning_rate": 1e-05,
|
| 204 |
+
"loss": 1.2631,
|
| 205 |
+
"step": 56
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"epoch": 0.2885572139303483,
|
| 209 |
+
"grad_norm": 8.476534843444824,
|
| 210 |
+
"learning_rate": 1e-05,
|
| 211 |
+
"loss": 1.4675,
|
| 212 |
+
"step": 58
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"epoch": 0.29850746268656714,
|
| 216 |
+
"grad_norm": 7.858043670654297,
|
| 217 |
+
"learning_rate": 1e-05,
|
| 218 |
+
"loss": 2.94,
|
| 219 |
+
"step": 60
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"epoch": 0.30845771144278605,
|
| 223 |
+
"grad_norm": 3.2298996448516846,
|
| 224 |
+
"learning_rate": 1e-05,
|
| 225 |
+
"loss": 0.5,
|
| 226 |
+
"step": 62
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.31840796019900497,
|
| 230 |
+
"grad_norm": 5.179959774017334,
|
| 231 |
+
"learning_rate": 1e-05,
|
| 232 |
+
"loss": 0.8592,
|
| 233 |
+
"step": 64
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"epoch": 0.3283582089552239,
|
| 237 |
+
"grad_norm": 10.46849536895752,
|
| 238 |
+
"learning_rate": 1e-05,
|
| 239 |
+
"loss": 2.096,
|
| 240 |
+
"step": 66
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"epoch": 0.3383084577114428,
|
| 244 |
+
"grad_norm": 1.686103105545044,
|
| 245 |
+
"learning_rate": 1e-05,
|
| 246 |
+
"loss": 1.3382,
|
| 247 |
+
"step": 68
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"epoch": 0.3482587064676617,
|
| 251 |
+
"grad_norm": 3.9357430934906006,
|
| 252 |
+
"learning_rate": 1e-05,
|
| 253 |
+
"loss": 1.5427,
|
| 254 |
+
"step": 70
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"epoch": 0.3582089552238806,
|
| 258 |
+
"grad_norm": 6.08726692199707,
|
| 259 |
+
"learning_rate": 1e-05,
|
| 260 |
+
"loss": 1.7477,
|
| 261 |
+
"step": 72
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"epoch": 0.3681592039800995,
|
| 265 |
+
"grad_norm": 4.0442376136779785,
|
| 266 |
+
"learning_rate": 1e-05,
|
| 267 |
+
"loss": 1.3599,
|
| 268 |
+
"step": 74
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"epoch": 0.3781094527363184,
|
| 272 |
+
"grad_norm": 4.393518447875977,
|
| 273 |
+
"learning_rate": 1e-05,
|
| 274 |
+
"loss": 1.3049,
|
| 275 |
+
"step": 76
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"epoch": 0.3880597014925373,
|
| 279 |
+
"grad_norm": 4.748154163360596,
|
| 280 |
+
"learning_rate": 1e-05,
|
| 281 |
+
"loss": 1.2163,
|
| 282 |
+
"step": 78
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.39800995024875624,
|
| 286 |
+
"grad_norm": 2.4480156898498535,
|
| 287 |
+
"learning_rate": 1e-05,
|
| 288 |
+
"loss": 1.5779,
|
| 289 |
+
"step": 80
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"epoch": 0.4079601990049751,
|
| 293 |
+
"grad_norm": 4.978269577026367,
|
| 294 |
+
"learning_rate": 1e-05,
|
| 295 |
+
"loss": 1.5108,
|
| 296 |
+
"step": 82
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"epoch": 0.417910447761194,
|
| 300 |
+
"grad_norm": 8.956459999084473,
|
| 301 |
+
"learning_rate": 1e-05,
|
| 302 |
+
"loss": 1.2402,
|
| 303 |
+
"step": 84
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"epoch": 0.42786069651741293,
|
| 307 |
+
"grad_norm": 3.989821434020996,
|
| 308 |
+
"learning_rate": 1e-05,
|
| 309 |
+
"loss": 0.8792,
|
| 310 |
+
"step": 86
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"epoch": 0.43781094527363185,
|
| 314 |
+
"grad_norm": 7.240758895874023,
|
| 315 |
+
"learning_rate": 1e-05,
|
| 316 |
+
"loss": 1.8413,
|
| 317 |
+
"step": 88
|
| 318 |
+
},
|
| 319 |
+
{
|
| 320 |
+
"epoch": 0.44776119402985076,
|
| 321 |
+
"grad_norm": 1.3386205434799194,
|
| 322 |
+
"learning_rate": 1e-05,
|
| 323 |
+
"loss": 0.5992,
|
| 324 |
+
"step": 90
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"epoch": 0.4577114427860697,
|
| 328 |
+
"grad_norm": 5.485062599182129,
|
| 329 |
+
"learning_rate": 1e-05,
|
| 330 |
+
"loss": 0.9109,
|
| 331 |
+
"step": 92
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"epoch": 0.46766169154228854,
|
| 335 |
+
"grad_norm": 5.22202205657959,
|
| 336 |
+
"learning_rate": 1e-05,
|
| 337 |
+
"loss": 0.8113,
|
| 338 |
+
"step": 94
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.47761194029850745,
|
| 342 |
+
"grad_norm": 2.953240156173706,
|
| 343 |
+
"learning_rate": 1e-05,
|
| 344 |
+
"loss": 1.0452,
|
| 345 |
+
"step": 96
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"epoch": 0.48756218905472637,
|
| 349 |
+
"grad_norm": 3.98473858833313,
|
| 350 |
+
"learning_rate": 1e-05,
|
| 351 |
+
"loss": 1.4785,
|
| 352 |
+
"step": 98
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"epoch": 0.4975124378109453,
|
| 356 |
+
"grad_norm": 0.524372935295105,
|
| 357 |
+
"learning_rate": 1e-05,
|
| 358 |
+
"loss": 1.0392,
|
| 359 |
+
"step": 100
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"epoch": 0.5074626865671642,
|
| 363 |
+
"grad_norm": 5.757716655731201,
|
| 364 |
+
"learning_rate": 1e-05,
|
| 365 |
+
"loss": 1.507,
|
| 366 |
+
"step": 102
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"epoch": 0.5174129353233831,
|
| 370 |
+
"grad_norm": 3.7972941398620605,
|
| 371 |
+
"learning_rate": 1e-05,
|
| 372 |
+
"loss": 2.0817,
|
| 373 |
+
"step": 104
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"epoch": 0.527363184079602,
|
| 377 |
+
"grad_norm": 2.1441078186035156,
|
| 378 |
+
"learning_rate": 1e-05,
|
| 379 |
+
"loss": 1.1439,
|
| 380 |
+
"step": 106
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"epoch": 0.5373134328358209,
|
| 384 |
+
"grad_norm": 4.19448184967041,
|
| 385 |
+
"learning_rate": 1e-05,
|
| 386 |
+
"loss": 0.5984,
|
| 387 |
+
"step": 108
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"epoch": 0.5472636815920398,
|
| 391 |
+
"grad_norm": 2.471952438354492,
|
| 392 |
+
"learning_rate": 1e-05,
|
| 393 |
+
"loss": 0.6786,
|
| 394 |
+
"step": 110
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.5572139303482587,
|
| 398 |
+
"grad_norm": 3.152708053588867,
|
| 399 |
+
"learning_rate": 1e-05,
|
| 400 |
+
"loss": 0.441,
|
| 401 |
+
"step": 112
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"epoch": 0.5671641791044776,
|
| 405 |
+
"grad_norm": 5.703269004821777,
|
| 406 |
+
"learning_rate": 1e-05,
|
| 407 |
+
"loss": 0.662,
|
| 408 |
+
"step": 114
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"epoch": 0.5771144278606966,
|
| 412 |
+
"grad_norm": 4.732028007507324,
|
| 413 |
+
"learning_rate": 1e-05,
|
| 414 |
+
"loss": 0.7527,
|
| 415 |
+
"step": 116
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"epoch": 0.5870646766169154,
|
| 419 |
+
"grad_norm": 10.553655624389648,
|
| 420 |
+
"learning_rate": 1e-05,
|
| 421 |
+
"loss": 2.7411,
|
| 422 |
+
"step": 118
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"epoch": 0.5970149253731343,
|
| 426 |
+
"grad_norm": 6.645718574523926,
|
| 427 |
+
"learning_rate": 1e-05,
|
| 428 |
+
"loss": 1.6926,
|
| 429 |
+
"step": 120
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"epoch": 0.6069651741293532,
|
| 433 |
+
"grad_norm": 2.5227789878845215,
|
| 434 |
+
"learning_rate": 1e-05,
|
| 435 |
+
"loss": 1.2725,
|
| 436 |
+
"step": 122
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"epoch": 0.6169154228855721,
|
| 440 |
+
"grad_norm": 4.154623508453369,
|
| 441 |
+
"learning_rate": 1e-05,
|
| 442 |
+
"loss": 1.1329,
|
| 443 |
+
"step": 124
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"epoch": 0.6268656716417911,
|
| 447 |
+
"grad_norm": 3.382685661315918,
|
| 448 |
+
"learning_rate": 1e-05,
|
| 449 |
+
"loss": 0.5432,
|
| 450 |
+
"step": 126
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.6368159203980099,
|
| 454 |
+
"grad_norm": 11.674966812133789,
|
| 455 |
+
"learning_rate": 1e-05,
|
| 456 |
+
"loss": 0.6193,
|
| 457 |
+
"step": 128
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"epoch": 0.6467661691542289,
|
| 461 |
+
"grad_norm": 3.64872145652771,
|
| 462 |
+
"learning_rate": 1e-05,
|
| 463 |
+
"loss": 0.9732,
|
| 464 |
+
"step": 130
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"epoch": 0.6567164179104478,
|
| 468 |
+
"grad_norm": 6.72369384765625,
|
| 469 |
+
"learning_rate": 1e-05,
|
| 470 |
+
"loss": 1.1707,
|
| 471 |
+
"step": 132
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"epoch": 0.6666666666666666,
|
| 475 |
+
"grad_norm": 5.803842067718506,
|
| 476 |
+
"learning_rate": 1e-05,
|
| 477 |
+
"loss": 1.6948,
|
| 478 |
+
"step": 134
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"epoch": 0.6766169154228856,
|
| 482 |
+
"grad_norm": 6.422171592712402,
|
| 483 |
+
"learning_rate": 1e-05,
|
| 484 |
+
"loss": 0.734,
|
| 485 |
+
"step": 136
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"epoch": 0.6865671641791045,
|
| 489 |
+
"grad_norm": 11.723003387451172,
|
| 490 |
+
"learning_rate": 1e-05,
|
| 491 |
+
"loss": 2.1305,
|
| 492 |
+
"step": 138
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"epoch": 0.6965174129353234,
|
| 496 |
+
"grad_norm": 4.657910346984863,
|
| 497 |
+
"learning_rate": 1e-05,
|
| 498 |
+
"loss": 1.7126,
|
| 499 |
+
"step": 140
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"epoch": 0.7064676616915423,
|
| 503 |
+
"grad_norm": 6.460371494293213,
|
| 504 |
+
"learning_rate": 1e-05,
|
| 505 |
+
"loss": 1.6042,
|
| 506 |
+
"step": 142
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.7164179104477612,
|
| 510 |
+
"grad_norm": 2.946357250213623,
|
| 511 |
+
"learning_rate": 1e-05,
|
| 512 |
+
"loss": 1.3644,
|
| 513 |
+
"step": 144
|
| 514 |
+
},
|
| 515 |
+
{
|
| 516 |
+
"epoch": 0.7263681592039801,
|
| 517 |
+
"grad_norm": 3.000802993774414,
|
| 518 |
+
"learning_rate": 1e-05,
|
| 519 |
+
"loss": 0.7483,
|
| 520 |
+
"step": 146
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"epoch": 0.736318407960199,
|
| 524 |
+
"grad_norm": 5.282987594604492,
|
| 525 |
+
"learning_rate": 1e-05,
|
| 526 |
+
"loss": 1.0917,
|
| 527 |
+
"step": 148
|
| 528 |
+
},
|
| 529 |
+
{
|
| 530 |
+
"epoch": 0.746268656716418,
|
| 531 |
+
"grad_norm": 0.4844614565372467,
|
| 532 |
+
"learning_rate": 1e-05,
|
| 533 |
+
"loss": 0.3645,
|
| 534 |
+
"step": 150
|
| 535 |
+
},
|
| 536 |
+
{
|
| 537 |
+
"epoch": 0.7562189054726368,
|
| 538 |
+
"grad_norm": 4.852270126342773,
|
| 539 |
+
"learning_rate": 1e-05,
|
| 540 |
+
"loss": 1.1297,
|
| 541 |
+
"step": 152
|
| 542 |
+
},
|
| 543 |
+
{
|
| 544 |
+
"epoch": 0.7661691542288557,
|
| 545 |
+
"grad_norm": 3.115569829940796,
|
| 546 |
+
"learning_rate": 1e-05,
|
| 547 |
+
"loss": 1.2097,
|
| 548 |
+
"step": 154
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"epoch": 0.7761194029850746,
|
| 552 |
+
"grad_norm": 4.892626762390137,
|
| 553 |
+
"learning_rate": 1e-05,
|
| 554 |
+
"loss": 0.8909,
|
| 555 |
+
"step": 156
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"epoch": 0.7860696517412935,
|
| 559 |
+
"grad_norm": 4.782143592834473,
|
| 560 |
+
"learning_rate": 1e-05,
|
| 561 |
+
"loss": 0.7592,
|
| 562 |
+
"step": 158
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.7960199004975125,
|
| 566 |
+
"grad_norm": 1.9109928607940674,
|
| 567 |
+
"learning_rate": 1e-05,
|
| 568 |
+
"loss": 0.4162,
|
| 569 |
+
"step": 160
|
| 570 |
+
},
|
| 571 |
+
{
|
| 572 |
+
"epoch": 0.8059701492537313,
|
| 573 |
+
"grad_norm": 8.50790786743164,
|
| 574 |
+
"learning_rate": 1e-05,
|
| 575 |
+
"loss": 2.4984,
|
| 576 |
+
"step": 162
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"epoch": 0.8159203980099502,
|
| 580 |
+
"grad_norm": 3.661428213119507,
|
| 581 |
+
"learning_rate": 1e-05,
|
| 582 |
+
"loss": 1.0022,
|
| 583 |
+
"step": 164
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"epoch": 0.8258706467661692,
|
| 587 |
+
"grad_norm": 5.116476058959961,
|
| 588 |
+
"learning_rate": 1e-05,
|
| 589 |
+
"loss": 0.9979,
|
| 590 |
+
"step": 166
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"epoch": 0.835820895522388,
|
| 594 |
+
"grad_norm": 6.289146900177002,
|
| 595 |
+
"learning_rate": 1e-05,
|
| 596 |
+
"loss": 0.8444,
|
| 597 |
+
"step": 168
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"epoch": 0.845771144278607,
|
| 601 |
+
"grad_norm": 2.4712114334106445,
|
| 602 |
+
"learning_rate": 1e-05,
|
| 603 |
+
"loss": 0.7441,
|
| 604 |
+
"step": 170
|
| 605 |
+
},
|
| 606 |
+
{
|
| 607 |
+
"epoch": 0.8557213930348259,
|
| 608 |
+
"grad_norm": 4.545423984527588,
|
| 609 |
+
"learning_rate": 1e-05,
|
| 610 |
+
"loss": 0.4958,
|
| 611 |
+
"step": 172
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"epoch": 0.8656716417910447,
|
| 615 |
+
"grad_norm": 2.7957515716552734,
|
| 616 |
+
"learning_rate": 1e-05,
|
| 617 |
+
"loss": 0.6294,
|
| 618 |
+
"step": 174
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.8756218905472637,
|
| 622 |
+
"grad_norm": 5.590768337249756,
|
| 623 |
+
"learning_rate": 1e-05,
|
| 624 |
+
"loss": 0.4968,
|
| 625 |
+
"step": 176
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"epoch": 0.8855721393034826,
|
| 629 |
+
"grad_norm": 5.343775749206543,
|
| 630 |
+
"learning_rate": 1e-05,
|
| 631 |
+
"loss": 0.4072,
|
| 632 |
+
"step": 178
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"epoch": 0.8955223880597015,
|
| 636 |
+
"grad_norm": 8.360288619995117,
|
| 637 |
+
"learning_rate": 1e-05,
|
| 638 |
+
"loss": 1.0587,
|
| 639 |
+
"step": 180
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"epoch": 0.9054726368159204,
|
| 643 |
+
"grad_norm": 3.4952993392944336,
|
| 644 |
+
"learning_rate": 1e-05,
|
| 645 |
+
"loss": 0.742,
|
| 646 |
+
"step": 182
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"epoch": 0.9154228855721394,
|
| 650 |
+
"grad_norm": 5.865167617797852,
|
| 651 |
+
"learning_rate": 1e-05,
|
| 652 |
+
"loss": 2.4214,
|
| 653 |
+
"step": 184
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"epoch": 0.9253731343283582,
|
| 657 |
+
"grad_norm": 3.2211215496063232,
|
| 658 |
+
"learning_rate": 1e-05,
|
| 659 |
+
"loss": 1.2642,
|
| 660 |
+
"step": 186
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"epoch": 0.9353233830845771,
|
| 664 |
+
"grad_norm": 4.869852066040039,
|
| 665 |
+
"learning_rate": 1e-05,
|
| 666 |
+
"loss": 1.7789,
|
| 667 |
+
"step": 188
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"epoch": 0.945273631840796,
|
| 671 |
+
"grad_norm": 9.350594520568848,
|
| 672 |
+
"learning_rate": 1e-05,
|
| 673 |
+
"loss": 0.9147,
|
| 674 |
+
"step": 190
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.9552238805970149,
|
| 678 |
+
"grad_norm": 2.942012071609497,
|
| 679 |
+
"learning_rate": 1e-05,
|
| 680 |
+
"loss": 0.6123,
|
| 681 |
+
"step": 192
|
| 682 |
+
},
|
| 683 |
+
{
|
| 684 |
+
"epoch": 0.9651741293532339,
|
| 685 |
+
"grad_norm": 5.4307332038879395,
|
| 686 |
+
"learning_rate": 1e-05,
|
| 687 |
+
"loss": 1.2541,
|
| 688 |
+
"step": 194
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"epoch": 0.9751243781094527,
|
| 692 |
+
"grad_norm": 4.55341911315918,
|
| 693 |
+
"learning_rate": 1e-05,
|
| 694 |
+
"loss": 1.9212,
|
| 695 |
+
"step": 196
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"epoch": 0.9850746268656716,
|
| 699 |
+
"grad_norm": 5.160548210144043,
|
| 700 |
+
"learning_rate": 1e-05,
|
| 701 |
+
"loss": 0.4836,
|
| 702 |
+
"step": 198
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"epoch": 0.9950248756218906,
|
| 706 |
+
"grad_norm": 0.6315759420394897,
|
| 707 |
+
"learning_rate": 1e-05,
|
| 708 |
+
"loss": 0.063,
|
| 709 |
+
"step": 200
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"epoch": 1.0,
|
| 713 |
+
"step": 201,
|
| 714 |
+
"total_flos": 3.816239406461747e+16,
|
| 715 |
+
"train_loss": 1.217404284880529,
|
| 716 |
+
"train_runtime": 508.9307,
|
| 717 |
+
"train_samples_per_second": 1.58,
|
| 718 |
+
"train_steps_per_second": 0.395
|
| 719 |
+
}
|
| 720 |
+
],
|
| 721 |
+
"logging_steps": 2,
|
| 722 |
+
"max_steps": 201,
|
| 723 |
+
"num_input_tokens_seen": 0,
|
| 724 |
+
"num_train_epochs": 1,
|
| 725 |
+
"save_steps": 500,
|
| 726 |
+
"stateful_callbacks": {
|
| 727 |
+
"TrainerControl": {
|
| 728 |
+
"args": {
|
| 729 |
+
"should_epoch_stop": false,
|
| 730 |
+
"should_evaluate": false,
|
| 731 |
+
"should_log": false,
|
| 732 |
+
"should_save": false,
|
| 733 |
+
"should_training_stop": false
|
| 734 |
+
},
|
| 735 |
+
"attributes": {}
|
| 736 |
+
}
|
| 737 |
+
},
|
| 738 |
+
"total_flos": 3.816239406461747e+16,
|
| 739 |
+
"train_batch_size": 1,
|
| 740 |
+
"trial_name": null,
|
| 741 |
+
"trial_params": null
|
| 742 |
+
}
|