thkim0305 commited on
Commit
c3dd1b8
·
verified ·
1 Parent(s): 163326b

Upload folder using huggingface_hub

Browse files
Files changed (44) hide show
  1. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1.pth +3 -0
  2. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr0.pth +3 -0
  3. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr100.pth +3 -0
  4. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr125.pth +3 -0
  5. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr150.pth +3 -0
  6. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr175.pth +3 -0
  7. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr200.pth +3 -0
  8. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr25.pth +3 -0
  9. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr50.pth +3 -0
  10. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr75.pth +3 -0
  11. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_trainer_state.json +742 -0
  12. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1.pth +3 -0
  13. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr0.pth +3 -0
  14. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr100.pth +3 -0
  15. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr125.pth +3 -0
  16. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr150.pth +3 -0
  17. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr175.pth +3 -0
  18. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr200.pth +3 -0
  19. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr25.pth +3 -0
  20. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr50.pth +3 -0
  21. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr75.pth +3 -0
  22. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_trainer_state.json +742 -0
  23. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1.pth +3 -0
  24. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr0.pth +3 -0
  25. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr100.pth +3 -0
  26. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr125.pth +3 -0
  27. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr150.pth +3 -0
  28. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr175.pth +3 -0
  29. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr200.pth +3 -0
  30. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr25.pth +3 -0
  31. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr50.pth +3 -0
  32. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr75.pth +3 -0
  33. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_trainer_state.json +742 -0
  34. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1.pth +3 -0
  35. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr0.pth +3 -0
  36. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr100.pth +3 -0
  37. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr125.pth +3 -0
  38. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr150.pth +3 -0
  39. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr175.pth +3 -0
  40. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr200.pth +3 -0
  41. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr25.pth +3 -0
  42. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr50.pth +3 -0
  43. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr75.pth +3 -0
  44. client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_trainer_state.json +742 -0
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c87175d6670ea0a7e93a81d8836225b3c726b3daa4fbfe87fefa157cea616a
3
+ size 389170122
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0102e813ed9d28a49d25fcc64b847bda863108e19f3ff5681263b527d6a413
3
+ size 389172166
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8efd7b883c051ce014f750e26cb5cb90a1e5fb3319b48953a83b49f23c5546
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr125.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eaa483fa6f46352a8b7d5f44c5c792f501ea238c1ce4fd396993be54dd2ef58
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr150.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcb63eb11795c0a75df6ba716dec5a301d613230b740101ba1422ad03a8d3455
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr175.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:128b4087fc1df81576986c9f476fa1d82ff7268f03b0fe5cc6e193a5079c8f6b
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72c214c649182fabc10bd7248537f81c5df845d5b4cb6592ba47f932f0576bc3
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr25.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd3b132d03a92166634fa1e31f278cb279365653994dcb731d5e955b870c8272
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr50.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf804f790082b1e20fc3ee8d7fe51388c1fb419aee382d3e8052b939525aaf0
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_client_model_round1_itr75.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce18d753cf3cd6d1fc131bfbdf829243456103ba84069d3fb80747cac5beb3af
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/0_trainer_state.json ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 201,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009950248756218905,
13
+ "grad_norm": 1.316588044166565,
14
+ "learning_rate": 1e-05,
15
+ "loss": 1.7881,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.01990049751243781,
20
+ "grad_norm": 1.1037206649780273,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.7363,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.029850746268656716,
27
+ "grad_norm": 0.7940536141395569,
28
+ "learning_rate": 1e-05,
29
+ "loss": 1.707,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.03980099502487562,
34
+ "grad_norm": 0.8492249250411987,
35
+ "learning_rate": 1e-05,
36
+ "loss": 1.6855,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.04975124378109453,
41
+ "grad_norm": 1.1886231899261475,
42
+ "learning_rate": 1e-05,
43
+ "loss": 1.7168,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.05970149253731343,
48
+ "grad_norm": 1.4966411590576172,
49
+ "learning_rate": 1e-05,
50
+ "loss": 2.1992,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.06965174129353234,
55
+ "grad_norm": 1.4271999597549438,
56
+ "learning_rate": 1e-05,
57
+ "loss": 1.7031,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.07960199004975124,
62
+ "grad_norm": 1.644882321357727,
63
+ "learning_rate": 1e-05,
64
+ "loss": 2.1426,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.08955223880597014,
69
+ "grad_norm": 1.7688827514648438,
70
+ "learning_rate": 1e-05,
71
+ "loss": 2.1768,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.09950248756218906,
76
+ "grad_norm": 0.8430641889572144,
77
+ "learning_rate": 1e-05,
78
+ "loss": 1.9033,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.10945273631840796,
83
+ "grad_norm": 1.0195722579956055,
84
+ "learning_rate": 1e-05,
85
+ "loss": 1.7842,
86
+ "step": 22
87
+ },
88
+ {
89
+ "epoch": 0.11940298507462686,
90
+ "grad_norm": 0.7112452387809753,
91
+ "learning_rate": 1e-05,
92
+ "loss": 1.8604,
93
+ "step": 24
94
+ },
95
+ {
96
+ "epoch": 0.12935323383084577,
97
+ "grad_norm": 1.6654636859893799,
98
+ "learning_rate": 1e-05,
99
+ "loss": 1.5967,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 0.13930348258706468,
104
+ "grad_norm": 0.5782097578048706,
105
+ "learning_rate": 1e-05,
106
+ "loss": 1.7471,
107
+ "step": 28
108
+ },
109
+ {
110
+ "epoch": 0.14925373134328357,
111
+ "grad_norm": 1.0965440273284912,
112
+ "learning_rate": 1e-05,
113
+ "loss": 1.6221,
114
+ "step": 30
115
+ },
116
+ {
117
+ "epoch": 0.15920398009950248,
118
+ "grad_norm": 1.0025074481964111,
119
+ "learning_rate": 1e-05,
120
+ "loss": 1.9658,
121
+ "step": 32
122
+ },
123
+ {
124
+ "epoch": 0.1691542288557214,
125
+ "grad_norm": 1.0948214530944824,
126
+ "learning_rate": 1e-05,
127
+ "loss": 2.0059,
128
+ "step": 34
129
+ },
130
+ {
131
+ "epoch": 0.1791044776119403,
132
+ "grad_norm": 1.1663291454315186,
133
+ "learning_rate": 1e-05,
134
+ "loss": 1.8125,
135
+ "step": 36
136
+ },
137
+ {
138
+ "epoch": 0.1890547263681592,
139
+ "grad_norm": 0.6280285120010376,
140
+ "learning_rate": 1e-05,
141
+ "loss": 1.833,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 0.19900497512437812,
146
+ "grad_norm": 0.7364129424095154,
147
+ "learning_rate": 1e-05,
148
+ "loss": 1.6533,
149
+ "step": 40
150
+ },
151
+ {
152
+ "epoch": 0.208955223880597,
153
+ "grad_norm": 1.1327072381973267,
154
+ "learning_rate": 1e-05,
155
+ "loss": 1.9336,
156
+ "step": 42
157
+ },
158
+ {
159
+ "epoch": 0.21890547263681592,
160
+ "grad_norm": 0.7770842909812927,
161
+ "learning_rate": 1e-05,
162
+ "loss": 1.7471,
163
+ "step": 44
164
+ },
165
+ {
166
+ "epoch": 0.22885572139303484,
167
+ "grad_norm": 0.7920796871185303,
168
+ "learning_rate": 1e-05,
169
+ "loss": 1.9375,
170
+ "step": 46
171
+ },
172
+ {
173
+ "epoch": 0.23880597014925373,
174
+ "grad_norm": 0.8180975914001465,
175
+ "learning_rate": 1e-05,
176
+ "loss": 1.999,
177
+ "step": 48
178
+ },
179
+ {
180
+ "epoch": 0.24875621890547264,
181
+ "grad_norm": 0.9668822884559631,
182
+ "learning_rate": 1e-05,
183
+ "loss": 1.8721,
184
+ "step": 50
185
+ },
186
+ {
187
+ "epoch": 0.25870646766169153,
188
+ "grad_norm": 0.6620003581047058,
189
+ "learning_rate": 1e-05,
190
+ "loss": 1.6611,
191
+ "step": 52
192
+ },
193
+ {
194
+ "epoch": 0.26865671641791045,
195
+ "grad_norm": 1.0094668865203857,
196
+ "learning_rate": 1e-05,
197
+ "loss": 2.0352,
198
+ "step": 54
199
+ },
200
+ {
201
+ "epoch": 0.27860696517412936,
202
+ "grad_norm": 0.8333507776260376,
203
+ "learning_rate": 1e-05,
204
+ "loss": 1.9297,
205
+ "step": 56
206
+ },
207
+ {
208
+ "epoch": 0.2885572139303483,
209
+ "grad_norm": 0.6568053364753723,
210
+ "learning_rate": 1e-05,
211
+ "loss": 1.8447,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 0.29850746268656714,
216
+ "grad_norm": 1.129006028175354,
217
+ "learning_rate": 1e-05,
218
+ "loss": 1.8936,
219
+ "step": 60
220
+ },
221
+ {
222
+ "epoch": 0.30845771144278605,
223
+ "grad_norm": 0.7393130660057068,
224
+ "learning_rate": 1e-05,
225
+ "loss": 1.9336,
226
+ "step": 62
227
+ },
228
+ {
229
+ "epoch": 0.31840796019900497,
230
+ "grad_norm": 0.4612615704536438,
231
+ "learning_rate": 1e-05,
232
+ "loss": 1.9365,
233
+ "step": 64
234
+ },
235
+ {
236
+ "epoch": 0.3283582089552239,
237
+ "grad_norm": 0.6561993360519409,
238
+ "learning_rate": 1e-05,
239
+ "loss": 1.8389,
240
+ "step": 66
241
+ },
242
+ {
243
+ "epoch": 0.3383084577114428,
244
+ "grad_norm": 1.0325121879577637,
245
+ "learning_rate": 1e-05,
246
+ "loss": 1.8486,
247
+ "step": 68
248
+ },
249
+ {
250
+ "epoch": 0.3482587064676617,
251
+ "grad_norm": 0.7401711344718933,
252
+ "learning_rate": 1e-05,
253
+ "loss": 1.8662,
254
+ "step": 70
255
+ },
256
+ {
257
+ "epoch": 0.3582089552238806,
258
+ "grad_norm": 0.6198751330375671,
259
+ "learning_rate": 1e-05,
260
+ "loss": 1.8506,
261
+ "step": 72
262
+ },
263
+ {
264
+ "epoch": 0.3681592039800995,
265
+ "grad_norm": 0.6299334764480591,
266
+ "learning_rate": 1e-05,
267
+ "loss": 1.8555,
268
+ "step": 74
269
+ },
270
+ {
271
+ "epoch": 0.3781094527363184,
272
+ "grad_norm": 0.8257051706314087,
273
+ "learning_rate": 1e-05,
274
+ "loss": 1.7344,
275
+ "step": 76
276
+ },
277
+ {
278
+ "epoch": 0.3880597014925373,
279
+ "grad_norm": 0.8762025237083435,
280
+ "learning_rate": 1e-05,
281
+ "loss": 1.7891,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 0.39800995024875624,
286
+ "grad_norm": 1.2744340896606445,
287
+ "learning_rate": 1e-05,
288
+ "loss": 1.9102,
289
+ "step": 80
290
+ },
291
+ {
292
+ "epoch": 0.4079601990049751,
293
+ "grad_norm": 0.5431731939315796,
294
+ "learning_rate": 1e-05,
295
+ "loss": 1.7705,
296
+ "step": 82
297
+ },
298
+ {
299
+ "epoch": 0.417910447761194,
300
+ "grad_norm": 0.8810946345329285,
301
+ "learning_rate": 1e-05,
302
+ "loss": 1.6855,
303
+ "step": 84
304
+ },
305
+ {
306
+ "epoch": 0.42786069651741293,
307
+ "grad_norm": 0.8568848967552185,
308
+ "learning_rate": 1e-05,
309
+ "loss": 1.7959,
310
+ "step": 86
311
+ },
312
+ {
313
+ "epoch": 0.43781094527363185,
314
+ "grad_norm": 0.9605632424354553,
315
+ "learning_rate": 1e-05,
316
+ "loss": 1.873,
317
+ "step": 88
318
+ },
319
+ {
320
+ "epoch": 0.44776119402985076,
321
+ "grad_norm": 0.512973964214325,
322
+ "learning_rate": 1e-05,
323
+ "loss": 1.7891,
324
+ "step": 90
325
+ },
326
+ {
327
+ "epoch": 0.4577114427860697,
328
+ "grad_norm": 0.723425567150116,
329
+ "learning_rate": 1e-05,
330
+ "loss": 1.877,
331
+ "step": 92
332
+ },
333
+ {
334
+ "epoch": 0.46766169154228854,
335
+ "grad_norm": 0.5228793025016785,
336
+ "learning_rate": 1e-05,
337
+ "loss": 1.999,
338
+ "step": 94
339
+ },
340
+ {
341
+ "epoch": 0.47761194029850745,
342
+ "grad_norm": 0.7799379825592041,
343
+ "learning_rate": 1e-05,
344
+ "loss": 1.751,
345
+ "step": 96
346
+ },
347
+ {
348
+ "epoch": 0.48756218905472637,
349
+ "grad_norm": 1.0080820322036743,
350
+ "learning_rate": 1e-05,
351
+ "loss": 1.877,
352
+ "step": 98
353
+ },
354
+ {
355
+ "epoch": 0.4975124378109453,
356
+ "grad_norm": 0.9821782112121582,
357
+ "learning_rate": 1e-05,
358
+ "loss": 1.8867,
359
+ "step": 100
360
+ },
361
+ {
362
+ "epoch": 0.5074626865671642,
363
+ "grad_norm": 0.5222265720367432,
364
+ "learning_rate": 1e-05,
365
+ "loss": 1.793,
366
+ "step": 102
367
+ },
368
+ {
369
+ "epoch": 0.5174129353233831,
370
+ "grad_norm": 0.5731136798858643,
371
+ "learning_rate": 1e-05,
372
+ "loss": 1.915,
373
+ "step": 104
374
+ },
375
+ {
376
+ "epoch": 0.527363184079602,
377
+ "grad_norm": 0.6745629906654358,
378
+ "learning_rate": 1e-05,
379
+ "loss": 1.7998,
380
+ "step": 106
381
+ },
382
+ {
383
+ "epoch": 0.5373134328358209,
384
+ "grad_norm": 0.7346249222755432,
385
+ "learning_rate": 1e-05,
386
+ "loss": 1.7988,
387
+ "step": 108
388
+ },
389
+ {
390
+ "epoch": 0.5472636815920398,
391
+ "grad_norm": 0.6089544892311096,
392
+ "learning_rate": 1e-05,
393
+ "loss": 1.7949,
394
+ "step": 110
395
+ },
396
+ {
397
+ "epoch": 0.5572139303482587,
398
+ "grad_norm": 0.9230899214744568,
399
+ "learning_rate": 1e-05,
400
+ "loss": 1.9463,
401
+ "step": 112
402
+ },
403
+ {
404
+ "epoch": 0.5671641791044776,
405
+ "grad_norm": 0.8394888639450073,
406
+ "learning_rate": 1e-05,
407
+ "loss": 1.9131,
408
+ "step": 114
409
+ },
410
+ {
411
+ "epoch": 0.5771144278606966,
412
+ "grad_norm": 0.603209376335144,
413
+ "learning_rate": 1e-05,
414
+ "loss": 1.8389,
415
+ "step": 116
416
+ },
417
+ {
418
+ "epoch": 0.5870646766169154,
419
+ "grad_norm": 0.6753935813903809,
420
+ "learning_rate": 1e-05,
421
+ "loss": 1.8379,
422
+ "step": 118
423
+ },
424
+ {
425
+ "epoch": 0.5970149253731343,
426
+ "grad_norm": 0.7781857252120972,
427
+ "learning_rate": 1e-05,
428
+ "loss": 1.8662,
429
+ "step": 120
430
+ },
431
+ {
432
+ "epoch": 0.6069651741293532,
433
+ "grad_norm": 0.6543675661087036,
434
+ "learning_rate": 1e-05,
435
+ "loss": 1.8711,
436
+ "step": 122
437
+ },
438
+ {
439
+ "epoch": 0.6169154228855721,
440
+ "grad_norm": 0.7465837001800537,
441
+ "learning_rate": 1e-05,
442
+ "loss": 1.8457,
443
+ "step": 124
444
+ },
445
+ {
446
+ "epoch": 0.6268656716417911,
447
+ "grad_norm": 0.6059397459030151,
448
+ "learning_rate": 1e-05,
449
+ "loss": 1.8184,
450
+ "step": 126
451
+ },
452
+ {
453
+ "epoch": 0.6368159203980099,
454
+ "grad_norm": 0.6485504508018494,
455
+ "learning_rate": 1e-05,
456
+ "loss": 1.877,
457
+ "step": 128
458
+ },
459
+ {
460
+ "epoch": 0.6467661691542289,
461
+ "grad_norm": 0.6433750987052917,
462
+ "learning_rate": 1e-05,
463
+ "loss": 1.7803,
464
+ "step": 130
465
+ },
466
+ {
467
+ "epoch": 0.6567164179104478,
468
+ "grad_norm": 0.6054277420043945,
469
+ "learning_rate": 1e-05,
470
+ "loss": 1.8145,
471
+ "step": 132
472
+ },
473
+ {
474
+ "epoch": 0.6666666666666666,
475
+ "grad_norm": 0.9794463515281677,
476
+ "learning_rate": 1e-05,
477
+ "loss": 1.6592,
478
+ "step": 134
479
+ },
480
+ {
481
+ "epoch": 0.6766169154228856,
482
+ "grad_norm": 1.128212332725525,
483
+ "learning_rate": 1e-05,
484
+ "loss": 2.042,
485
+ "step": 136
486
+ },
487
+ {
488
+ "epoch": 0.6865671641791045,
489
+ "grad_norm": 0.848319947719574,
490
+ "learning_rate": 1e-05,
491
+ "loss": 1.9219,
492
+ "step": 138
493
+ },
494
+ {
495
+ "epoch": 0.6965174129353234,
496
+ "grad_norm": 1.446349859237671,
497
+ "learning_rate": 1e-05,
498
+ "loss": 1.9805,
499
+ "step": 140
500
+ },
501
+ {
502
+ "epoch": 0.7064676616915423,
503
+ "grad_norm": 0.8592532277107239,
504
+ "learning_rate": 1e-05,
505
+ "loss": 1.9404,
506
+ "step": 142
507
+ },
508
+ {
509
+ "epoch": 0.7164179104477612,
510
+ "grad_norm": 0.7463251948356628,
511
+ "learning_rate": 1e-05,
512
+ "loss": 2.0098,
513
+ "step": 144
514
+ },
515
+ {
516
+ "epoch": 0.7263681592039801,
517
+ "grad_norm": 0.6972345113754272,
518
+ "learning_rate": 1e-05,
519
+ "loss": 1.9043,
520
+ "step": 146
521
+ },
522
+ {
523
+ "epoch": 0.736318407960199,
524
+ "grad_norm": 1.0360370874404907,
525
+ "learning_rate": 1e-05,
526
+ "loss": 1.918,
527
+ "step": 148
528
+ },
529
+ {
530
+ "epoch": 0.746268656716418,
531
+ "grad_norm": 0.7613181471824646,
532
+ "learning_rate": 1e-05,
533
+ "loss": 1.8232,
534
+ "step": 150
535
+ },
536
+ {
537
+ "epoch": 0.7562189054726368,
538
+ "grad_norm": 0.8578123450279236,
539
+ "learning_rate": 1e-05,
540
+ "loss": 1.9209,
541
+ "step": 152
542
+ },
543
+ {
544
+ "epoch": 0.7661691542288557,
545
+ "grad_norm": 0.6234486103057861,
546
+ "learning_rate": 1e-05,
547
+ "loss": 1.8105,
548
+ "step": 154
549
+ },
550
+ {
551
+ "epoch": 0.7761194029850746,
552
+ "grad_norm": 0.8788239359855652,
553
+ "learning_rate": 1e-05,
554
+ "loss": 1.7852,
555
+ "step": 156
556
+ },
557
+ {
558
+ "epoch": 0.7860696517412935,
559
+ "grad_norm": 0.5887688994407654,
560
+ "learning_rate": 1e-05,
561
+ "loss": 1.835,
562
+ "step": 158
563
+ },
564
+ {
565
+ "epoch": 0.7960199004975125,
566
+ "grad_norm": 0.5808454155921936,
567
+ "learning_rate": 1e-05,
568
+ "loss": 1.8691,
569
+ "step": 160
570
+ },
571
+ {
572
+ "epoch": 0.8059701492537313,
573
+ "grad_norm": 0.8322702050209045,
574
+ "learning_rate": 1e-05,
575
+ "loss": 1.8652,
576
+ "step": 162
577
+ },
578
+ {
579
+ "epoch": 0.8159203980099502,
580
+ "grad_norm": 0.6851075291633606,
581
+ "learning_rate": 1e-05,
582
+ "loss": 2.043,
583
+ "step": 164
584
+ },
585
+ {
586
+ "epoch": 0.8258706467661692,
587
+ "grad_norm": 0.6591010093688965,
588
+ "learning_rate": 1e-05,
589
+ "loss": 1.8418,
590
+ "step": 166
591
+ },
592
+ {
593
+ "epoch": 0.835820895522388,
594
+ "grad_norm": 0.9328513145446777,
595
+ "learning_rate": 1e-05,
596
+ "loss": 1.8467,
597
+ "step": 168
598
+ },
599
+ {
600
+ "epoch": 0.845771144278607,
601
+ "grad_norm": 0.7491399049758911,
602
+ "learning_rate": 1e-05,
603
+ "loss": 1.7734,
604
+ "step": 170
605
+ },
606
+ {
607
+ "epoch": 0.8557213930348259,
608
+ "grad_norm": 0.6368930339813232,
609
+ "learning_rate": 1e-05,
610
+ "loss": 1.8408,
611
+ "step": 172
612
+ },
613
+ {
614
+ "epoch": 0.8656716417910447,
615
+ "grad_norm": 0.6846456527709961,
616
+ "learning_rate": 1e-05,
617
+ "loss": 1.9053,
618
+ "step": 174
619
+ },
620
+ {
621
+ "epoch": 0.8756218905472637,
622
+ "grad_norm": 0.5860757231712341,
623
+ "learning_rate": 1e-05,
624
+ "loss": 1.8438,
625
+ "step": 176
626
+ },
627
+ {
628
+ "epoch": 0.8855721393034826,
629
+ "grad_norm": 0.6338534355163574,
630
+ "learning_rate": 1e-05,
631
+ "loss": 1.8496,
632
+ "step": 178
633
+ },
634
+ {
635
+ "epoch": 0.8955223880597015,
636
+ "grad_norm": 0.5710776448249817,
637
+ "learning_rate": 1e-05,
638
+ "loss": 1.7627,
639
+ "step": 180
640
+ },
641
+ {
642
+ "epoch": 0.9054726368159204,
643
+ "grad_norm": 0.7385186553001404,
644
+ "learning_rate": 1e-05,
645
+ "loss": 1.791,
646
+ "step": 182
647
+ },
648
+ {
649
+ "epoch": 0.9154228855721394,
650
+ "grad_norm": 0.5550143122673035,
651
+ "learning_rate": 1e-05,
652
+ "loss": 1.7607,
653
+ "step": 184
654
+ },
655
+ {
656
+ "epoch": 0.9253731343283582,
657
+ "grad_norm": 0.6846106648445129,
658
+ "learning_rate": 1e-05,
659
+ "loss": 1.8447,
660
+ "step": 186
661
+ },
662
+ {
663
+ "epoch": 0.9353233830845771,
664
+ "grad_norm": 0.43355798721313477,
665
+ "learning_rate": 1e-05,
666
+ "loss": 1.7822,
667
+ "step": 188
668
+ },
669
+ {
670
+ "epoch": 0.945273631840796,
671
+ "grad_norm": 0.6083195209503174,
672
+ "learning_rate": 1e-05,
673
+ "loss": 1.8418,
674
+ "step": 190
675
+ },
676
+ {
677
+ "epoch": 0.9552238805970149,
678
+ "grad_norm": 1.3910738229751587,
679
+ "learning_rate": 1e-05,
680
+ "loss": 2.0508,
681
+ "step": 192
682
+ },
683
+ {
684
+ "epoch": 0.9651741293532339,
685
+ "grad_norm": 0.6805091500282288,
686
+ "learning_rate": 1e-05,
687
+ "loss": 1.8906,
688
+ "step": 194
689
+ },
690
+ {
691
+ "epoch": 0.9751243781094527,
692
+ "grad_norm": 0.7249168753623962,
693
+ "learning_rate": 1e-05,
694
+ "loss": 1.9424,
695
+ "step": 196
696
+ },
697
+ {
698
+ "epoch": 0.9850746268656716,
699
+ "grad_norm": 0.6910979747772217,
700
+ "learning_rate": 1e-05,
701
+ "loss": 1.8467,
702
+ "step": 198
703
+ },
704
+ {
705
+ "epoch": 0.9950248756218906,
706
+ "grad_norm": 0.5376845598220825,
707
+ "learning_rate": 1e-05,
708
+ "loss": 1.8105,
709
+ "step": 200
710
+ },
711
+ {
712
+ "epoch": 1.0,
713
+ "step": 201,
714
+ "total_flos": 4.207228707510682e+16,
715
+ "train_loss": 1.8549926150497513,
716
+ "train_runtime": 548.5019,
717
+ "train_samples_per_second": 1.466,
718
+ "train_steps_per_second": 0.366
719
+ }
720
+ ],
721
+ "logging_steps": 2,
722
+ "max_steps": 201,
723
+ "num_input_tokens_seen": 0,
724
+ "num_train_epochs": 1,
725
+ "save_steps": 500,
726
+ "stateful_callbacks": {
727
+ "TrainerControl": {
728
+ "args": {
729
+ "should_epoch_stop": false,
730
+ "should_evaluate": false,
731
+ "should_log": false,
732
+ "should_save": false,
733
+ "should_training_stop": false
734
+ },
735
+ "attributes": {}
736
+ }
737
+ },
738
+ "total_flos": 4.207228707510682e+16,
739
+ "train_batch_size": 1,
740
+ "trial_name": null,
741
+ "trial_params": null
742
+ }
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:001f4000001fb736aff464f30f0d782601e31161ce407d88322d15b792880c1a
3
+ size 389170122
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e8b0d276fa4d098986bf6074ca63416ea79b7ec5a916f8e4a01940fc76660ea
3
+ size 389172166
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d437b04572abbed85cb39c65fabeb508959096c75388d5bef11b19e797566c0e
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr125.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8a4da4cc37ea14cc4b5aee2b93783d9e7237d59b56cb34e8efac177e0efb89b
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr150.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10b27d2f9cf8fe79ff703abcfc09de76db592720e6c8c20b3111e702fe7ac719
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr175.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:191069f1dce0ec6078bf81d2ff2e78e66a559f44093cc8003ac4893a73b8e058
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:102805372f12920f3fa2ed2d140065828e2e2d25ab3432d393c954c4ce1d6c98
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr25.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d82e1b14da8bbba34bdd50adbbf25114daf82ba7984c890086c6b7e42fefba2
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr50.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4a1906c160dc30bf726a4f54af8b913ba4043c992dbc5d778a3e5f4cf346c13
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_client_model_round1_itr75.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069a89736b0f91c14b1703db6b7d7c151a1213b54437d9a99f4972fe03ffab7b
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/1_trainer_state.json ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 201,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009950248756218905,
13
+ "grad_norm": 1.3227640390396118,
14
+ "learning_rate": 1e-05,
15
+ "loss": 1.8506,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.01990049751243781,
20
+ "grad_norm": 1.6141871213912964,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.8975,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.029850746268656716,
27
+ "grad_norm": 1.6484756469726562,
28
+ "learning_rate": 1e-05,
29
+ "loss": 1.749,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.03980099502487562,
34
+ "grad_norm": 1.487459659576416,
35
+ "learning_rate": 1e-05,
36
+ "loss": 1.6318,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.04975124378109453,
41
+ "grad_norm": 1.5136044025421143,
42
+ "learning_rate": 1e-05,
43
+ "loss": 1.5537,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.05970149253731343,
48
+ "grad_norm": 1.2955031394958496,
49
+ "learning_rate": 1e-05,
50
+ "loss": 1.623,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.06965174129353234,
55
+ "grad_norm": 1.6998140811920166,
56
+ "learning_rate": 1e-05,
57
+ "loss": 2.0244,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.07960199004975124,
62
+ "grad_norm": 2.039724111557007,
63
+ "learning_rate": 1e-05,
64
+ "loss": 1.3848,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.08955223880597014,
69
+ "grad_norm": 3.201810598373413,
70
+ "learning_rate": 1e-05,
71
+ "loss": 1.9473,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.09950248756218906,
76
+ "grad_norm": 2.301619291305542,
77
+ "learning_rate": 1e-05,
78
+ "loss": 0.9546,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.10945273631840796,
83
+ "grad_norm": 3.1304359436035156,
84
+ "learning_rate": 1e-05,
85
+ "loss": 1.5273,
86
+ "step": 22
87
+ },
88
+ {
89
+ "epoch": 0.11940298507462686,
90
+ "grad_norm": 1.8952662944793701,
91
+ "learning_rate": 1e-05,
92
+ "loss": 0.742,
93
+ "step": 24
94
+ },
95
+ {
96
+ "epoch": 0.12935323383084577,
97
+ "grad_norm": 4.763426780700684,
98
+ "learning_rate": 1e-05,
99
+ "loss": 1.4138,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 0.13930348258706468,
104
+ "grad_norm": 3.3053810596466064,
105
+ "learning_rate": 1e-05,
106
+ "loss": 1.009,
107
+ "step": 28
108
+ },
109
+ {
110
+ "epoch": 0.14925373134328357,
111
+ "grad_norm": 3.5452332496643066,
112
+ "learning_rate": 1e-05,
113
+ "loss": 1.4029,
114
+ "step": 30
115
+ },
116
+ {
117
+ "epoch": 0.15920398009950248,
118
+ "grad_norm": 3.621952533721924,
119
+ "learning_rate": 1e-05,
120
+ "loss": 0.972,
121
+ "step": 32
122
+ },
123
+ {
124
+ "epoch": 0.1691542288557214,
125
+ "grad_norm": 3.8620715141296387,
126
+ "learning_rate": 1e-05,
127
+ "loss": 1.1833,
128
+ "step": 34
129
+ },
130
+ {
131
+ "epoch": 0.1791044776119403,
132
+ "grad_norm": 5.8020195960998535,
133
+ "learning_rate": 1e-05,
134
+ "loss": 1.7115,
135
+ "step": 36
136
+ },
137
+ {
138
+ "epoch": 0.1890547263681592,
139
+ "grad_norm": 3.4086718559265137,
140
+ "learning_rate": 1e-05,
141
+ "loss": 1.1666,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 0.19900497512437812,
146
+ "grad_norm": 2.975222587585449,
147
+ "learning_rate": 1e-05,
148
+ "loss": 0.7056,
149
+ "step": 40
150
+ },
151
+ {
152
+ "epoch": 0.208955223880597,
153
+ "grad_norm": 3.689805507659912,
154
+ "learning_rate": 1e-05,
155
+ "loss": 1.5175,
156
+ "step": 42
157
+ },
158
+ {
159
+ "epoch": 0.21890547263681592,
160
+ "grad_norm": 5.3562912940979,
161
+ "learning_rate": 1e-05,
162
+ "loss": 0.6652,
163
+ "step": 44
164
+ },
165
+ {
166
+ "epoch": 0.22885572139303484,
167
+ "grad_norm": 4.174887657165527,
168
+ "learning_rate": 1e-05,
169
+ "loss": 0.4799,
170
+ "step": 46
171
+ },
172
+ {
173
+ "epoch": 0.23880597014925373,
174
+ "grad_norm": 3.432663679122925,
175
+ "learning_rate": 1e-05,
176
+ "loss": 0.7211,
177
+ "step": 48
178
+ },
179
+ {
180
+ "epoch": 0.24875621890547264,
181
+ "grad_norm": 4.918137073516846,
182
+ "learning_rate": 1e-05,
183
+ "loss": 0.8524,
184
+ "step": 50
185
+ },
186
+ {
187
+ "epoch": 0.25870646766169153,
188
+ "grad_norm": 1.390620470046997,
189
+ "learning_rate": 1e-05,
190
+ "loss": 0.1488,
191
+ "step": 52
192
+ },
193
+ {
194
+ "epoch": 0.26865671641791045,
195
+ "grad_norm": 4.325483322143555,
196
+ "learning_rate": 1e-05,
197
+ "loss": 0.9623,
198
+ "step": 54
199
+ },
200
+ {
201
+ "epoch": 0.27860696517412936,
202
+ "grad_norm": 1.1009166240692139,
203
+ "learning_rate": 1e-05,
204
+ "loss": 0.3015,
205
+ "step": 56
206
+ },
207
+ {
208
+ "epoch": 0.2885572139303483,
209
+ "grad_norm": 5.028674125671387,
210
+ "learning_rate": 1e-05,
211
+ "loss": 0.8217,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 0.29850746268656714,
216
+ "grad_norm": 6.246382713317871,
217
+ "learning_rate": 1e-05,
218
+ "loss": 0.5652,
219
+ "step": 60
220
+ },
221
+ {
222
+ "epoch": 0.30845771144278605,
223
+ "grad_norm": 3.5103182792663574,
224
+ "learning_rate": 1e-05,
225
+ "loss": 0.4114,
226
+ "step": 62
227
+ },
228
+ {
229
+ "epoch": 0.31840796019900497,
230
+ "grad_norm": 5.664974689483643,
231
+ "learning_rate": 1e-05,
232
+ "loss": 0.918,
233
+ "step": 64
234
+ },
235
+ {
236
+ "epoch": 0.3283582089552239,
237
+ "grad_norm": 10.550684928894043,
238
+ "learning_rate": 1e-05,
239
+ "loss": 0.8192,
240
+ "step": 66
241
+ },
242
+ {
243
+ "epoch": 0.3383084577114428,
244
+ "grad_norm": 0.7836717367172241,
245
+ "learning_rate": 1e-05,
246
+ "loss": 0.3381,
247
+ "step": 68
248
+ },
249
+ {
250
+ "epoch": 0.3482587064676617,
251
+ "grad_norm": 1.888235330581665,
252
+ "learning_rate": 1e-05,
253
+ "loss": 0.1491,
254
+ "step": 70
255
+ },
256
+ {
257
+ "epoch": 0.3582089552238806,
258
+ "grad_norm": 7.468411445617676,
259
+ "learning_rate": 1e-05,
260
+ "loss": 0.9326,
261
+ "step": 72
262
+ },
263
+ {
264
+ "epoch": 0.3681592039800995,
265
+ "grad_norm": 8.028440475463867,
266
+ "learning_rate": 1e-05,
267
+ "loss": 1.3224,
268
+ "step": 74
269
+ },
270
+ {
271
+ "epoch": 0.3781094527363184,
272
+ "grad_norm": 10.142037391662598,
273
+ "learning_rate": 1e-05,
274
+ "loss": 0.6093,
275
+ "step": 76
276
+ },
277
+ {
278
+ "epoch": 0.3880597014925373,
279
+ "grad_norm": 4.81419563293457,
280
+ "learning_rate": 1e-05,
281
+ "loss": 0.4134,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 0.39800995024875624,
286
+ "grad_norm": 7.888396739959717,
287
+ "learning_rate": 1e-05,
288
+ "loss": 1.6637,
289
+ "step": 80
290
+ },
291
+ {
292
+ "epoch": 0.4079601990049751,
293
+ "grad_norm": 1.8005106449127197,
294
+ "learning_rate": 1e-05,
295
+ "loss": 1.7748,
296
+ "step": 82
297
+ },
298
+ {
299
+ "epoch": 0.417910447761194,
300
+ "grad_norm": 0.9087793827056885,
301
+ "learning_rate": 1e-05,
302
+ "loss": 0.3785,
303
+ "step": 84
304
+ },
305
+ {
306
+ "epoch": 0.42786069651741293,
307
+ "grad_norm": 4.730865955352783,
308
+ "learning_rate": 1e-05,
309
+ "loss": 0.5779,
310
+ "step": 86
311
+ },
312
+ {
313
+ "epoch": 0.43781094527363185,
314
+ "grad_norm": 8.102535247802734,
315
+ "learning_rate": 1e-05,
316
+ "loss": 0.6772,
317
+ "step": 88
318
+ },
319
+ {
320
+ "epoch": 0.44776119402985076,
321
+ "grad_norm": 6.577178001403809,
322
+ "learning_rate": 1e-05,
323
+ "loss": 1.6124,
324
+ "step": 90
325
+ },
326
+ {
327
+ "epoch": 0.4577114427860697,
328
+ "grad_norm": 1.7844473123550415,
329
+ "learning_rate": 1e-05,
330
+ "loss": 0.4683,
331
+ "step": 92
332
+ },
333
+ {
334
+ "epoch": 0.46766169154228854,
335
+ "grad_norm": 5.1499247550964355,
336
+ "learning_rate": 1e-05,
337
+ "loss": 0.8164,
338
+ "step": 94
339
+ },
340
+ {
341
+ "epoch": 0.47761194029850745,
342
+ "grad_norm": 3.6172220706939697,
343
+ "learning_rate": 1e-05,
344
+ "loss": 0.9205,
345
+ "step": 96
346
+ },
347
+ {
348
+ "epoch": 0.48756218905472637,
349
+ "grad_norm": 1.3999346494674683,
350
+ "learning_rate": 1e-05,
351
+ "loss": 0.4349,
352
+ "step": 98
353
+ },
354
+ {
355
+ "epoch": 0.4975124378109453,
356
+ "grad_norm": 4.574583530426025,
357
+ "learning_rate": 1e-05,
358
+ "loss": 0.5055,
359
+ "step": 100
360
+ },
361
+ {
362
+ "epoch": 0.5074626865671642,
363
+ "grad_norm": 3.0469908714294434,
364
+ "learning_rate": 1e-05,
365
+ "loss": 0.6799,
366
+ "step": 102
367
+ },
368
+ {
369
+ "epoch": 0.5174129353233831,
370
+ "grad_norm": 1.137192726135254,
371
+ "learning_rate": 1e-05,
372
+ "loss": 0.2079,
373
+ "step": 104
374
+ },
375
+ {
376
+ "epoch": 0.527363184079602,
377
+ "grad_norm": 8.398505210876465,
378
+ "learning_rate": 1e-05,
379
+ "loss": 0.8173,
380
+ "step": 106
381
+ },
382
+ {
383
+ "epoch": 0.5373134328358209,
384
+ "grad_norm": 4.197858810424805,
385
+ "learning_rate": 1e-05,
386
+ "loss": 0.811,
387
+ "step": 108
388
+ },
389
+ {
390
+ "epoch": 0.5472636815920398,
391
+ "grad_norm": 3.6865429878234863,
392
+ "learning_rate": 1e-05,
393
+ "loss": 0.9762,
394
+ "step": 110
395
+ },
396
+ {
397
+ "epoch": 0.5572139303482587,
398
+ "grad_norm": 2.2864203453063965,
399
+ "learning_rate": 1e-05,
400
+ "loss": 0.2215,
401
+ "step": 112
402
+ },
403
+ {
404
+ "epoch": 0.5671641791044776,
405
+ "grad_norm": 0.3491150438785553,
406
+ "learning_rate": 1e-05,
407
+ "loss": 0.1091,
408
+ "step": 114
409
+ },
410
+ {
411
+ "epoch": 0.5771144278606966,
412
+ "grad_norm": 0.8744693398475647,
413
+ "learning_rate": 1e-05,
414
+ "loss": 0.1872,
415
+ "step": 116
416
+ },
417
+ {
418
+ "epoch": 0.5870646766169154,
419
+ "grad_norm": 7.648612976074219,
420
+ "learning_rate": 1e-05,
421
+ "loss": 0.4138,
422
+ "step": 118
423
+ },
424
+ {
425
+ "epoch": 0.5970149253731343,
426
+ "grad_norm": 4.401617050170898,
427
+ "learning_rate": 1e-05,
428
+ "loss": 0.988,
429
+ "step": 120
430
+ },
431
+ {
432
+ "epoch": 0.6069651741293532,
433
+ "grad_norm": 5.498955726623535,
434
+ "learning_rate": 1e-05,
435
+ "loss": 0.2035,
436
+ "step": 122
437
+ },
438
+ {
439
+ "epoch": 0.6169154228855721,
440
+ "grad_norm": 7.8499250411987305,
441
+ "learning_rate": 1e-05,
442
+ "loss": 1.3216,
443
+ "step": 124
444
+ },
445
+ {
446
+ "epoch": 0.6268656716417911,
447
+ "grad_norm": 4.023660182952881,
448
+ "learning_rate": 1e-05,
449
+ "loss": 0.8133,
450
+ "step": 126
451
+ },
452
+ {
453
+ "epoch": 0.6368159203980099,
454
+ "grad_norm": 1.412724494934082,
455
+ "learning_rate": 1e-05,
456
+ "loss": 0.3464,
457
+ "step": 128
458
+ },
459
+ {
460
+ "epoch": 0.6467661691542289,
461
+ "grad_norm": 5.523179531097412,
462
+ "learning_rate": 1e-05,
463
+ "loss": 0.6978,
464
+ "step": 130
465
+ },
466
+ {
467
+ "epoch": 0.6567164179104478,
468
+ "grad_norm": 13.196066856384277,
469
+ "learning_rate": 1e-05,
470
+ "loss": 1.3471,
471
+ "step": 132
472
+ },
473
+ {
474
+ "epoch": 0.6666666666666666,
475
+ "grad_norm": 5.424158096313477,
476
+ "learning_rate": 1e-05,
477
+ "loss": 0.6814,
478
+ "step": 134
479
+ },
480
+ {
481
+ "epoch": 0.6766169154228856,
482
+ "grad_norm": 1.4407273530960083,
483
+ "learning_rate": 1e-05,
484
+ "loss": 0.1066,
485
+ "step": 136
486
+ },
487
+ {
488
+ "epoch": 0.6865671641791045,
489
+ "grad_norm": 6.258295059204102,
490
+ "learning_rate": 1e-05,
491
+ "loss": 0.9792,
492
+ "step": 138
493
+ },
494
+ {
495
+ "epoch": 0.6965174129353234,
496
+ "grad_norm": 4.438701152801514,
497
+ "learning_rate": 1e-05,
498
+ "loss": 0.406,
499
+ "step": 140
500
+ },
501
+ {
502
+ "epoch": 0.7064676616915423,
503
+ "grad_norm": 3.083000659942627,
504
+ "learning_rate": 1e-05,
505
+ "loss": 0.915,
506
+ "step": 142
507
+ },
508
+ {
509
+ "epoch": 0.7164179104477612,
510
+ "grad_norm": 0.6187798976898193,
511
+ "learning_rate": 1e-05,
512
+ "loss": 0.0508,
513
+ "step": 144
514
+ },
515
+ {
516
+ "epoch": 0.7263681592039801,
517
+ "grad_norm": 0.6553718447685242,
518
+ "learning_rate": 1e-05,
519
+ "loss": 0.114,
520
+ "step": 146
521
+ },
522
+ {
523
+ "epoch": 0.736318407960199,
524
+ "grad_norm": 0.5623739957809448,
525
+ "learning_rate": 1e-05,
526
+ "loss": 0.134,
527
+ "step": 148
528
+ },
529
+ {
530
+ "epoch": 0.746268656716418,
531
+ "grad_norm": 9.245420455932617,
532
+ "learning_rate": 1e-05,
533
+ "loss": 1.0717,
534
+ "step": 150
535
+ },
536
+ {
537
+ "epoch": 0.7562189054726368,
538
+ "grad_norm": 3.2727997303009033,
539
+ "learning_rate": 1e-05,
540
+ "loss": 0.3121,
541
+ "step": 152
542
+ },
543
+ {
544
+ "epoch": 0.7661691542288557,
545
+ "grad_norm": 2.23881459236145,
546
+ "learning_rate": 1e-05,
547
+ "loss": 0.3009,
548
+ "step": 154
549
+ },
550
+ {
551
+ "epoch": 0.7761194029850746,
552
+ "grad_norm": 3.4759159088134766,
553
+ "learning_rate": 1e-05,
554
+ "loss": 0.3648,
555
+ "step": 156
556
+ },
557
+ {
558
+ "epoch": 0.7860696517412935,
559
+ "grad_norm": 3.8757474422454834,
560
+ "learning_rate": 1e-05,
561
+ "loss": 1.1435,
562
+ "step": 158
563
+ },
564
+ {
565
+ "epoch": 0.7960199004975125,
566
+ "grad_norm": 2.606724262237549,
567
+ "learning_rate": 1e-05,
568
+ "loss": 0.227,
569
+ "step": 160
570
+ },
571
+ {
572
+ "epoch": 0.8059701492537313,
573
+ "grad_norm": 4.037679672241211,
574
+ "learning_rate": 1e-05,
575
+ "loss": 0.3358,
576
+ "step": 162
577
+ },
578
+ {
579
+ "epoch": 0.8159203980099502,
580
+ "grad_norm": 5.446840286254883,
581
+ "learning_rate": 1e-05,
582
+ "loss": 0.2191,
583
+ "step": 164
584
+ },
585
+ {
586
+ "epoch": 0.8258706467661692,
587
+ "grad_norm": 5.227675437927246,
588
+ "learning_rate": 1e-05,
589
+ "loss": 0.2977,
590
+ "step": 166
591
+ },
592
+ {
593
+ "epoch": 0.835820895522388,
594
+ "grad_norm": 3.955387592315674,
595
+ "learning_rate": 1e-05,
596
+ "loss": 0.2853,
597
+ "step": 168
598
+ },
599
+ {
600
+ "epoch": 0.845771144278607,
601
+ "grad_norm": 3.391467332839966,
602
+ "learning_rate": 1e-05,
603
+ "loss": 1.1612,
604
+ "step": 170
605
+ },
606
+ {
607
+ "epoch": 0.8557213930348259,
608
+ "grad_norm": 3.6372454166412354,
609
+ "learning_rate": 1e-05,
610
+ "loss": 0.2642,
611
+ "step": 172
612
+ },
613
+ {
614
+ "epoch": 0.8656716417910447,
615
+ "grad_norm": 6.628920078277588,
616
+ "learning_rate": 1e-05,
617
+ "loss": 0.7231,
618
+ "step": 174
619
+ },
620
+ {
621
+ "epoch": 0.8756218905472637,
622
+ "grad_norm": 0.42457300424575806,
623
+ "learning_rate": 1e-05,
624
+ "loss": 0.0519,
625
+ "step": 176
626
+ },
627
+ {
628
+ "epoch": 0.8855721393034826,
629
+ "grad_norm": 2.6521382331848145,
630
+ "learning_rate": 1e-05,
631
+ "loss": 0.2382,
632
+ "step": 178
633
+ },
634
+ {
635
+ "epoch": 0.8955223880597015,
636
+ "grad_norm": 0.1870870143175125,
637
+ "learning_rate": 1e-05,
638
+ "loss": 0.6789,
639
+ "step": 180
640
+ },
641
+ {
642
+ "epoch": 0.9054726368159204,
643
+ "grad_norm": 0.5534329414367676,
644
+ "learning_rate": 1e-05,
645
+ "loss": 0.0671,
646
+ "step": 182
647
+ },
648
+ {
649
+ "epoch": 0.9154228855721394,
650
+ "grad_norm": 3.863987922668457,
651
+ "learning_rate": 1e-05,
652
+ "loss": 0.273,
653
+ "step": 184
654
+ },
655
+ {
656
+ "epoch": 0.9253731343283582,
657
+ "grad_norm": 0.2802110016345978,
658
+ "learning_rate": 1e-05,
659
+ "loss": 0.0354,
660
+ "step": 186
661
+ },
662
+ {
663
+ "epoch": 0.9353233830845771,
664
+ "grad_norm": 0.616949200630188,
665
+ "learning_rate": 1e-05,
666
+ "loss": 0.446,
667
+ "step": 188
668
+ },
669
+ {
670
+ "epoch": 0.945273631840796,
671
+ "grad_norm": 1.3538764715194702,
672
+ "learning_rate": 1e-05,
673
+ "loss": 0.1169,
674
+ "step": 190
675
+ },
676
+ {
677
+ "epoch": 0.9552238805970149,
678
+ "grad_norm": 7.0314836502075195,
679
+ "learning_rate": 1e-05,
680
+ "loss": 1.3759,
681
+ "step": 192
682
+ },
683
+ {
684
+ "epoch": 0.9651741293532339,
685
+ "grad_norm": 5.94874906539917,
686
+ "learning_rate": 1e-05,
687
+ "loss": 0.3079,
688
+ "step": 194
689
+ },
690
+ {
691
+ "epoch": 0.9751243781094527,
692
+ "grad_norm": 1.551829218864441,
693
+ "learning_rate": 1e-05,
694
+ "loss": 0.1686,
695
+ "step": 196
696
+ },
697
+ {
698
+ "epoch": 0.9850746268656716,
699
+ "grad_norm": 4.5909647941589355,
700
+ "learning_rate": 1e-05,
701
+ "loss": 0.5421,
702
+ "step": 198
703
+ },
704
+ {
705
+ "epoch": 0.9950248756218906,
706
+ "grad_norm": 9.215164184570312,
707
+ "learning_rate": 1e-05,
708
+ "loss": 1.2799,
709
+ "step": 200
710
+ },
711
+ {
712
+ "epoch": 1.0,
713
+ "step": 201,
714
+ "total_flos": 7.039446888428339e+16,
715
+ "train_loss": 0.7667810383127697,
716
+ "train_runtime": 927.2228,
717
+ "train_samples_per_second": 0.867,
718
+ "train_steps_per_second": 0.217
719
+ }
720
+ ],
721
+ "logging_steps": 2,
722
+ "max_steps": 201,
723
+ "num_input_tokens_seen": 0,
724
+ "num_train_epochs": 1,
725
+ "save_steps": 500,
726
+ "stateful_callbacks": {
727
+ "TrainerControl": {
728
+ "args": {
729
+ "should_epoch_stop": false,
730
+ "should_evaluate": false,
731
+ "should_log": false,
732
+ "should_save": false,
733
+ "should_training_stop": false
734
+ },
735
+ "attributes": {}
736
+ }
737
+ },
738
+ "total_flos": 7.039446888428339e+16,
739
+ "train_batch_size": 1,
740
+ "trial_name": null,
741
+ "trial_params": null
742
+ }
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15df646d9de2827fcb2d37743cc4afd5897a456ad3002625680efb7fa8968c28
3
+ size 389170122
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cfb3b8f77215f71c2c566a8e4a38358dba929d768726552fb07421f2b738dca
3
+ size 389172166
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0149b7f0cf6227341143067492664f94dd64ed1cb7b7a4e43c374acd8e70b13a
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr125.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6481de33461bc2f9b1d094560da45eab8e4df65fb58f5f7f6b69d331133ff23e
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr150.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60f721ef60ea29c44173fee1abab36bf5985328a8f3e6c320bedd9aa00c2ff10
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr175.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1f0974504d6f6686847abaf4785397372caac3bec0839b689e6b0e185c3e4bc
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53102b5eda8437b7d9bd9a3adf847ab340de54b86f0be83c930450508841c5dc
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr25.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60245923cf2fbdb65974d542218733dcec2b57ac6d6ad769d6dd21604bee7851
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr50.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6567e7c366e1050b54f9c259bea308d7550e4a1d4fb3a7a28a92697669d48d8
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_client_model_round1_itr75.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:823bb7bf8c020345951abefe505a2fe9a7eb24005059c7425b27e933652c1f9c
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/2_trainer_state.json ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 201,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009950248756218905,
13
+ "grad_norm": 1.650244116783142,
14
+ "learning_rate": 1e-05,
15
+ "loss": 0.6243,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.01990049751243781,
20
+ "grad_norm": 8.55079460144043,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.9013,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.029850746268656716,
27
+ "grad_norm": 2.6233367919921875,
28
+ "learning_rate": 1e-05,
29
+ "loss": 0.8224,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.03980099502487562,
34
+ "grad_norm": 2.576277732849121,
35
+ "learning_rate": 1e-05,
36
+ "loss": 0.9551,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.04975124378109453,
41
+ "grad_norm": 2.1575682163238525,
42
+ "learning_rate": 1e-05,
43
+ "loss": 0.8281,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.05970149253731343,
48
+ "grad_norm": 2.4942679405212402,
49
+ "learning_rate": 1e-05,
50
+ "loss": 1.2534,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.06965174129353234,
55
+ "grad_norm": 2.6151747703552246,
56
+ "learning_rate": 1e-05,
57
+ "loss": 1.4263,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.07960199004975124,
62
+ "grad_norm": 2.6396424770355225,
63
+ "learning_rate": 1e-05,
64
+ "loss": 1.1638,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.08955223880597014,
69
+ "grad_norm": 1.374642014503479,
70
+ "learning_rate": 1e-05,
71
+ "loss": 0.7122,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.09950248756218906,
76
+ "grad_norm": 1.2737079858779907,
77
+ "learning_rate": 1e-05,
78
+ "loss": 0.927,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.10945273631840796,
83
+ "grad_norm": 2.1679718494415283,
84
+ "learning_rate": 1e-05,
85
+ "loss": 1.0146,
86
+ "step": 22
87
+ },
88
+ {
89
+ "epoch": 0.11940298507462686,
90
+ "grad_norm": 1.9755343198776245,
91
+ "learning_rate": 1e-05,
92
+ "loss": 1.0562,
93
+ "step": 24
94
+ },
95
+ {
96
+ "epoch": 0.12935323383084577,
97
+ "grad_norm": 1.8360259532928467,
98
+ "learning_rate": 1e-05,
99
+ "loss": 0.9011,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 0.13930348258706468,
104
+ "grad_norm": 1.3684884309768677,
105
+ "learning_rate": 1e-05,
106
+ "loss": 1.0317,
107
+ "step": 28
108
+ },
109
+ {
110
+ "epoch": 0.14925373134328357,
111
+ "grad_norm": 1.1041371822357178,
112
+ "learning_rate": 1e-05,
113
+ "loss": 0.7749,
114
+ "step": 30
115
+ },
116
+ {
117
+ "epoch": 0.15920398009950248,
118
+ "grad_norm": 1.9447084665298462,
119
+ "learning_rate": 1e-05,
120
+ "loss": 0.916,
121
+ "step": 32
122
+ },
123
+ {
124
+ "epoch": 0.1691542288557214,
125
+ "grad_norm": 1.2489606142044067,
126
+ "learning_rate": 1e-05,
127
+ "loss": 0.7324,
128
+ "step": 34
129
+ },
130
+ {
131
+ "epoch": 0.1791044776119403,
132
+ "grad_norm": 1.8743946552276611,
133
+ "learning_rate": 1e-05,
134
+ "loss": 1.1079,
135
+ "step": 36
136
+ },
137
+ {
138
+ "epoch": 0.1890547263681592,
139
+ "grad_norm": 1.102053165435791,
140
+ "learning_rate": 1e-05,
141
+ "loss": 0.9385,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 0.19900497512437812,
146
+ "grad_norm": 0.8476048707962036,
147
+ "learning_rate": 1e-05,
148
+ "loss": 0.8044,
149
+ "step": 40
150
+ },
151
+ {
152
+ "epoch": 0.208955223880597,
153
+ "grad_norm": 0.9640145301818848,
154
+ "learning_rate": 1e-05,
155
+ "loss": 0.8799,
156
+ "step": 42
157
+ },
158
+ {
159
+ "epoch": 0.21890547263681592,
160
+ "grad_norm": 1.381293535232544,
161
+ "learning_rate": 1e-05,
162
+ "loss": 0.7881,
163
+ "step": 44
164
+ },
165
+ {
166
+ "epoch": 0.22885572139303484,
167
+ "grad_norm": 0.9105871915817261,
168
+ "learning_rate": 1e-05,
169
+ "loss": 1.0356,
170
+ "step": 46
171
+ },
172
+ {
173
+ "epoch": 0.23880597014925373,
174
+ "grad_norm": 2.0499324798583984,
175
+ "learning_rate": 1e-05,
176
+ "loss": 1.1104,
177
+ "step": 48
178
+ },
179
+ {
180
+ "epoch": 0.24875621890547264,
181
+ "grad_norm": 2.4867374897003174,
182
+ "learning_rate": 1e-05,
183
+ "loss": 0.9888,
184
+ "step": 50
185
+ },
186
+ {
187
+ "epoch": 0.25870646766169153,
188
+ "grad_norm": 1.052661418914795,
189
+ "learning_rate": 1e-05,
190
+ "loss": 0.7593,
191
+ "step": 52
192
+ },
193
+ {
194
+ "epoch": 0.26865671641791045,
195
+ "grad_norm": 0.8331828117370605,
196
+ "learning_rate": 1e-05,
197
+ "loss": 0.7891,
198
+ "step": 54
199
+ },
200
+ {
201
+ "epoch": 0.27860696517412936,
202
+ "grad_norm": 1.3015260696411133,
203
+ "learning_rate": 1e-05,
204
+ "loss": 0.8362,
205
+ "step": 56
206
+ },
207
+ {
208
+ "epoch": 0.2885572139303483,
209
+ "grad_norm": 1.1861402988433838,
210
+ "learning_rate": 1e-05,
211
+ "loss": 1.0667,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 0.29850746268656714,
216
+ "grad_norm": 2.5102596282958984,
217
+ "learning_rate": 1e-05,
218
+ "loss": 1.4307,
219
+ "step": 60
220
+ },
221
+ {
222
+ "epoch": 0.30845771144278605,
223
+ "grad_norm": 1.035914659500122,
224
+ "learning_rate": 1e-05,
225
+ "loss": 0.7061,
226
+ "step": 62
227
+ },
228
+ {
229
+ "epoch": 0.31840796019900497,
230
+ "grad_norm": 1.268302321434021,
231
+ "learning_rate": 1e-05,
232
+ "loss": 1.2146,
233
+ "step": 64
234
+ },
235
+ {
236
+ "epoch": 0.3283582089552239,
237
+ "grad_norm": 1.501561164855957,
238
+ "learning_rate": 1e-05,
239
+ "loss": 1.0767,
240
+ "step": 66
241
+ },
242
+ {
243
+ "epoch": 0.3383084577114428,
244
+ "grad_norm": 0.7221049070358276,
245
+ "learning_rate": 1e-05,
246
+ "loss": 0.9985,
247
+ "step": 68
248
+ },
249
+ {
250
+ "epoch": 0.3482587064676617,
251
+ "grad_norm": 0.9676480293273926,
252
+ "learning_rate": 1e-05,
253
+ "loss": 0.9858,
254
+ "step": 70
255
+ },
256
+ {
257
+ "epoch": 0.3582089552238806,
258
+ "grad_norm": 0.8725219368934631,
259
+ "learning_rate": 1e-05,
260
+ "loss": 0.854,
261
+ "step": 72
262
+ },
263
+ {
264
+ "epoch": 0.3681592039800995,
265
+ "grad_norm": 0.7807052731513977,
266
+ "learning_rate": 1e-05,
267
+ "loss": 0.8716,
268
+ "step": 74
269
+ },
270
+ {
271
+ "epoch": 0.3781094527363184,
272
+ "grad_norm": 0.7535572052001953,
273
+ "learning_rate": 1e-05,
274
+ "loss": 0.8459,
275
+ "step": 76
276
+ },
277
+ {
278
+ "epoch": 0.3880597014925373,
279
+ "grad_norm": 1.4078559875488281,
280
+ "learning_rate": 1e-05,
281
+ "loss": 1.105,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 0.39800995024875624,
286
+ "grad_norm": 0.957761287689209,
287
+ "learning_rate": 1e-05,
288
+ "loss": 1.0386,
289
+ "step": 80
290
+ },
291
+ {
292
+ "epoch": 0.4079601990049751,
293
+ "grad_norm": 0.8926840424537659,
294
+ "learning_rate": 1e-05,
295
+ "loss": 0.9438,
296
+ "step": 82
297
+ },
298
+ {
299
+ "epoch": 0.417910447761194,
300
+ "grad_norm": 1.8459022045135498,
301
+ "learning_rate": 1e-05,
302
+ "loss": 0.8696,
303
+ "step": 84
304
+ },
305
+ {
306
+ "epoch": 0.42786069651741293,
307
+ "grad_norm": 1.311964511871338,
308
+ "learning_rate": 1e-05,
309
+ "loss": 0.9351,
310
+ "step": 86
311
+ },
312
+ {
313
+ "epoch": 0.43781094527363185,
314
+ "grad_norm": 1.8599036931991577,
315
+ "learning_rate": 1e-05,
316
+ "loss": 1.1685,
317
+ "step": 88
318
+ },
319
+ {
320
+ "epoch": 0.44776119402985076,
321
+ "grad_norm": 0.9435080289840698,
322
+ "learning_rate": 1e-05,
323
+ "loss": 0.8364,
324
+ "step": 90
325
+ },
326
+ {
327
+ "epoch": 0.4577114427860697,
328
+ "grad_norm": 0.8074705600738525,
329
+ "learning_rate": 1e-05,
330
+ "loss": 1.0356,
331
+ "step": 92
332
+ },
333
+ {
334
+ "epoch": 0.46766169154228854,
335
+ "grad_norm": 0.7916580438613892,
336
+ "learning_rate": 1e-05,
337
+ "loss": 0.8716,
338
+ "step": 94
339
+ },
340
+ {
341
+ "epoch": 0.47761194029850745,
342
+ "grad_norm": 1.0159028768539429,
343
+ "learning_rate": 1e-05,
344
+ "loss": 0.9404,
345
+ "step": 96
346
+ },
347
+ {
348
+ "epoch": 0.48756218905472637,
349
+ "grad_norm": 0.6591694355010986,
350
+ "learning_rate": 1e-05,
351
+ "loss": 0.8706,
352
+ "step": 98
353
+ },
354
+ {
355
+ "epoch": 0.4975124378109453,
356
+ "grad_norm": 1.0024625062942505,
357
+ "learning_rate": 1e-05,
358
+ "loss": 0.9551,
359
+ "step": 100
360
+ },
361
+ {
362
+ "epoch": 0.5074626865671642,
363
+ "grad_norm": 1.3378303050994873,
364
+ "learning_rate": 1e-05,
365
+ "loss": 0.8682,
366
+ "step": 102
367
+ },
368
+ {
369
+ "epoch": 0.5174129353233831,
370
+ "grad_norm": 0.9471051096916199,
371
+ "learning_rate": 1e-05,
372
+ "loss": 0.9287,
373
+ "step": 104
374
+ },
375
+ {
376
+ "epoch": 0.527363184079602,
377
+ "grad_norm": 1.0026133060455322,
378
+ "learning_rate": 1e-05,
379
+ "loss": 1.0786,
380
+ "step": 106
381
+ },
382
+ {
383
+ "epoch": 0.5373134328358209,
384
+ "grad_norm": 0.8960136771202087,
385
+ "learning_rate": 1e-05,
386
+ "loss": 1.0117,
387
+ "step": 108
388
+ },
389
+ {
390
+ "epoch": 0.5472636815920398,
391
+ "grad_norm": 0.5560504794120789,
392
+ "learning_rate": 1e-05,
393
+ "loss": 0.8799,
394
+ "step": 110
395
+ },
396
+ {
397
+ "epoch": 0.5572139303482587,
398
+ "grad_norm": 1.0694944858551025,
399
+ "learning_rate": 1e-05,
400
+ "loss": 0.9097,
401
+ "step": 112
402
+ },
403
+ {
404
+ "epoch": 0.5671641791044776,
405
+ "grad_norm": 0.8429641127586365,
406
+ "learning_rate": 1e-05,
407
+ "loss": 0.9556,
408
+ "step": 114
409
+ },
410
+ {
411
+ "epoch": 0.5771144278606966,
412
+ "grad_norm": 0.6551101207733154,
413
+ "learning_rate": 1e-05,
414
+ "loss": 0.9912,
415
+ "step": 116
416
+ },
417
+ {
418
+ "epoch": 0.5870646766169154,
419
+ "grad_norm": 1.2814500331878662,
420
+ "learning_rate": 1e-05,
421
+ "loss": 0.938,
422
+ "step": 118
423
+ },
424
+ {
425
+ "epoch": 0.5970149253731343,
426
+ "grad_norm": 0.5971533060073853,
427
+ "learning_rate": 1e-05,
428
+ "loss": 0.8203,
429
+ "step": 120
430
+ },
431
+ {
432
+ "epoch": 0.6069651741293532,
433
+ "grad_norm": 0.6333916783332825,
434
+ "learning_rate": 1e-05,
435
+ "loss": 0.7949,
436
+ "step": 122
437
+ },
438
+ {
439
+ "epoch": 0.6169154228855721,
440
+ "grad_norm": 1.5460799932479858,
441
+ "learning_rate": 1e-05,
442
+ "loss": 1.0107,
443
+ "step": 124
444
+ },
445
+ {
446
+ "epoch": 0.6268656716417911,
447
+ "grad_norm": 0.6799649596214294,
448
+ "learning_rate": 1e-05,
449
+ "loss": 0.9155,
450
+ "step": 126
451
+ },
452
+ {
453
+ "epoch": 0.6368159203980099,
454
+ "grad_norm": 0.5778260827064514,
455
+ "learning_rate": 1e-05,
456
+ "loss": 0.9985,
457
+ "step": 128
458
+ },
459
+ {
460
+ "epoch": 0.6467661691542289,
461
+ "grad_norm": 0.7546162605285645,
462
+ "learning_rate": 1e-05,
463
+ "loss": 0.9199,
464
+ "step": 130
465
+ },
466
+ {
467
+ "epoch": 0.6567164179104478,
468
+ "grad_norm": 0.5724232196807861,
469
+ "learning_rate": 1e-05,
470
+ "loss": 0.9399,
471
+ "step": 132
472
+ },
473
+ {
474
+ "epoch": 0.6666666666666666,
475
+ "grad_norm": 1.2401442527770996,
476
+ "learning_rate": 1e-05,
477
+ "loss": 0.8687,
478
+ "step": 134
479
+ },
480
+ {
481
+ "epoch": 0.6766169154228856,
482
+ "grad_norm": 0.8218169212341309,
483
+ "learning_rate": 1e-05,
484
+ "loss": 0.8857,
485
+ "step": 136
486
+ },
487
+ {
488
+ "epoch": 0.6865671641791045,
489
+ "grad_norm": 0.690995991230011,
490
+ "learning_rate": 1e-05,
491
+ "loss": 0.9438,
492
+ "step": 138
493
+ },
494
+ {
495
+ "epoch": 0.6965174129353234,
496
+ "grad_norm": 0.9527719020843506,
497
+ "learning_rate": 1e-05,
498
+ "loss": 1.0239,
499
+ "step": 140
500
+ },
501
+ {
502
+ "epoch": 0.7064676616915423,
503
+ "grad_norm": 0.6030732989311218,
504
+ "learning_rate": 1e-05,
505
+ "loss": 0.9722,
506
+ "step": 142
507
+ },
508
+ {
509
+ "epoch": 0.7164179104477612,
510
+ "grad_norm": 0.6105135679244995,
511
+ "learning_rate": 1e-05,
512
+ "loss": 0.8628,
513
+ "step": 144
514
+ },
515
+ {
516
+ "epoch": 0.7263681592039801,
517
+ "grad_norm": 0.7813135981559753,
518
+ "learning_rate": 1e-05,
519
+ "loss": 0.8213,
520
+ "step": 146
521
+ },
522
+ {
523
+ "epoch": 0.736318407960199,
524
+ "grad_norm": 0.5830418467521667,
525
+ "learning_rate": 1e-05,
526
+ "loss": 0.834,
527
+ "step": 148
528
+ },
529
+ {
530
+ "epoch": 0.746268656716418,
531
+ "grad_norm": 1.0577740669250488,
532
+ "learning_rate": 1e-05,
533
+ "loss": 0.9692,
534
+ "step": 150
535
+ },
536
+ {
537
+ "epoch": 0.7562189054726368,
538
+ "grad_norm": 0.813637912273407,
539
+ "learning_rate": 1e-05,
540
+ "loss": 0.8735,
541
+ "step": 152
542
+ },
543
+ {
544
+ "epoch": 0.7661691542288557,
545
+ "grad_norm": 0.5650802254676819,
546
+ "learning_rate": 1e-05,
547
+ "loss": 1.0767,
548
+ "step": 154
549
+ },
550
+ {
551
+ "epoch": 0.7761194029850746,
552
+ "grad_norm": 0.7651078104972839,
553
+ "learning_rate": 1e-05,
554
+ "loss": 0.8862,
555
+ "step": 156
556
+ },
557
+ {
558
+ "epoch": 0.7860696517412935,
559
+ "grad_norm": 0.5638197064399719,
560
+ "learning_rate": 1e-05,
561
+ "loss": 1.0239,
562
+ "step": 158
563
+ },
564
+ {
565
+ "epoch": 0.7960199004975125,
566
+ "grad_norm": 0.5717598795890808,
567
+ "learning_rate": 1e-05,
568
+ "loss": 0.9868,
569
+ "step": 160
570
+ },
571
+ {
572
+ "epoch": 0.8059701492537313,
573
+ "grad_norm": 0.9155240058898926,
574
+ "learning_rate": 1e-05,
575
+ "loss": 0.8545,
576
+ "step": 162
577
+ },
578
+ {
579
+ "epoch": 0.8159203980099502,
580
+ "grad_norm": 0.673218309879303,
581
+ "learning_rate": 1e-05,
582
+ "loss": 0.8979,
583
+ "step": 164
584
+ },
585
+ {
586
+ "epoch": 0.8258706467661692,
587
+ "grad_norm": 0.933534562587738,
588
+ "learning_rate": 1e-05,
589
+ "loss": 0.958,
590
+ "step": 166
591
+ },
592
+ {
593
+ "epoch": 0.835820895522388,
594
+ "grad_norm": 0.6906251907348633,
595
+ "learning_rate": 1e-05,
596
+ "loss": 0.8301,
597
+ "step": 168
598
+ },
599
+ {
600
+ "epoch": 0.845771144278607,
601
+ "grad_norm": 0.9870006442070007,
602
+ "learning_rate": 1e-05,
603
+ "loss": 0.8652,
604
+ "step": 170
605
+ },
606
+ {
607
+ "epoch": 0.8557213930348259,
608
+ "grad_norm": 1.019015908241272,
609
+ "learning_rate": 1e-05,
610
+ "loss": 0.9165,
611
+ "step": 172
612
+ },
613
+ {
614
+ "epoch": 0.8656716417910447,
615
+ "grad_norm": 0.997454047203064,
616
+ "learning_rate": 1e-05,
617
+ "loss": 0.8403,
618
+ "step": 174
619
+ },
620
+ {
621
+ "epoch": 0.8756218905472637,
622
+ "grad_norm": 1.6273800134658813,
623
+ "learning_rate": 1e-05,
624
+ "loss": 0.957,
625
+ "step": 176
626
+ },
627
+ {
628
+ "epoch": 0.8855721393034826,
629
+ "grad_norm": 0.8904904127120972,
630
+ "learning_rate": 1e-05,
631
+ "loss": 0.8452,
632
+ "step": 178
633
+ },
634
+ {
635
+ "epoch": 0.8955223880597015,
636
+ "grad_norm": 0.7554193139076233,
637
+ "learning_rate": 1e-05,
638
+ "loss": 0.7539,
639
+ "step": 180
640
+ },
641
+ {
642
+ "epoch": 0.9054726368159204,
643
+ "grad_norm": 1.757675051689148,
644
+ "learning_rate": 1e-05,
645
+ "loss": 0.9287,
646
+ "step": 182
647
+ },
648
+ {
649
+ "epoch": 0.9154228855721394,
650
+ "grad_norm": 0.8368033170700073,
651
+ "learning_rate": 1e-05,
652
+ "loss": 0.8506,
653
+ "step": 184
654
+ },
655
+ {
656
+ "epoch": 0.9253731343283582,
657
+ "grad_norm": 0.956574022769928,
658
+ "learning_rate": 1e-05,
659
+ "loss": 0.8433,
660
+ "step": 186
661
+ },
662
+ {
663
+ "epoch": 0.9353233830845771,
664
+ "grad_norm": 1.2842135429382324,
665
+ "learning_rate": 1e-05,
666
+ "loss": 0.8799,
667
+ "step": 188
668
+ },
669
+ {
670
+ "epoch": 0.945273631840796,
671
+ "grad_norm": 1.017176628112793,
672
+ "learning_rate": 1e-05,
673
+ "loss": 0.8638,
674
+ "step": 190
675
+ },
676
+ {
677
+ "epoch": 0.9552238805970149,
678
+ "grad_norm": 1.4684029817581177,
679
+ "learning_rate": 1e-05,
680
+ "loss": 0.947,
681
+ "step": 192
682
+ },
683
+ {
684
+ "epoch": 0.9651741293532339,
685
+ "grad_norm": 1.4607092142105103,
686
+ "learning_rate": 1e-05,
687
+ "loss": 0.9966,
688
+ "step": 194
689
+ },
690
+ {
691
+ "epoch": 0.9751243781094527,
692
+ "grad_norm": 1.6244029998779297,
693
+ "learning_rate": 1e-05,
694
+ "loss": 0.6952,
695
+ "step": 196
696
+ },
697
+ {
698
+ "epoch": 0.9850746268656716,
699
+ "grad_norm": 1.253040075302124,
700
+ "learning_rate": 1e-05,
701
+ "loss": 0.9458,
702
+ "step": 198
703
+ },
704
+ {
705
+ "epoch": 0.9950248756218906,
706
+ "grad_norm": 1.4702417850494385,
707
+ "learning_rate": 1e-05,
708
+ "loss": 0.9985,
709
+ "step": 200
710
+ },
711
+ {
712
+ "epoch": 1.0,
713
+ "step": 201,
714
+ "total_flos": 1.9968928570671104e+16,
715
+ "train_loss": 0.9424890926228234,
716
+ "train_runtime": 463.9481,
717
+ "train_samples_per_second": 1.733,
718
+ "train_steps_per_second": 0.433
719
+ }
720
+ ],
721
+ "logging_steps": 2,
722
+ "max_steps": 201,
723
+ "num_input_tokens_seen": 0,
724
+ "num_train_epochs": 1,
725
+ "save_steps": 500,
726
+ "stateful_callbacks": {
727
+ "TrainerControl": {
728
+ "args": {
729
+ "should_epoch_stop": false,
730
+ "should_evaluate": false,
731
+ "should_log": false,
732
+ "should_save": false,
733
+ "should_training_stop": false
734
+ },
735
+ "attributes": {}
736
+ }
737
+ },
738
+ "total_flos": 1.9968928570671104e+16,
739
+ "train_batch_size": 1,
740
+ "trial_name": null,
741
+ "trial_params": null
742
+ }
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30cc75502c7f91affcd69bae806c3a2fe927b1221cd916baa5bea77645e25e78
3
+ size 389170122
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a601c0dba0cc0f6c171ddac346957dc7e71b334cb4bbf3956e3bde1916356a6
3
+ size 389172166
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:101636b78258e532fc44f1cee697217979c28b0d326484f53cf6c0abce3c37f7
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr125.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83f89ec80d240b002848de09dac06b77d84b42e1e37182bdb52d999334b81ad8
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr150.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f987a5f3a9d1914fe9208860d108d875a3959e3d2381703a5f24d86b47c9d53
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr175.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6415975ce46cd58092b9080498c7c786dc519f47e055cea660db53da5c6f9111
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7556d4da4dd89b709e46b2a2f07409fd60900bb2f19d815cf84e74c51b1732a
3
+ size 389172958
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr25.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5363c67f1cb193fd3ddc8d555a59035a42e9696642c620732b30151b2ac03ecd
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr50.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16702cf0d38a1e9b1e390bdb1b52c302265fc3095e097bacff50f0802b9c0bef
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_client_model_round1_itr75.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fe002259d80ce003b73b25fa6f46c45a5e4aae678bce953929a2360c7d0acec
3
+ size 389172562
client_states_sft_bs4_saveoptim_lr1e-5_sc8_1tasks_1rounds_fixitr201_T0125_decay099_sft_r20_20/3_trainer_state.json ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 201,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009950248756218905,
13
+ "grad_norm": 3.4984848499298096,
14
+ "learning_rate": 1e-05,
15
+ "loss": 2.4546,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.01990049751243781,
20
+ "grad_norm": 6.905043125152588,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.5471,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.029850746268656716,
27
+ "grad_norm": 3.0716166496276855,
28
+ "learning_rate": 1e-05,
29
+ "loss": 1.7067,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.03980099502487562,
34
+ "grad_norm": 3.0438055992126465,
35
+ "learning_rate": 1e-05,
36
+ "loss": 1.0265,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.04975124378109453,
41
+ "grad_norm": 5.619898319244385,
42
+ "learning_rate": 1e-05,
43
+ "loss": 1.8415,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.05970149253731343,
48
+ "grad_norm": 5.910048961639404,
49
+ "learning_rate": 1e-05,
50
+ "loss": 2.0034,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.06965174129353234,
55
+ "grad_norm": 4.535861492156982,
56
+ "learning_rate": 1e-05,
57
+ "loss": 1.6565,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.07960199004975124,
62
+ "grad_norm": 3.565920114517212,
63
+ "learning_rate": 1e-05,
64
+ "loss": 1.3917,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.08955223880597014,
69
+ "grad_norm": 3.367178440093994,
70
+ "learning_rate": 1e-05,
71
+ "loss": 1.2771,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.09950248756218906,
76
+ "grad_norm": 3.4126410484313965,
77
+ "learning_rate": 1e-05,
78
+ "loss": 1.2509,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.10945273631840796,
83
+ "grad_norm": 2.830953598022461,
84
+ "learning_rate": 1e-05,
85
+ "loss": 0.8848,
86
+ "step": 22
87
+ },
88
+ {
89
+ "epoch": 0.11940298507462686,
90
+ "grad_norm": 2.7264418601989746,
91
+ "learning_rate": 1e-05,
92
+ "loss": 1.4029,
93
+ "step": 24
94
+ },
95
+ {
96
+ "epoch": 0.12935323383084577,
97
+ "grad_norm": 4.596713066101074,
98
+ "learning_rate": 1e-05,
99
+ "loss": 1.8621,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 0.13930348258706468,
104
+ "grad_norm": 3.3301849365234375,
105
+ "learning_rate": 1e-05,
106
+ "loss": 1.1167,
107
+ "step": 28
108
+ },
109
+ {
110
+ "epoch": 0.14925373134328357,
111
+ "grad_norm": 3.9882688522338867,
112
+ "learning_rate": 1e-05,
113
+ "loss": 1.4297,
114
+ "step": 30
115
+ },
116
+ {
117
+ "epoch": 0.15920398009950248,
118
+ "grad_norm": 0.8713480830192566,
119
+ "learning_rate": 1e-05,
120
+ "loss": 0.8509,
121
+ "step": 32
122
+ },
123
+ {
124
+ "epoch": 0.1691542288557214,
125
+ "grad_norm": 5.365267753601074,
126
+ "learning_rate": 1e-05,
127
+ "loss": 1.2606,
128
+ "step": 34
129
+ },
130
+ {
131
+ "epoch": 0.1791044776119403,
132
+ "grad_norm": 3.6241588592529297,
133
+ "learning_rate": 1e-05,
134
+ "loss": 1.1967,
135
+ "step": 36
136
+ },
137
+ {
138
+ "epoch": 0.1890547263681592,
139
+ "grad_norm": 2.176697254180908,
140
+ "learning_rate": 1e-05,
141
+ "loss": 0.8659,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 0.19900497512437812,
146
+ "grad_norm": 3.8057022094726562,
147
+ "learning_rate": 1e-05,
148
+ "loss": 1.3048,
149
+ "step": 40
150
+ },
151
+ {
152
+ "epoch": 0.208955223880597,
153
+ "grad_norm": 5.057631015777588,
154
+ "learning_rate": 1e-05,
155
+ "loss": 0.8538,
156
+ "step": 42
157
+ },
158
+ {
159
+ "epoch": 0.21890547263681592,
160
+ "grad_norm": 3.2521402835845947,
161
+ "learning_rate": 1e-05,
162
+ "loss": 0.967,
163
+ "step": 44
164
+ },
165
+ {
166
+ "epoch": 0.22885572139303484,
167
+ "grad_norm": 4.557372570037842,
168
+ "learning_rate": 1e-05,
169
+ "loss": 1.2981,
170
+ "step": 46
171
+ },
172
+ {
173
+ "epoch": 0.23880597014925373,
174
+ "grad_norm": 4.477596759796143,
175
+ "learning_rate": 1e-05,
176
+ "loss": 0.5941,
177
+ "step": 48
178
+ },
179
+ {
180
+ "epoch": 0.24875621890547264,
181
+ "grad_norm": 3.518733263015747,
182
+ "learning_rate": 1e-05,
183
+ "loss": 1.6752,
184
+ "step": 50
185
+ },
186
+ {
187
+ "epoch": 0.25870646766169153,
188
+ "grad_norm": 7.677566051483154,
189
+ "learning_rate": 1e-05,
190
+ "loss": 1.4171,
191
+ "step": 52
192
+ },
193
+ {
194
+ "epoch": 0.26865671641791045,
195
+ "grad_norm": 2.7589364051818848,
196
+ "learning_rate": 1e-05,
197
+ "loss": 0.5706,
198
+ "step": 54
199
+ },
200
+ {
201
+ "epoch": 0.27860696517412936,
202
+ "grad_norm": 2.9053287506103516,
203
+ "learning_rate": 1e-05,
204
+ "loss": 1.2631,
205
+ "step": 56
206
+ },
207
+ {
208
+ "epoch": 0.2885572139303483,
209
+ "grad_norm": 8.476534843444824,
210
+ "learning_rate": 1e-05,
211
+ "loss": 1.4675,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 0.29850746268656714,
216
+ "grad_norm": 7.858043670654297,
217
+ "learning_rate": 1e-05,
218
+ "loss": 2.94,
219
+ "step": 60
220
+ },
221
+ {
222
+ "epoch": 0.30845771144278605,
223
+ "grad_norm": 3.2298996448516846,
224
+ "learning_rate": 1e-05,
225
+ "loss": 0.5,
226
+ "step": 62
227
+ },
228
+ {
229
+ "epoch": 0.31840796019900497,
230
+ "grad_norm": 5.179959774017334,
231
+ "learning_rate": 1e-05,
232
+ "loss": 0.8592,
233
+ "step": 64
234
+ },
235
+ {
236
+ "epoch": 0.3283582089552239,
237
+ "grad_norm": 10.46849536895752,
238
+ "learning_rate": 1e-05,
239
+ "loss": 2.096,
240
+ "step": 66
241
+ },
242
+ {
243
+ "epoch": 0.3383084577114428,
244
+ "grad_norm": 1.686103105545044,
245
+ "learning_rate": 1e-05,
246
+ "loss": 1.3382,
247
+ "step": 68
248
+ },
249
+ {
250
+ "epoch": 0.3482587064676617,
251
+ "grad_norm": 3.9357430934906006,
252
+ "learning_rate": 1e-05,
253
+ "loss": 1.5427,
254
+ "step": 70
255
+ },
256
+ {
257
+ "epoch": 0.3582089552238806,
258
+ "grad_norm": 6.08726692199707,
259
+ "learning_rate": 1e-05,
260
+ "loss": 1.7477,
261
+ "step": 72
262
+ },
263
+ {
264
+ "epoch": 0.3681592039800995,
265
+ "grad_norm": 4.0442376136779785,
266
+ "learning_rate": 1e-05,
267
+ "loss": 1.3599,
268
+ "step": 74
269
+ },
270
+ {
271
+ "epoch": 0.3781094527363184,
272
+ "grad_norm": 4.393518447875977,
273
+ "learning_rate": 1e-05,
274
+ "loss": 1.3049,
275
+ "step": 76
276
+ },
277
+ {
278
+ "epoch": 0.3880597014925373,
279
+ "grad_norm": 4.748154163360596,
280
+ "learning_rate": 1e-05,
281
+ "loss": 1.2163,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 0.39800995024875624,
286
+ "grad_norm": 2.4480156898498535,
287
+ "learning_rate": 1e-05,
288
+ "loss": 1.5779,
289
+ "step": 80
290
+ },
291
+ {
292
+ "epoch": 0.4079601990049751,
293
+ "grad_norm": 4.978269577026367,
294
+ "learning_rate": 1e-05,
295
+ "loss": 1.5108,
296
+ "step": 82
297
+ },
298
+ {
299
+ "epoch": 0.417910447761194,
300
+ "grad_norm": 8.956459999084473,
301
+ "learning_rate": 1e-05,
302
+ "loss": 1.2402,
303
+ "step": 84
304
+ },
305
+ {
306
+ "epoch": 0.42786069651741293,
307
+ "grad_norm": 3.989821434020996,
308
+ "learning_rate": 1e-05,
309
+ "loss": 0.8792,
310
+ "step": 86
311
+ },
312
+ {
313
+ "epoch": 0.43781094527363185,
314
+ "grad_norm": 7.240758895874023,
315
+ "learning_rate": 1e-05,
316
+ "loss": 1.8413,
317
+ "step": 88
318
+ },
319
+ {
320
+ "epoch": 0.44776119402985076,
321
+ "grad_norm": 1.3386205434799194,
322
+ "learning_rate": 1e-05,
323
+ "loss": 0.5992,
324
+ "step": 90
325
+ },
326
+ {
327
+ "epoch": 0.4577114427860697,
328
+ "grad_norm": 5.485062599182129,
329
+ "learning_rate": 1e-05,
330
+ "loss": 0.9109,
331
+ "step": 92
332
+ },
333
+ {
334
+ "epoch": 0.46766169154228854,
335
+ "grad_norm": 5.22202205657959,
336
+ "learning_rate": 1e-05,
337
+ "loss": 0.8113,
338
+ "step": 94
339
+ },
340
+ {
341
+ "epoch": 0.47761194029850745,
342
+ "grad_norm": 2.953240156173706,
343
+ "learning_rate": 1e-05,
344
+ "loss": 1.0452,
345
+ "step": 96
346
+ },
347
+ {
348
+ "epoch": 0.48756218905472637,
349
+ "grad_norm": 3.98473858833313,
350
+ "learning_rate": 1e-05,
351
+ "loss": 1.4785,
352
+ "step": 98
353
+ },
354
+ {
355
+ "epoch": 0.4975124378109453,
356
+ "grad_norm": 0.524372935295105,
357
+ "learning_rate": 1e-05,
358
+ "loss": 1.0392,
359
+ "step": 100
360
+ },
361
+ {
362
+ "epoch": 0.5074626865671642,
363
+ "grad_norm": 5.757716655731201,
364
+ "learning_rate": 1e-05,
365
+ "loss": 1.507,
366
+ "step": 102
367
+ },
368
+ {
369
+ "epoch": 0.5174129353233831,
370
+ "grad_norm": 3.7972941398620605,
371
+ "learning_rate": 1e-05,
372
+ "loss": 2.0817,
373
+ "step": 104
374
+ },
375
+ {
376
+ "epoch": 0.527363184079602,
377
+ "grad_norm": 2.1441078186035156,
378
+ "learning_rate": 1e-05,
379
+ "loss": 1.1439,
380
+ "step": 106
381
+ },
382
+ {
383
+ "epoch": 0.5373134328358209,
384
+ "grad_norm": 4.19448184967041,
385
+ "learning_rate": 1e-05,
386
+ "loss": 0.5984,
387
+ "step": 108
388
+ },
389
+ {
390
+ "epoch": 0.5472636815920398,
391
+ "grad_norm": 2.471952438354492,
392
+ "learning_rate": 1e-05,
393
+ "loss": 0.6786,
394
+ "step": 110
395
+ },
396
+ {
397
+ "epoch": 0.5572139303482587,
398
+ "grad_norm": 3.152708053588867,
399
+ "learning_rate": 1e-05,
400
+ "loss": 0.441,
401
+ "step": 112
402
+ },
403
+ {
404
+ "epoch": 0.5671641791044776,
405
+ "grad_norm": 5.703269004821777,
406
+ "learning_rate": 1e-05,
407
+ "loss": 0.662,
408
+ "step": 114
409
+ },
410
+ {
411
+ "epoch": 0.5771144278606966,
412
+ "grad_norm": 4.732028007507324,
413
+ "learning_rate": 1e-05,
414
+ "loss": 0.7527,
415
+ "step": 116
416
+ },
417
+ {
418
+ "epoch": 0.5870646766169154,
419
+ "grad_norm": 10.553655624389648,
420
+ "learning_rate": 1e-05,
421
+ "loss": 2.7411,
422
+ "step": 118
423
+ },
424
+ {
425
+ "epoch": 0.5970149253731343,
426
+ "grad_norm": 6.645718574523926,
427
+ "learning_rate": 1e-05,
428
+ "loss": 1.6926,
429
+ "step": 120
430
+ },
431
+ {
432
+ "epoch": 0.6069651741293532,
433
+ "grad_norm": 2.5227789878845215,
434
+ "learning_rate": 1e-05,
435
+ "loss": 1.2725,
436
+ "step": 122
437
+ },
438
+ {
439
+ "epoch": 0.6169154228855721,
440
+ "grad_norm": 4.154623508453369,
441
+ "learning_rate": 1e-05,
442
+ "loss": 1.1329,
443
+ "step": 124
444
+ },
445
+ {
446
+ "epoch": 0.6268656716417911,
447
+ "grad_norm": 3.382685661315918,
448
+ "learning_rate": 1e-05,
449
+ "loss": 0.5432,
450
+ "step": 126
451
+ },
452
+ {
453
+ "epoch": 0.6368159203980099,
454
+ "grad_norm": 11.674966812133789,
455
+ "learning_rate": 1e-05,
456
+ "loss": 0.6193,
457
+ "step": 128
458
+ },
459
+ {
460
+ "epoch": 0.6467661691542289,
461
+ "grad_norm": 3.64872145652771,
462
+ "learning_rate": 1e-05,
463
+ "loss": 0.9732,
464
+ "step": 130
465
+ },
466
+ {
467
+ "epoch": 0.6567164179104478,
468
+ "grad_norm": 6.72369384765625,
469
+ "learning_rate": 1e-05,
470
+ "loss": 1.1707,
471
+ "step": 132
472
+ },
473
+ {
474
+ "epoch": 0.6666666666666666,
475
+ "grad_norm": 5.803842067718506,
476
+ "learning_rate": 1e-05,
477
+ "loss": 1.6948,
478
+ "step": 134
479
+ },
480
+ {
481
+ "epoch": 0.6766169154228856,
482
+ "grad_norm": 6.422171592712402,
483
+ "learning_rate": 1e-05,
484
+ "loss": 0.734,
485
+ "step": 136
486
+ },
487
+ {
488
+ "epoch": 0.6865671641791045,
489
+ "grad_norm": 11.723003387451172,
490
+ "learning_rate": 1e-05,
491
+ "loss": 2.1305,
492
+ "step": 138
493
+ },
494
+ {
495
+ "epoch": 0.6965174129353234,
496
+ "grad_norm": 4.657910346984863,
497
+ "learning_rate": 1e-05,
498
+ "loss": 1.7126,
499
+ "step": 140
500
+ },
501
+ {
502
+ "epoch": 0.7064676616915423,
503
+ "grad_norm": 6.460371494293213,
504
+ "learning_rate": 1e-05,
505
+ "loss": 1.6042,
506
+ "step": 142
507
+ },
508
+ {
509
+ "epoch": 0.7164179104477612,
510
+ "grad_norm": 2.946357250213623,
511
+ "learning_rate": 1e-05,
512
+ "loss": 1.3644,
513
+ "step": 144
514
+ },
515
+ {
516
+ "epoch": 0.7263681592039801,
517
+ "grad_norm": 3.000802993774414,
518
+ "learning_rate": 1e-05,
519
+ "loss": 0.7483,
520
+ "step": 146
521
+ },
522
+ {
523
+ "epoch": 0.736318407960199,
524
+ "grad_norm": 5.282987594604492,
525
+ "learning_rate": 1e-05,
526
+ "loss": 1.0917,
527
+ "step": 148
528
+ },
529
+ {
530
+ "epoch": 0.746268656716418,
531
+ "grad_norm": 0.4844614565372467,
532
+ "learning_rate": 1e-05,
533
+ "loss": 0.3645,
534
+ "step": 150
535
+ },
536
+ {
537
+ "epoch": 0.7562189054726368,
538
+ "grad_norm": 4.852270126342773,
539
+ "learning_rate": 1e-05,
540
+ "loss": 1.1297,
541
+ "step": 152
542
+ },
543
+ {
544
+ "epoch": 0.7661691542288557,
545
+ "grad_norm": 3.115569829940796,
546
+ "learning_rate": 1e-05,
547
+ "loss": 1.2097,
548
+ "step": 154
549
+ },
550
+ {
551
+ "epoch": 0.7761194029850746,
552
+ "grad_norm": 4.892626762390137,
553
+ "learning_rate": 1e-05,
554
+ "loss": 0.8909,
555
+ "step": 156
556
+ },
557
+ {
558
+ "epoch": 0.7860696517412935,
559
+ "grad_norm": 4.782143592834473,
560
+ "learning_rate": 1e-05,
561
+ "loss": 0.7592,
562
+ "step": 158
563
+ },
564
+ {
565
+ "epoch": 0.7960199004975125,
566
+ "grad_norm": 1.9109928607940674,
567
+ "learning_rate": 1e-05,
568
+ "loss": 0.4162,
569
+ "step": 160
570
+ },
571
+ {
572
+ "epoch": 0.8059701492537313,
573
+ "grad_norm": 8.50790786743164,
574
+ "learning_rate": 1e-05,
575
+ "loss": 2.4984,
576
+ "step": 162
577
+ },
578
+ {
579
+ "epoch": 0.8159203980099502,
580
+ "grad_norm": 3.661428213119507,
581
+ "learning_rate": 1e-05,
582
+ "loss": 1.0022,
583
+ "step": 164
584
+ },
585
+ {
586
+ "epoch": 0.8258706467661692,
587
+ "grad_norm": 5.116476058959961,
588
+ "learning_rate": 1e-05,
589
+ "loss": 0.9979,
590
+ "step": 166
591
+ },
592
+ {
593
+ "epoch": 0.835820895522388,
594
+ "grad_norm": 6.289146900177002,
595
+ "learning_rate": 1e-05,
596
+ "loss": 0.8444,
597
+ "step": 168
598
+ },
599
+ {
600
+ "epoch": 0.845771144278607,
601
+ "grad_norm": 2.4712114334106445,
602
+ "learning_rate": 1e-05,
603
+ "loss": 0.7441,
604
+ "step": 170
605
+ },
606
+ {
607
+ "epoch": 0.8557213930348259,
608
+ "grad_norm": 4.545423984527588,
609
+ "learning_rate": 1e-05,
610
+ "loss": 0.4958,
611
+ "step": 172
612
+ },
613
+ {
614
+ "epoch": 0.8656716417910447,
615
+ "grad_norm": 2.7957515716552734,
616
+ "learning_rate": 1e-05,
617
+ "loss": 0.6294,
618
+ "step": 174
619
+ },
620
+ {
621
+ "epoch": 0.8756218905472637,
622
+ "grad_norm": 5.590768337249756,
623
+ "learning_rate": 1e-05,
624
+ "loss": 0.4968,
625
+ "step": 176
626
+ },
627
+ {
628
+ "epoch": 0.8855721393034826,
629
+ "grad_norm": 5.343775749206543,
630
+ "learning_rate": 1e-05,
631
+ "loss": 0.4072,
632
+ "step": 178
633
+ },
634
+ {
635
+ "epoch": 0.8955223880597015,
636
+ "grad_norm": 8.360288619995117,
637
+ "learning_rate": 1e-05,
638
+ "loss": 1.0587,
639
+ "step": 180
640
+ },
641
+ {
642
+ "epoch": 0.9054726368159204,
643
+ "grad_norm": 3.4952993392944336,
644
+ "learning_rate": 1e-05,
645
+ "loss": 0.742,
646
+ "step": 182
647
+ },
648
+ {
649
+ "epoch": 0.9154228855721394,
650
+ "grad_norm": 5.865167617797852,
651
+ "learning_rate": 1e-05,
652
+ "loss": 2.4214,
653
+ "step": 184
654
+ },
655
+ {
656
+ "epoch": 0.9253731343283582,
657
+ "grad_norm": 3.2211215496063232,
658
+ "learning_rate": 1e-05,
659
+ "loss": 1.2642,
660
+ "step": 186
661
+ },
662
+ {
663
+ "epoch": 0.9353233830845771,
664
+ "grad_norm": 4.869852066040039,
665
+ "learning_rate": 1e-05,
666
+ "loss": 1.7789,
667
+ "step": 188
668
+ },
669
+ {
670
+ "epoch": 0.945273631840796,
671
+ "grad_norm": 9.350594520568848,
672
+ "learning_rate": 1e-05,
673
+ "loss": 0.9147,
674
+ "step": 190
675
+ },
676
+ {
677
+ "epoch": 0.9552238805970149,
678
+ "grad_norm": 2.942012071609497,
679
+ "learning_rate": 1e-05,
680
+ "loss": 0.6123,
681
+ "step": 192
682
+ },
683
+ {
684
+ "epoch": 0.9651741293532339,
685
+ "grad_norm": 5.4307332038879395,
686
+ "learning_rate": 1e-05,
687
+ "loss": 1.2541,
688
+ "step": 194
689
+ },
690
+ {
691
+ "epoch": 0.9751243781094527,
692
+ "grad_norm": 4.55341911315918,
693
+ "learning_rate": 1e-05,
694
+ "loss": 1.9212,
695
+ "step": 196
696
+ },
697
+ {
698
+ "epoch": 0.9850746268656716,
699
+ "grad_norm": 5.160548210144043,
700
+ "learning_rate": 1e-05,
701
+ "loss": 0.4836,
702
+ "step": 198
703
+ },
704
+ {
705
+ "epoch": 0.9950248756218906,
706
+ "grad_norm": 0.6315759420394897,
707
+ "learning_rate": 1e-05,
708
+ "loss": 0.063,
709
+ "step": 200
710
+ },
711
+ {
712
+ "epoch": 1.0,
713
+ "step": 201,
714
+ "total_flos": 3.816239406461747e+16,
715
+ "train_loss": 1.217404284880529,
716
+ "train_runtime": 508.9307,
717
+ "train_samples_per_second": 1.58,
718
+ "train_steps_per_second": 0.395
719
+ }
720
+ ],
721
+ "logging_steps": 2,
722
+ "max_steps": 201,
723
+ "num_input_tokens_seen": 0,
724
+ "num_train_epochs": 1,
725
+ "save_steps": 500,
726
+ "stateful_callbacks": {
727
+ "TrainerControl": {
728
+ "args": {
729
+ "should_epoch_stop": false,
730
+ "should_evaluate": false,
731
+ "should_log": false,
732
+ "should_save": false,
733
+ "should_training_stop": false
734
+ },
735
+ "attributes": {}
736
+ }
737
+ },
738
+ "total_flos": 3.816239406461747e+16,
739
+ "train_batch_size": 1,
740
+ "trial_name": null,
741
+ "trial_params": null
742
+ }