Blancy commited on
Commit
62e54a6
·
verified ·
1 Parent(s): 107df80

Model save

Browse files
Files changed (3) hide show
  1. all_results.json +6 -6
  2. train_results.json +6 -6
  3. trainer_state.json +86 -163
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 5.211391058037965e+16,
3
- "train_loss": 0.09458173938046872,
4
- "train_runtime": 117.3732,
5
- "train_samples": 1000,
6
- "train_samples_per_second": 164.066,
7
- "train_steps_per_second": 1.304
8
  }
 
1
  {
2
+ "total_flos": 3.3879319029743616e+16,
3
+ "train_loss": 0.10813453071045154,
4
+ "train_runtime": 80.1681,
5
+ "train_samples": 1086,
6
+ "train_samples_per_second": 156.159,
7
+ "train_steps_per_second": 1.235
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 5.211391058037965e+16,
3
- "train_loss": 0.09458173938046872,
4
- "train_runtime": 117.3732,
5
- "train_samples": 1000,
6
- "train_samples_per_second": 164.066,
7
- "train_steps_per_second": 1.304
8
  }
 
1
  {
2
+ "total_flos": 3.3879319029743616e+16,
3
+ "train_loss": 0.10813453071045154,
4
+ "train_runtime": 80.1681,
5
+ "train_samples": 1086,
6
+ "train_samples_per_second": 156.159,
7
+ "train_steps_per_second": 1.235
8
  }
trainer_state.json CHANGED
@@ -4,236 +4,159 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 153,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.09803921568627451,
14
- "grad_norm": 0.375,
15
- "learning_rate": 2.5e-05,
16
- "loss": 0.1074,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.19607843137254902,
21
- "grad_norm": 0.3359375,
22
- "learning_rate": 4.9994719205415894e-05,
23
- "loss": 0.099,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.29411764705882354,
28
- "grad_norm": 0.318359375,
29
- "learning_rate": 4.981015154140181e-05,
30
- "loss": 0.1039,
31
  "step": 15
32
  },
33
  {
34
- "epoch": 0.39215686274509803,
35
- "grad_norm": 0.298828125,
36
- "learning_rate": 4.936401774893088e-05,
37
- "loss": 0.1032,
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.49019607843137253,
42
- "grad_norm": 0.302734375,
43
- "learning_rate": 4.8661548338815794e-05,
44
- "loss": 0.1,
45
  "step": 25
46
  },
47
  {
48
- "epoch": 0.5882352941176471,
49
- "grad_norm": 0.267578125,
50
- "learning_rate": 4.771097912261625e-05,
51
- "loss": 0.0995,
52
  "step": 30
53
  },
54
  {
55
- "epoch": 0.6862745098039216,
56
- "grad_norm": 0.287109375,
57
- "learning_rate": 4.652345465527847e-05,
58
- "loss": 0.1007,
59
  "step": 35
60
  },
61
  {
62
- "epoch": 0.7843137254901961,
63
- "grad_norm": 0.27734375,
64
- "learning_rate": 4.511289757541801e-05,
65
- "loss": 0.098,
66
  "step": 40
67
  },
68
  {
69
- "epoch": 0.8823529411764706,
70
- "grad_norm": 0.271484375,
71
- "learning_rate": 4.349584537511159e-05,
72
- "loss": 0.0968,
73
  "step": 45
74
  },
75
  {
76
- "epoch": 0.9803921568627451,
77
- "grad_norm": 0.2578125,
78
- "learning_rate": 4.16912565129229e-05,
79
- "loss": 0.0988,
80
  "step": 50
81
  },
82
  {
83
- "epoch": 1.0784313725490196,
84
- "grad_norm": 0.279296875,
85
- "learning_rate": 3.9720288143309735e-05,
86
- "loss": 0.0961,
87
  "step": 55
88
  },
89
  {
90
- "epoch": 1.1764705882352942,
91
- "grad_norm": 0.271484375,
92
- "learning_rate": 3.7606048068332286e-05,
93
- "loss": 0.0909,
94
  "step": 60
95
  },
96
  {
97
- "epoch": 1.2745098039215685,
98
- "grad_norm": 0.30078125,
99
- "learning_rate": 3.5373323819801494e-05,
100
- "loss": 0.0942,
101
  "step": 65
102
  },
103
  {
104
- "epoch": 1.3725490196078431,
105
- "grad_norm": 0.267578125,
106
- "learning_rate": 3.304829204813215e-05,
107
- "loss": 0.09,
108
  "step": 70
109
  },
110
  {
111
- "epoch": 1.4705882352941178,
112
- "grad_norm": 0.271484375,
113
- "learning_rate": 3.065821162505025e-05,
114
- "loss": 0.0932,
115
  "step": 75
116
  },
117
  {
118
- "epoch": 1.5686274509803921,
119
- "grad_norm": 0.255859375,
120
- "learning_rate": 2.8231104058245068e-05,
121
- "loss": 0.0894,
122
  "step": 80
123
  },
124
  {
125
- "epoch": 1.6666666666666665,
126
- "grad_norm": 0.25390625,
127
- "learning_rate": 2.579542496481177e-05,
128
- "loss": 0.0964,
129
  "step": 85
130
  },
131
  {
132
- "epoch": 1.7647058823529411,
133
- "grad_norm": 0.29296875,
134
- "learning_rate": 2.3379730455158238e-05,
135
- "loss": 0.0897,
136
  "step": 90
137
  },
138
  {
139
- "epoch": 1.8627450980392157,
140
- "grad_norm": 0.26171875,
141
- "learning_rate": 2.101234233871961e-05,
142
- "loss": 0.0937,
143
  "step": 95
144
  },
145
- {
146
- "epoch": 1.9607843137254903,
147
- "grad_norm": 0.279296875,
148
- "learning_rate": 1.8721016076637528e-05,
149
- "loss": 0.0909,
150
- "step": 100
151
- },
152
- {
153
- "epoch": 2.0588235294117645,
154
- "grad_norm": 0.279296875,
155
- "learning_rate": 1.6532615374355324e-05,
156
- "loss": 0.0966,
157
- "step": 105
158
- },
159
- {
160
- "epoch": 2.156862745098039,
161
- "grad_norm": 0.24609375,
162
- "learning_rate": 1.4472797229233409e-05,
163
- "loss": 0.0882,
164
- "step": 110
165
- },
166
- {
167
- "epoch": 2.2549019607843137,
168
- "grad_norm": 0.251953125,
169
- "learning_rate": 1.2565711125713656e-05,
170
- "loss": 0.0869,
171
- "step": 115
172
- },
173
- {
174
- "epoch": 2.3529411764705883,
175
- "grad_norm": 0.2578125,
176
- "learning_rate": 1.0833715904694373e-05,
177
- "loss": 0.0909,
178
- "step": 120
179
- },
180
- {
181
- "epoch": 2.450980392156863,
182
- "grad_norm": 0.26171875,
183
- "learning_rate": 9.297117626563687e-06,
184
- "loss": 0.0902,
185
- "step": 125
186
- },
187
- {
188
- "epoch": 2.549019607843137,
189
- "grad_norm": 0.23046875,
190
- "learning_rate": 7.973931501207469e-06,
191
- "loss": 0.091,
192
- "step": 130
193
- },
194
- {
195
- "epoch": 2.6470588235294117,
196
- "grad_norm": 0.283203125,
197
- "learning_rate": 6.879670676144916e-06,
198
- "loss": 0.0914,
199
- "step": 135
200
- },
201
- {
202
- "epoch": 2.7450980392156863,
203
- "grad_norm": 0.240234375,
204
- "learning_rate": 6.027164359057668e-06,
205
- "loss": 0.0909,
206
- "step": 140
207
- },
208
- {
209
- "epoch": 2.843137254901961,
210
- "grad_norm": 0.2333984375,
211
- "learning_rate": 5.426407407059619e-06,
212
- "loss": 0.0888,
213
- "step": 145
214
- },
215
- {
216
- "epoch": 2.9411764705882355,
217
- "grad_norm": 0.2431640625,
218
- "learning_rate": 5.084443146135623e-06,
219
- "loss": 0.0911,
220
- "step": 150
221
- },
222
  {
223
  "epoch": 3.0,
224
- "step": 153,
225
- "total_flos": 5.211391058037965e+16,
226
- "train_loss": 0.09458173938046872,
227
- "train_runtime": 117.3732,
228
- "train_samples_per_second": 164.066,
229
- "train_steps_per_second": 1.304
230
  }
231
  ],
232
  "logging_steps": 5,
233
- "max_steps": 153,
234
  "num_input_tokens_seen": 0,
235
  "num_train_epochs": 3,
236
- "save_steps": 100,
237
  "stateful_callbacks": {
238
  "TrainerControl": {
239
  "args": {
@@ -246,7 +169,7 @@
246
  "attributes": {}
247
  }
248
  },
249
- "total_flos": 5.211391058037965e+16,
250
  "train_batch_size": 128,
251
  "trial_name": null,
252
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 99,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.15151515151515152,
14
+ "grad_norm": 0.4375,
15
+ "learning_rate": 2.4e-05,
16
+ "loss": 0.1336,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.30303030303030304,
21
+ "grad_norm": 0.466796875,
22
+ "learning_rate": 2.9879546090089533e-05,
23
+ "loss": 0.1225,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.45454545454545453,
28
+ "grad_norm": 0.333984375,
29
+ "learning_rate": 2.9393883712293316e-05,
30
+ "loss": 0.1116,
31
  "step": 15
32
  },
33
  {
34
+ "epoch": 0.6060606060606061,
35
+ "grad_norm": 0.31640625,
36
+ "learning_rate": 2.8549004284108398e-05,
37
+ "loss": 0.1105,
38
  "step": 20
39
  },
40
  {
41
+ "epoch": 0.7575757575757576,
42
+ "grad_norm": 0.29296875,
43
+ "learning_rate": 2.7368445717222102e-05,
44
+ "loss": 0.1082,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 0.9090909090909091,
49
+ "grad_norm": 0.271484375,
50
+ "learning_rate": 2.5885097773607675e-05,
51
+ "loss": 0.1098,
52
  "step": 30
53
  },
54
  {
55
+ "epoch": 1.0606060606060606,
56
+ "grad_norm": 0.26953125,
57
+ "learning_rate": 2.4140285773463036e-05,
58
+ "loss": 0.1056,
59
  "step": 35
60
  },
61
  {
62
+ "epoch": 1.2121212121212122,
63
+ "grad_norm": 0.267578125,
64
+ "learning_rate": 2.2182619292782524e-05,
65
+ "loss": 0.1084,
66
  "step": 40
67
  },
68
  {
69
+ "epoch": 1.3636363636363638,
70
+ "grad_norm": 0.28125,
71
+ "learning_rate": 2.0066637925262362e-05,
72
+ "loss": 0.1048,
73
  "step": 45
74
  },
75
  {
76
+ "epoch": 1.5151515151515151,
77
+ "grad_norm": 0.26953125,
78
+ "learning_rate": 1.7851291836925332e-05,
79
+ "loss": 0.1057,
80
  "step": 50
81
  },
82
  {
83
+ "epoch": 1.6666666666666665,
84
+ "grad_norm": 0.2734375,
85
+ "learning_rate": 1.559829944444086e-05,
86
+ "loss": 0.1046,
87
  "step": 55
88
  },
89
  {
90
+ "epoch": 1.8181818181818183,
91
+ "grad_norm": 0.259765625,
92
+ "learning_rate": 1.3370427971388369e-05,
93
+ "loss": 0.1057,
94
  "step": 60
95
  },
96
  {
97
+ "epoch": 1.9696969696969697,
98
+ "grad_norm": 0.287109375,
99
+ "learning_rate": 1.1229744785292821e-05,
100
+ "loss": 0.1048,
101
  "step": 65
102
  },
103
  {
104
+ "epoch": 2.121212121212121,
105
+ "grad_norm": 0.25390625,
106
+ "learning_rate": 9.235888232294472e-06,
107
+ "loss": 0.104,
108
  "step": 70
109
  },
110
  {
111
+ "epoch": 2.2727272727272725,
112
+ "grad_norm": 0.26171875,
113
+ "learning_rate": 7.444406143120487e-06,
114
+ "loss": 0.1019,
115
  "step": 75
116
  },
117
  {
118
+ "epoch": 2.4242424242424243,
119
+ "grad_norm": 0.271484375,
120
+ "learning_rate": 5.9052082987380775e-06,
121
+ "loss": 0.1052,
122
  "step": 80
123
  },
124
  {
125
+ "epoch": 2.5757575757575757,
126
+ "grad_norm": 0.2578125,
127
+ "learning_rate": 4.6611759692099345e-06,
128
+ "loss": 0.1047,
129
  "step": 85
130
  },
131
  {
132
+ "epoch": 2.7272727272727275,
133
+ "grad_norm": 0.2578125,
134
+ "learning_rate": 3.746967263293098e-06,
135
+ "loss": 0.1048,
136
  "step": 90
137
  },
138
  {
139
+ "epoch": 2.878787878787879,
140
+ "grad_norm": 0.26953125,
141
+ "learning_rate": 3.188051571134615e-06,
142
+ "loss": 0.1035,
143
  "step": 95
144
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  {
146
  "epoch": 3.0,
147
+ "step": 99,
148
+ "total_flos": 3.3879319029743616e+16,
149
+ "train_loss": 0.10813453071045154,
150
+ "train_runtime": 80.1681,
151
+ "train_samples_per_second": 156.159,
152
+ "train_steps_per_second": 1.235
153
  }
154
  ],
155
  "logging_steps": 5,
156
+ "max_steps": 99,
157
  "num_input_tokens_seen": 0,
158
  "num_train_epochs": 3,
159
+ "save_steps": 500,
160
  "stateful_callbacks": {
161
  "TrainerControl": {
162
  "args": {
 
169
  "attributes": {}
170
  }
171
  },
172
+ "total_flos": 3.3879319029743616e+16,
173
  "train_batch_size": 128,
174
  "trial_name": null,
175
  "trial_params": null