kms7530 commited on
Commit
7786b8e
·
verified ·
1 Parent(s): 53e6c0e

Upload 6 files

Browse files
Files changed (4) hide show
  1. config.json +1 -0
  2. model.safetensors +1 -1
  3. trainer_state.json +410 -193
  4. training_args.bin +1 -1
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "architectures": [
3
  "VisionEncoderDecoderModel"
4
  ],
 
1
  {
2
+ "_name_or_path": "kms7530/ko-coco",
3
  "architectures": [
4
  "VisionEncoderDecoderModel"
5
  ],
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:286b0f04d00d206225bc71b352f2fc1ee9bcfe04d6e73cec11cafc461173bf8c
3
  size 901697272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ae809b41c667fa9b9b0fcce2d610db8925c62d14621efb6401bc596ffaeace8
3
  size 901697272
trainer_state.json CHANGED
@@ -1,306 +1,523 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "eval_steps": 500,
6
- "global_step": 3077,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.032499187520311994,
13
- "grad_norm": 0.38231489062309265,
14
- "learning_rate": 4.8375040623984405e-05,
15
- "loss": 0.1722,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.06499837504062399,
20
- "grad_norm": 0.29991772770881653,
21
- "learning_rate": 4.67500812479688e-05,
22
- "loss": 0.1983,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.09749756256093597,
27
- "grad_norm": 0.3033580780029297,
28
- "learning_rate": 4.51251218719532e-05,
29
- "loss": 0.2034,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.12999675008124797,
34
- "grad_norm": 0.31061500310897827,
35
- "learning_rate": 4.3500162495937604e-05,
36
- "loss": 0.2022,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.16249593760155995,
41
- "grad_norm": 0.27624836564064026,
42
- "learning_rate": 4.1875203119922e-05,
43
- "loss": 0.1975,
44
- "step": 500
45
- },
46
- {
47
- "epoch": 0.16249593760155995,
48
- "eval_gen_len": 19.964290067032298,
49
- "eval_loss": 0.18788783252239227,
50
- "eval_rouge1": 0.0068,
51
- "eval_rouge2": 0.0,
52
- "eval_rougeL": 0.0068,
53
- "eval_rougeLsum": 0.0068,
54
- "eval_runtime": 2174.226,
55
- "eval_samples_per_second": 11.321,
56
- "eval_steps_per_second": 0.354,
57
  "step": 500
58
  },
59
  {
60
- "epoch": 0.19499512512187195,
61
- "grad_norm": 0.2493065893650055,
62
- "learning_rate": 4.02502437439064e-05,
63
- "loss": 0.1943,
64
  "step": 600
65
  },
66
  {
67
- "epoch": 0.22749431264218395,
68
- "grad_norm": 0.2768040597438812,
69
- "learning_rate": 3.8625284367890804e-05,
70
- "loss": 0.1919,
71
  "step": 700
72
  },
73
  {
74
- "epoch": 0.25999350016249595,
75
- "grad_norm": 0.2641097605228424,
76
- "learning_rate": 3.7000324991875206e-05,
77
- "loss": 0.1883,
78
  "step": 800
79
  },
80
  {
81
- "epoch": 0.29249268768280795,
82
- "grad_norm": 0.266985148191452,
83
- "learning_rate": 3.537536561585961e-05,
84
- "loss": 0.1866,
85
  "step": 900
86
  },
87
  {
88
- "epoch": 0.3249918752031199,
89
- "grad_norm": 0.2505531311035156,
90
- "learning_rate": 3.3750406239844004e-05,
91
- "loss": 0.186,
92
  "step": 1000
93
  },
94
  {
95
- "epoch": 0.3249918752031199,
96
- "eval_gen_len": 19.9981718464351,
97
- "eval_loss": 0.1798761785030365,
98
- "eval_rouge1": 0.0108,
99
- "eval_rouge2": 0.0041,
100
- "eval_rougeL": 0.0108,
101
- "eval_rougeLsum": 0.0108,
102
- "eval_runtime": 2176.3057,
103
- "eval_samples_per_second": 11.31,
104
- "eval_steps_per_second": 0.354,
105
  "step": 1000
106
  },
107
  {
108
- "epoch": 0.3574910627234319,
109
- "grad_norm": 0.2498556226491928,
110
- "learning_rate": 3.2125446863828406e-05,
111
- "loss": 0.1839,
112
  "step": 1100
113
  },
114
  {
115
- "epoch": 0.3899902502437439,
116
- "grad_norm": 0.2563565969467163,
117
- "learning_rate": 3.0500487487812808e-05,
118
- "loss": 0.1835,
119
  "step": 1200
120
  },
121
  {
122
- "epoch": 0.4224894377640559,
123
- "grad_norm": 0.25993165373802185,
124
- "learning_rate": 2.8875528111797207e-05,
125
- "loss": 0.1837,
126
  "step": 1300
127
  },
128
  {
129
- "epoch": 0.4549886252843679,
130
- "grad_norm": 0.2639683485031128,
131
- "learning_rate": 2.7250568735781606e-05,
132
- "loss": 0.1814,
133
  "step": 1400
134
  },
135
  {
136
- "epoch": 0.4874878128046799,
137
- "grad_norm": 0.25668901205062866,
138
- "learning_rate": 2.5625609359766008e-05,
139
- "loss": 0.1762,
140
  "step": 1500
141
  },
142
  {
143
- "epoch": 0.4874878128046799,
144
- "eval_gen_len": 19.983384115376804,
145
- "eval_loss": 0.17473776638507843,
146
- "eval_rouge1": 0.1836,
147
- "eval_rouge2": 0.0,
148
- "eval_rougeL": 0.1828,
149
- "eval_rougeLsum": 0.1851,
150
- "eval_runtime": 2167.8807,
151
- "eval_samples_per_second": 11.354,
152
- "eval_steps_per_second": 0.355,
153
- "step": 1500
154
- },
155
- {
156
- "epoch": 0.5199870003249919,
157
- "grad_norm": 0.2645615041255951,
158
- "learning_rate": 2.4000649983750407e-05,
159
- "loss": 0.1794,
160
  "step": 1600
161
  },
162
  {
163
- "epoch": 0.5524861878453039,
164
- "grad_norm": 0.24295924603939056,
165
- "learning_rate": 2.237569060773481e-05,
166
- "loss": 0.1769,
167
  "step": 1700
168
  },
169
  {
170
- "epoch": 0.5849853753656159,
171
- "grad_norm": 0.25981849431991577,
172
- "learning_rate": 2.0750731231719208e-05,
173
- "loss": 0.1785,
174
  "step": 1800
175
  },
176
  {
177
- "epoch": 0.6174845628859279,
178
- "grad_norm": 0.24553097784519196,
179
- "learning_rate": 1.912577185570361e-05,
180
- "loss": 0.173,
181
  "step": 1900
182
  },
183
  {
184
- "epoch": 0.6499837504062398,
185
- "grad_norm": 0.24243266880512238,
186
- "learning_rate": 1.750081247968801e-05,
187
- "loss": 0.1764,
188
  "step": 2000
189
  },
190
  {
191
- "epoch": 0.6499837504062398,
192
- "eval_gen_len": 19.99471866747918,
193
- "eval_loss": 0.17154192924499512,
194
- "eval_rouge1": 0.1151,
195
  "eval_rouge2": 0.0,
196
- "eval_rougeL": 0.1151,
197
- "eval_rougeLsum": 0.1165,
198
- "eval_runtime": 2157.4938,
199
- "eval_samples_per_second": 11.409,
200
- "eval_steps_per_second": 0.357,
201
  "step": 2000
202
  },
203
  {
204
- "epoch": 0.6824829379265518,
205
- "grad_norm": 0.26906952261924744,
206
- "learning_rate": 1.5875853103672408e-05,
207
- "loss": 0.1789,
208
  "step": 2100
209
  },
210
  {
211
- "epoch": 0.7149821254468638,
212
- "grad_norm": 0.2654452919960022,
213
- "learning_rate": 1.4250893727656808e-05,
214
- "loss": 0.1727,
215
  "step": 2200
216
  },
217
  {
218
- "epoch": 0.7474813129671758,
219
- "grad_norm": 0.24443137645721436,
220
- "learning_rate": 1.2625934351641208e-05,
221
- "loss": 0.1767,
222
  "step": 2300
223
  },
224
  {
225
- "epoch": 0.7799805004874878,
226
- "grad_norm": 0.24741144478321075,
227
- "learning_rate": 1.1000974975625609e-05,
228
- "loss": 0.1745,
229
  "step": 2400
230
  },
231
  {
232
- "epoch": 0.8124796880077998,
233
- "grad_norm": 0.2507327198982239,
234
- "learning_rate": 9.376015599610011e-06,
235
- "loss": 0.1756,
236
  "step": 2500
237
  },
238
  {
239
- "epoch": 0.8124796880077998,
240
- "eval_gen_len": 19.99869997968718,
241
- "eval_loss": 0.1691381335258484,
242
- "eval_rouge1": 0.0027,
243
- "eval_rouge2": 0.0,
244
- "eval_rougeL": 0.0027,
245
- "eval_rougeLsum": 0.0027,
246
- "eval_runtime": 2163.6252,
247
- "eval_samples_per_second": 11.377,
248
- "eval_steps_per_second": 0.356,
249
- "step": 2500
250
- },
251
- {
252
- "epoch": 0.8449788755281118,
253
- "grad_norm": 0.23773913085460663,
254
- "learning_rate": 7.75105622359441e-06,
255
- "loss": 0.1733,
256
  "step": 2600
257
  },
258
  {
259
- "epoch": 0.8774780630484238,
260
- "grad_norm": 0.24695108830928802,
261
- "learning_rate": 6.1260968475788104e-06,
262
- "loss": 0.1744,
263
  "step": 2700
264
  },
265
  {
266
- "epoch": 0.9099772505687358,
267
- "grad_norm": 0.23871463537216187,
268
- "learning_rate": 4.501137471563211e-06,
269
- "loss": 0.1741,
270
  "step": 2800
271
  },
272
  {
273
- "epoch": 0.9424764380890478,
274
- "grad_norm": 0.24402374029159546,
275
- "learning_rate": 2.8761780955476114e-06,
276
- "loss": 0.1703,
277
  "step": 2900
278
  },
279
  {
280
- "epoch": 0.9749756256093598,
281
- "grad_norm": 0.24215468764305115,
282
- "learning_rate": 1.2512187195320117e-06,
283
- "loss": 0.1724,
284
  "step": 3000
285
  },
286
  {
287
- "epoch": 0.9749756256093598,
288
- "eval_gen_len": 19.99163111923624,
289
- "eval_loss": 0.1676524430513382,
290
- "eval_rouge1": 0.0257,
291
  "eval_rouge2": 0.0,
292
- "eval_rougeL": 0.0244,
293
- "eval_rougeLsum": 0.0257,
294
- "eval_runtime": 2148.352,
295
- "eval_samples_per_second": 11.458,
296
- "eval_steps_per_second": 0.358,
297
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  }
299
  ],
300
  "logging_steps": 100,
301
- "max_steps": 3077,
302
  "num_input_tokens_seen": 0,
303
- "num_train_epochs": 1,
304
  "save_steps": 500,
305
  "stateful_callbacks": {
306
  "TrainerControl": {
@@ -314,7 +531,7 @@
314
  "attributes": {}
315
  }
316
  },
317
- "total_flos": 1.7819450187495506e+19,
318
  "train_batch_size": 32,
319
  "trial_name": null,
320
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 6166,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.032435939020434644,
13
+ "grad_norm": 0.35351359844207764,
14
+ "learning_rate": 4.918910152448913e-05,
15
+ "loss": 0.1337,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.06487187804086929,
20
+ "grad_norm": 0.2617332935333252,
21
+ "learning_rate": 4.837820304897827e-05,
22
+ "loss": 0.1762,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.09730781706130393,
27
+ "grad_norm": 0.26761409640312195,
28
+ "learning_rate": 4.7567304573467406e-05,
29
+ "loss": 0.1729,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.12974375608173858,
34
+ "grad_norm": 0.2583613991737366,
35
+ "learning_rate": 4.675640609795654e-05,
36
+ "loss": 0.1728,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.1621796951021732,
41
+ "grad_norm": 0.24968299269676208,
42
+ "learning_rate": 4.594550762244567e-05,
43
+ "loss": 0.1721,
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 0.19461563412260785,
48
+ "grad_norm": 0.252204567193985,
49
+ "learning_rate": 4.513460914693481e-05,
50
+ "loss": 0.1729,
51
  "step": 600
52
  },
53
  {
54
+ "epoch": 0.2270515731430425,
55
+ "grad_norm": 0.25790390372276306,
56
+ "learning_rate": 4.432371067142394e-05,
57
+ "loss": 0.1716,
58
  "step": 700
59
  },
60
  {
61
+ "epoch": 0.25948751216347715,
62
+ "grad_norm": 0.2649690508842468,
63
+ "learning_rate": 4.351281219591307e-05,
64
+ "loss": 0.1726,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 0.2919234511839118,
69
+ "grad_norm": 0.2653878927230835,
70
+ "learning_rate": 4.270191372040221e-05,
71
+ "loss": 0.172,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 0.3243593902043464,
76
+ "grad_norm": 0.2365722954273224,
77
+ "learning_rate": 4.1891015244891344e-05,
78
+ "loss": 0.1734,
79
  "step": 1000
80
  },
81
  {
82
+ "epoch": 0.3243593902043464,
83
+ "eval_gen_len": 19.97732987265796,
84
+ "eval_loss": 0.17071112990379333,
85
+ "eval_rouge1": 0.146,
86
+ "eval_rouge2": 0.0,
87
+ "eval_rougeL": 0.146,
88
+ "eval_rougeLsum": 0.1477,
89
+ "eval_runtime": 2204.8687,
90
+ "eval_samples_per_second": 11.183,
91
+ "eval_steps_per_second": 0.35,
92
  "step": 1000
93
  },
94
  {
95
+ "epoch": 0.3567953292247811,
96
+ "grad_norm": 0.2285241037607193,
97
+ "learning_rate": 4.108011676938048e-05,
98
+ "loss": 0.1721,
99
  "step": 1100
100
  },
101
  {
102
+ "epoch": 0.3892312682452157,
103
+ "grad_norm": 0.2493802011013031,
104
+ "learning_rate": 4.026921829386961e-05,
105
+ "loss": 0.1694,
106
  "step": 1200
107
  },
108
  {
109
+ "epoch": 0.4216672072656503,
110
+ "grad_norm": 0.25236544013023376,
111
+ "learning_rate": 3.945831981835874e-05,
112
+ "loss": 0.1698,
113
  "step": 1300
114
  },
115
  {
116
+ "epoch": 0.454103146286085,
117
+ "grad_norm": 0.243851900100708,
118
+ "learning_rate": 3.864742134284788e-05,
119
+ "loss": 0.1683,
120
  "step": 1400
121
  },
122
  {
123
+ "epoch": 0.4865390853065196,
124
+ "grad_norm": 0.24125412106513977,
125
+ "learning_rate": 3.783652286733701e-05,
126
+ "loss": 0.1694,
127
  "step": 1500
128
  },
129
  {
130
+ "epoch": 0.5189750243269543,
131
+ "grad_norm": 0.25099146366119385,
132
+ "learning_rate": 3.7025624391826144e-05,
133
+ "loss": 0.1669,
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  "step": 1600
135
  },
136
  {
137
+ "epoch": 0.5514109633473889,
138
+ "grad_norm": 0.23592032492160797,
139
+ "learning_rate": 3.621472591631528e-05,
140
+ "loss": 0.1668,
141
  "step": 1700
142
  },
143
  {
144
+ "epoch": 0.5838469023678236,
145
+ "grad_norm": 0.25207582116127014,
146
+ "learning_rate": 3.540382744080442e-05,
147
+ "loss": 0.1678,
148
  "step": 1800
149
  },
150
  {
151
+ "epoch": 0.6162828413882582,
152
+ "grad_norm": 0.2521522045135498,
153
+ "learning_rate": 3.459292896529355e-05,
154
+ "loss": 0.166,
155
  "step": 1900
156
  },
157
  {
158
+ "epoch": 0.6487187804086928,
159
+ "grad_norm": 0.2497226446866989,
160
+ "learning_rate": 3.378203048978268e-05,
161
+ "loss": 0.169,
162
  "step": 2000
163
  },
164
  {
165
+ "epoch": 0.6487187804086928,
166
+ "eval_gen_len": 19.993389569308135,
167
+ "eval_loss": 0.1660359501838684,
168
+ "eval_rouge1": 0.1899,
169
  "eval_rouge2": 0.0,
170
+ "eval_rougeL": 0.1899,
171
+ "eval_rougeLsum": 0.1913,
172
+ "eval_runtime": 2195.0671,
173
+ "eval_samples_per_second": 11.233,
174
+ "eval_steps_per_second": 0.351,
175
  "step": 2000
176
  },
177
  {
178
+ "epoch": 0.6811547194291274,
179
+ "grad_norm": 0.25921228528022766,
180
+ "learning_rate": 3.2971132014271815e-05,
181
+ "loss": 0.1663,
182
  "step": 2100
183
  },
184
  {
185
+ "epoch": 0.7135906584495622,
186
+ "grad_norm": 0.22675295174121857,
187
+ "learning_rate": 3.2160233538760945e-05,
188
+ "loss": 0.1661,
189
  "step": 2200
190
  },
191
  {
192
+ "epoch": 0.7460265974699968,
193
+ "grad_norm": 0.253683477640152,
194
+ "learning_rate": 3.134933506325008e-05,
195
+ "loss": 0.1661,
196
  "step": 2300
197
  },
198
  {
199
+ "epoch": 0.7784625364904314,
200
+ "grad_norm": 0.23699304461479187,
201
+ "learning_rate": 3.053843658773922e-05,
202
+ "loss": 0.1664,
203
  "step": 2400
204
  },
205
  {
206
+ "epoch": 0.810898475510866,
207
+ "grad_norm": 0.21723595261573792,
208
+ "learning_rate": 2.972753811222835e-05,
209
+ "loss": 0.1628,
210
  "step": 2500
211
  },
212
  {
213
+ "epoch": 0.8433344145313006,
214
+ "grad_norm": 0.2416258603334427,
215
+ "learning_rate": 2.8916639636717486e-05,
216
+ "loss": 0.1684,
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  "step": 2600
218
  },
219
  {
220
+ "epoch": 0.8757703535517353,
221
+ "grad_norm": 0.24152766168117523,
222
+ "learning_rate": 2.8105741161206616e-05,
223
+ "loss": 0.1655,
224
  "step": 2700
225
  },
226
  {
227
+ "epoch": 0.90820629257217,
228
+ "grad_norm": 0.23950466513633728,
229
+ "learning_rate": 2.7294842685695753e-05,
230
+ "loss": 0.1662,
231
  "step": 2800
232
  },
233
  {
234
+ "epoch": 0.9406422315926046,
235
+ "grad_norm": 0.2509647011756897,
236
+ "learning_rate": 2.6483944210184886e-05,
237
+ "loss": 0.1642,
238
  "step": 2900
239
  },
240
  {
241
+ "epoch": 0.9730781706130393,
242
+ "grad_norm": 0.24778147041797638,
243
+ "learning_rate": 2.5673045734674023e-05,
244
+ "loss": 0.1628,
245
  "step": 3000
246
  },
247
  {
248
+ "epoch": 0.9730781706130393,
249
+ "eval_gen_len": 19.997323383891636,
250
+ "eval_loss": 0.16280025243759155,
251
+ "eval_rouge1": 0.0561,
252
  "eval_rouge2": 0.0,
253
+ "eval_rougeL": 0.0561,
254
+ "eval_rougeLsum": 0.0547,
255
+ "eval_runtime": 2183.6993,
256
+ "eval_samples_per_second": 11.292,
257
+ "eval_steps_per_second": 0.353,
258
  "step": 3000
259
+ },
260
+ {
261
+ "epoch": 1.0055141096334739,
262
+ "grad_norm": 0.22274711728096008,
263
+ "learning_rate": 2.4862147259163153e-05,
264
+ "loss": 0.1629,
265
+ "step": 3100
266
+ },
267
+ {
268
+ "epoch": 1.0379500486539086,
269
+ "grad_norm": 0.23203332722187042,
270
+ "learning_rate": 2.405124878365229e-05,
271
+ "loss": 0.1556,
272
+ "step": 3200
273
+ },
274
+ {
275
+ "epoch": 1.0703859876743431,
276
+ "grad_norm": 0.24074462056159973,
277
+ "learning_rate": 2.3240350308141423e-05,
278
+ "loss": 0.1548,
279
+ "step": 3300
280
+ },
281
+ {
282
+ "epoch": 1.1028219266947779,
283
+ "grad_norm": 0.23876875638961792,
284
+ "learning_rate": 2.2429451832630553e-05,
285
+ "loss": 0.1535,
286
+ "step": 3400
287
+ },
288
+ {
289
+ "epoch": 1.1352578657152124,
290
+ "grad_norm": 0.23867164552211761,
291
+ "learning_rate": 2.161855335711969e-05,
292
+ "loss": 0.1528,
293
+ "step": 3500
294
+ },
295
+ {
296
+ "epoch": 1.167693804735647,
297
+ "grad_norm": 0.2626864016056061,
298
+ "learning_rate": 2.0807654881608824e-05,
299
+ "loss": 0.1526,
300
+ "step": 3600
301
+ },
302
+ {
303
+ "epoch": 1.2001297437560818,
304
+ "grad_norm": 0.26715072989463806,
305
+ "learning_rate": 1.9996756406097957e-05,
306
+ "loss": 0.1526,
307
+ "step": 3700
308
+ },
309
+ {
310
+ "epoch": 1.2325656827765163,
311
+ "grad_norm": 0.2496395856142044,
312
+ "learning_rate": 1.9185857930587094e-05,
313
+ "loss": 0.1518,
314
+ "step": 3800
315
+ },
316
+ {
317
+ "epoch": 1.265001621796951,
318
+ "grad_norm": 0.24588952958583832,
319
+ "learning_rate": 1.8374959455076227e-05,
320
+ "loss": 0.1519,
321
+ "step": 3900
322
+ },
323
+ {
324
+ "epoch": 1.2974375608173856,
325
+ "grad_norm": 0.23679636418819427,
326
+ "learning_rate": 1.7564060979565357e-05,
327
+ "loss": 0.1504,
328
+ "step": 4000
329
+ },
330
+ {
331
+ "epoch": 1.2974375608173856,
332
+ "eval_gen_len": 19.98292643361181,
333
+ "eval_loss": 0.16194650530815125,
334
+ "eval_rouge1": 0.1229,
335
+ "eval_rouge2": 0.0041,
336
+ "eval_rougeL": 0.123,
337
+ "eval_rougeLsum": 0.1233,
338
+ "eval_runtime": 2185.3639,
339
+ "eval_samples_per_second": 11.283,
340
+ "eval_steps_per_second": 0.353,
341
+ "step": 4000
342
+ },
343
+ {
344
+ "epoch": 1.3298734998378203,
345
+ "grad_norm": 0.24863706529140472,
346
+ "learning_rate": 1.6753162504054494e-05,
347
+ "loss": 0.1511,
348
+ "step": 4100
349
+ },
350
+ {
351
+ "epoch": 1.3623094388582548,
352
+ "grad_norm": 0.2532482445240021,
353
+ "learning_rate": 1.5942264028543628e-05,
354
+ "loss": 0.1503,
355
+ "step": 4200
356
+ },
357
+ {
358
+ "epoch": 1.3947453778786896,
359
+ "grad_norm": 0.2682236135005951,
360
+ "learning_rate": 1.5131365553032761e-05,
361
+ "loss": 0.1529,
362
+ "step": 4300
363
+ },
364
+ {
365
+ "epoch": 1.4271813168991243,
366
+ "grad_norm": 0.23846983909606934,
367
+ "learning_rate": 1.4320467077521896e-05,
368
+ "loss": 0.1514,
369
+ "step": 4400
370
+ },
371
+ {
372
+ "epoch": 1.4596172559195588,
373
+ "grad_norm": 0.2428126037120819,
374
+ "learning_rate": 1.350956860201103e-05,
375
+ "loss": 0.1486,
376
+ "step": 4500
377
+ },
378
+ {
379
+ "epoch": 1.4920531949399936,
380
+ "grad_norm": 0.25097399950027466,
381
+ "learning_rate": 1.2698670126500161e-05,
382
+ "loss": 0.15,
383
+ "step": 4600
384
+ },
385
+ {
386
+ "epoch": 1.524489133960428,
387
+ "grad_norm": 0.2583732008934021,
388
+ "learning_rate": 1.1887771650989297e-05,
389
+ "loss": 0.1528,
390
+ "step": 4700
391
+ },
392
+ {
393
+ "epoch": 1.5569250729808628,
394
+ "grad_norm": 0.23872007429599762,
395
+ "learning_rate": 1.107687317547843e-05,
396
+ "loss": 0.1486,
397
+ "step": 4800
398
+ },
399
+ {
400
+ "epoch": 1.5893610120012975,
401
+ "grad_norm": 0.2473708838224411,
402
+ "learning_rate": 1.0265974699967565e-05,
403
+ "loss": 0.1493,
404
+ "step": 4900
405
+ },
406
+ {
407
+ "epoch": 1.621796951021732,
408
+ "grad_norm": 0.24233025312423706,
409
+ "learning_rate": 9.455076224456699e-06,
410
+ "loss": 0.1505,
411
+ "step": 5000
412
+ },
413
+ {
414
+ "epoch": 1.621796951021732,
415
+ "eval_gen_len": 19.982034228242355,
416
+ "eval_loss": 0.16026677191257477,
417
+ "eval_rouge1": 0.0744,
418
+ "eval_rouge2": 0.0041,
419
+ "eval_rougeL": 0.0744,
420
+ "eval_rougeLsum": 0.0744,
421
+ "eval_runtime": 2176.6648,
422
+ "eval_samples_per_second": 11.328,
423
+ "eval_steps_per_second": 0.354,
424
+ "step": 5000
425
+ },
426
+ {
427
+ "epoch": 1.6542328900421666,
428
+ "grad_norm": 0.25053468346595764,
429
+ "learning_rate": 8.644177748945832e-06,
430
+ "loss": 0.1508,
431
+ "step": 5100
432
+ },
433
+ {
434
+ "epoch": 1.6866688290626013,
435
+ "grad_norm": 0.24528223276138306,
436
+ "learning_rate": 7.833279273434967e-06,
437
+ "loss": 0.1499,
438
+ "step": 5200
439
+ },
440
+ {
441
+ "epoch": 1.719104768083036,
442
+ "grad_norm": 0.2794703543186188,
443
+ "learning_rate": 7.0223807979241e-06,
444
+ "loss": 0.1476,
445
+ "step": 5300
446
+ },
447
+ {
448
+ "epoch": 1.7515407071034708,
449
+ "grad_norm": 0.257614403963089,
450
+ "learning_rate": 6.211482322413234e-06,
451
+ "loss": 0.1498,
452
+ "step": 5400
453
+ },
454
+ {
455
+ "epoch": 1.7839766461239053,
456
+ "grad_norm": 0.24895663559436798,
457
+ "learning_rate": 5.400583846902368e-06,
458
+ "loss": 0.1485,
459
+ "step": 5500
460
+ },
461
+ {
462
+ "epoch": 1.8164125851443398,
463
+ "grad_norm": 0.26096346974372864,
464
+ "learning_rate": 4.589685371391502e-06,
465
+ "loss": 0.1486,
466
+ "step": 5600
467
+ },
468
+ {
469
+ "epoch": 1.8488485241647745,
470
+ "grad_norm": 0.23756231367588043,
471
+ "learning_rate": 3.7787868958806357e-06,
472
+ "loss": 0.1506,
473
+ "step": 5700
474
+ },
475
+ {
476
+ "epoch": 1.8812844631852093,
477
+ "grad_norm": 0.2549228072166443,
478
+ "learning_rate": 2.9678884203697696e-06,
479
+ "loss": 0.1483,
480
+ "step": 5800
481
+ },
482
+ {
483
+ "epoch": 1.913720402205644,
484
+ "grad_norm": 0.24819372594356537,
485
+ "learning_rate": 2.156989944858904e-06,
486
+ "loss": 0.149,
487
+ "step": 5900
488
+ },
489
+ {
490
+ "epoch": 1.9461563412260785,
491
+ "grad_norm": 0.23372448980808258,
492
+ "learning_rate": 1.3460914693480378e-06,
493
+ "loss": 0.1506,
494
+ "step": 6000
495
+ },
496
+ {
497
+ "epoch": 1.9461563412260785,
498
+ "eval_gen_len": 19.98361586503366,
499
+ "eval_loss": 0.15918129682540894,
500
+ "eval_rouge1": 0.073,
501
+ "eval_rouge2": 0.002,
502
+ "eval_rougeL": 0.073,
503
+ "eval_rougeLsum": 0.073,
504
+ "eval_runtime": 2174.8737,
505
+ "eval_samples_per_second": 11.338,
506
+ "eval_steps_per_second": 0.355,
507
+ "step": 6000
508
+ },
509
+ {
510
+ "epoch": 1.978592280246513,
511
+ "grad_norm": 0.25701549649238586,
512
+ "learning_rate": 5.351929938371715e-07,
513
+ "loss": 0.15,
514
+ "step": 6100
515
  }
516
  ],
517
  "logging_steps": 100,
518
+ "max_steps": 6166,
519
  "num_input_tokens_seen": 0,
520
+ "num_train_epochs": 2,
521
  "save_steps": 500,
522
  "stateful_callbacks": {
523
  "TrainerControl": {
 
531
  "attributes": {}
532
  }
533
  },
534
+ "total_flos": 3.5709139077411373e+19,
535
  "train_batch_size": 32,
536
  "trial_name": null,
537
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:194ce32cec5fe69bed212938bb3ed6d454d649295f789cd68caef98b2b3225af
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5d585a2c4f69235113a2ad29e36aef3be9f972906a8fd4eddd987849af1a5a8
3
  size 5304