Azrail commited on
Commit
9995fe8
·
verified ·
1 Parent(s): 20844d3

Training in progress, step 120000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:423179ea0149a7aaeacb5ccaa10149a8392d7f119d23b5e82ddb6e09d76ee4bf
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e70907b0d675ee2643842e014ed6c972c9663ac94c350f0ab42a0be8632152c
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b71ae6a920aee5962a410d286e3547ba68e15be1375e1283ae48d23a63cbab16
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3a65d04a4bb9bbc428894a0e56fe5a8ff86920144b87270537f75bf5b3558c9
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8f8fb2244d43602b2b223fa5f88e945c708dd60e4c4c5e962793b5f1f77fe7b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d8a9a435a8fb7efaea34ed653a04299793c4ab23d440f306a1001d1a5e2fe4d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a469da166349e663b52b425176faaf03bae4cb82a5020b6687129f2f779fc711
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14fb013cc682f88bd394d32631eff6723ea097f4e238bec79824a853a5616c4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1352683735406703,
6
  "eval_steps": 500,
7
- "global_step": 119000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -21190,11 +21190,189 @@
21190
  "eval_steps_per_second": 15.211,
21191
  "num_input_tokens_seen": 62380238112,
21192
  "step": 119000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21193
  }
21194
  ],
21195
  "logging_steps": 50,
21196
  "max_steps": 140000,
21197
- "num_input_tokens_seen": 62380238112,
21198
  "num_train_epochs": 2,
21199
  "save_steps": 1000,
21200
  "stateful_callbacks": {
@@ -21209,7 +21387,7 @@
21209
  "attributes": {}
21210
  }
21211
  },
21212
- "total_flos": 1.1040164330280837e+20,
21213
  "train_batch_size": 32,
21214
  "trial_name": null,
21215
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1448084238644358,
6
  "eval_steps": 500,
7
+ "global_step": 120000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
21190
  "eval_steps_per_second": 15.211,
21191
  "num_input_tokens_seen": 62380238112,
21192
  "step": 119000
21193
+ },
21194
+ {
21195
+ "epoch": 1.1357453760568588,
21196
+ "grad_norm": 0.1409357637166977,
21197
+ "learning_rate": 0.0008515644076206653,
21198
+ "loss": 2.0885,
21199
+ "num_input_tokens_seen": 62406448192,
21200
+ "step": 119050
21201
+ },
21202
+ {
21203
+ "epoch": 1.136222378573047,
21204
+ "grad_norm": 0.15409712493419647,
21205
+ "learning_rate": 0.0008495643602586287,
21206
+ "loss": 2.0778,
21207
+ "num_input_tokens_seen": 62432661632,
21208
+ "step": 119100
21209
+ },
21210
+ {
21211
+ "epoch": 1.1366993810892352,
21212
+ "grad_norm": 0.1327887326478958,
21213
+ "learning_rate": 0.0008475533114523955,
21214
+ "loss": 2.086,
21215
+ "num_input_tokens_seen": 62458870752,
21216
+ "step": 119150
21217
+ },
21218
+ {
21219
+ "epoch": 1.1371763836054236,
21220
+ "grad_norm": 0.14051629602909088,
21221
+ "learning_rate": 0.0008455313244934324,
21222
+ "loss": 2.0765,
21223
+ "num_input_tokens_seen": 62485082688,
21224
+ "step": 119200
21225
+ },
21226
+ {
21227
+ "epoch": 1.1376533861216118,
21228
+ "grad_norm": 0.13998936116695404,
21229
+ "learning_rate": 0.0008434984630174508,
21230
+ "loss": 2.0784,
21231
+ "num_input_tokens_seen": 62511288832,
21232
+ "step": 119250
21233
+ },
21234
+ {
21235
+ "epoch": 1.1381303886378,
21236
+ "grad_norm": 0.1316358745098114,
21237
+ "learning_rate": 0.0008414547910024035,
21238
+ "loss": 2.0839,
21239
+ "num_input_tokens_seen": 62537499648,
21240
+ "step": 119300
21241
+ },
21242
+ {
21243
+ "epoch": 1.1386073911539882,
21244
+ "grad_norm": 0.13315369188785553,
21245
+ "learning_rate": 0.0008394003727664709,
21246
+ "loss": 2.0793,
21247
+ "num_input_tokens_seen": 62563710336,
21248
+ "step": 119350
21249
+ },
21250
+ {
21251
+ "epoch": 1.1390843936701767,
21252
+ "grad_norm": 0.1454961597919464,
21253
+ "learning_rate": 0.0008373352729660373,
21254
+ "loss": 2.0814,
21255
+ "num_input_tokens_seen": 62589918400,
21256
+ "step": 119400
21257
+ },
21258
+ {
21259
+ "epoch": 1.1395613961863649,
21260
+ "grad_norm": 0.14860859513282776,
21261
+ "learning_rate": 0.0008352595565936554,
21262
+ "loss": 2.0885,
21263
+ "num_input_tokens_seen": 62616130880,
21264
+ "step": 119450
21265
+ },
21266
+ {
21267
+ "epoch": 1.140038398702553,
21268
+ "grad_norm": 0.13664905726909637,
21269
+ "learning_rate": 0.000833173288976002,
21270
+ "loss": 2.0836,
21271
+ "num_input_tokens_seen": 62642339520,
21272
+ "step": 119500
21273
+ },
21274
+ {
21275
+ "epoch": 1.140038398702553,
21276
+ "eval_loss": 1.9989631175994873,
21277
+ "eval_runtime": 83.3074,
21278
+ "eval_samples_per_second": 60.019,
21279
+ "eval_steps_per_second": 15.005,
21280
+ "num_input_tokens_seen": 62642339520,
21281
+ "step": 119500
21282
+ },
21283
+ {
21284
+ "epoch": 1.1405154012187415,
21285
+ "grad_norm": 0.1337277889251709,
21286
+ "learning_rate": 0.0008310765357718206,
21287
+ "loss": 2.0745,
21288
+ "num_input_tokens_seen": 62668548896,
21289
+ "step": 119550
21290
+ },
21291
+ {
21292
+ "epoch": 1.1409924037349297,
21293
+ "grad_norm": 0.13231709599494934,
21294
+ "learning_rate": 0.0008289693629698564,
21295
+ "loss": 2.0851,
21296
+ "num_input_tokens_seen": 62694761888,
21297
+ "step": 119600
21298
+ },
21299
+ {
21300
+ "epoch": 1.141469406251118,
21301
+ "grad_norm": 0.13446244597434998,
21302
+ "learning_rate": 0.0008268518368867782,
21303
+ "loss": 2.0737,
21304
+ "num_input_tokens_seen": 62720974368,
21305
+ "step": 119650
21306
+ },
21307
+ {
21308
+ "epoch": 1.1419464087673061,
21309
+ "grad_norm": 0.14359907805919647,
21310
+ "learning_rate": 0.0008247240241650918,
21311
+ "loss": 2.0772,
21312
+ "num_input_tokens_seen": 62747188768,
21313
+ "step": 119700
21314
+ },
21315
+ {
21316
+ "epoch": 1.1424234112834946,
21317
+ "grad_norm": 0.13156485557556152,
21318
+ "learning_rate": 0.0008225859917710439,
21319
+ "loss": 2.0791,
21320
+ "num_input_tokens_seen": 62773395936,
21321
+ "step": 119750
21322
+ },
21323
+ {
21324
+ "epoch": 1.1429004137996828,
21325
+ "grad_norm": 0.14039525389671326,
21326
+ "learning_rate": 0.000820437806992512,
21327
+ "loss": 2.0656,
21328
+ "num_input_tokens_seen": 62799610336,
21329
+ "step": 119800
21330
+ },
21331
+ {
21332
+ "epoch": 1.143377416315871,
21333
+ "grad_norm": 0.14653949439525604,
21334
+ "learning_rate": 0.0008182795374368893,
21335
+ "loss": 2.0741,
21336
+ "num_input_tokens_seen": 62825821984,
21337
+ "step": 119850
21338
+ },
21339
+ {
21340
+ "epoch": 1.1438544188320594,
21341
+ "grad_norm": 0.12294785678386688,
21342
+ "learning_rate": 0.0008161112510289549,
21343
+ "loss": 2.0741,
21344
+ "num_input_tokens_seen": 62852031840,
21345
+ "step": 119900
21346
+ },
21347
+ {
21348
+ "epoch": 1.1443314213482476,
21349
+ "grad_norm": 0.18639816343784332,
21350
+ "learning_rate": 0.0008139330160087374,
21351
+ "loss": 2.1258,
21352
+ "num_input_tokens_seen": 62878240576,
21353
+ "step": 119950
21354
+ },
21355
+ {
21356
+ "epoch": 1.1448084238644358,
21357
+ "grad_norm": 0.1320071518421173,
21358
+ "learning_rate": 0.0008117449009293668,
21359
+ "loss": 2.0956,
21360
+ "num_input_tokens_seen": 62904447680,
21361
+ "step": 120000
21362
+ },
21363
+ {
21364
+ "epoch": 1.1448084238644358,
21365
+ "eval_loss": 2.0032639503479004,
21366
+ "eval_runtime": 82.8531,
21367
+ "eval_samples_per_second": 60.348,
21368
+ "eval_steps_per_second": 15.087,
21369
+ "num_input_tokens_seen": 62904447680,
21370
+ "step": 120000
21371
  }
21372
  ],
21373
  "logging_steps": 50,
21374
  "max_steps": 140000,
21375
+ "num_input_tokens_seen": 62904447680,
21376
  "num_train_epochs": 2,
21377
  "save_steps": 1000,
21378
  "stateful_callbacks": {
 
21387
  "attributes": {}
21388
  }
21389
  },
21390
+ "total_flos": 1.1132939862234317e+20,
21391
  "train_batch_size": 32,
21392
  "trial_name": null,
21393
  "trial_params": null