Serialtechlab commited on
Commit
109f2fe
·
verified ·
1 Parent(s): 91f7924

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39d1255ff8c545d0503c785ed0bd8afaa3475e02ed8adcdbc4ee9b7871c98b31
3
  size 1198571496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:793c130765263750f8f1f131993dc0caa98d39fcab0fc5f563a6523434e118be
3
  size 1198571496
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0b1780b52a4759f9c8c1eb2328b24981e178f57ad506840f3549feba0a15d96
3
  size 2397248267
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3d91af6e674c2b614e85979666a82648a13e9b2459e79faab3976a19519901
3
  size 2397248267
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:234808e50c5f4f432be493ad718dde9abe206806428ff8453fd78d8f3ea033b9
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c599e781f4e6d1c940117dd009813b60937567b00a20a61337d3fa439b5161a5
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43636f6dccd3c315049e886e6f6481f85291e0d56fde0d6ecff9ae33f4f2662d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa3a82c280f8eac3ce59fa1481cc6b1d67cb6d85a23dbf36d46bc2e65a799ba4
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 4000,
3
- "best_metric": 0.17358896639123886,
4
- "best_model_checkpoint": "./byt5-dhivehi-correction/checkpoint-4000",
5
- "epoch": 0.3839877123932034,
6
  "eval_steps": 2000,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -310,6 +310,157 @@
310
  "eval_samples_per_second": 4.417,
311
  "eval_steps_per_second": 0.092,
312
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  }
314
  ],
315
  "logging_steps": 100,
@@ -329,7 +480,7 @@
329
  "attributes": {}
330
  }
331
  },
332
- "total_flos": 3.5275117263427584e+17,
333
  "train_batch_size": 48,
334
  "trial_name": null,
335
  "trial_params": null
 
1
  {
2
+ "best_global_step": 6000,
3
+ "best_metric": 0.1751142329795809,
4
+ "best_model_checkpoint": "./byt5-dhivehi-correction/checkpoint-6000",
5
+ "epoch": 0.5759815685898051,
6
  "eval_steps": 2000,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
310
  "eval_samples_per_second": 4.417,
311
  "eval_steps_per_second": 0.092,
312
  "step": 4000
313
+ },
314
+ {
315
+ "epoch": 0.3935874052030335,
316
+ "grad_norm": 0.08163648098707199,
317
+ "learning_rate": 4.1150290154421164e-05,
318
+ "loss": 0.0711,
319
+ "step": 4100
320
+ },
321
+ {
322
+ "epoch": 0.40318709801286357,
323
+ "grad_norm": 0.1291944980621338,
324
+ "learning_rate": 4.090439657716141e-05,
325
+ "loss": 0.0741,
326
+ "step": 4200
327
+ },
328
+ {
329
+ "epoch": 0.41278679082269365,
330
+ "grad_norm": 0.1216018944978714,
331
+ "learning_rate": 4.065850299990164e-05,
332
+ "loss": 0.0711,
333
+ "step": 4300
334
+ },
335
+ {
336
+ "epoch": 0.42238648363252373,
337
+ "grad_norm": 0.07729393988847733,
338
+ "learning_rate": 4.0412609422641886e-05,
339
+ "loss": 0.0711,
340
+ "step": 4400
341
+ },
342
+ {
343
+ "epoch": 0.4319861764423538,
344
+ "grad_norm": 0.06937304139137268,
345
+ "learning_rate": 4.016671584538212e-05,
346
+ "loss": 0.069,
347
+ "step": 4500
348
+ },
349
+ {
350
+ "epoch": 0.4415858692521839,
351
+ "grad_norm": 0.0750502273440361,
352
+ "learning_rate": 3.992082226812236e-05,
353
+ "loss": 0.0686,
354
+ "step": 4600
355
+ },
356
+ {
357
+ "epoch": 0.451185562062014,
358
+ "grad_norm": 0.06546641141176224,
359
+ "learning_rate": 3.9674928690862595e-05,
360
+ "loss": 0.0692,
361
+ "step": 4700
362
+ },
363
+ {
364
+ "epoch": 0.4607852548718441,
365
+ "grad_norm": 0.08090441673994064,
366
+ "learning_rate": 3.942903511360283e-05,
367
+ "loss": 0.0663,
368
+ "step": 4800
369
+ },
370
+ {
371
+ "epoch": 0.47038494768167416,
372
+ "grad_norm": 0.06444734334945679,
373
+ "learning_rate": 3.918314153634307e-05,
374
+ "loss": 0.0674,
375
+ "step": 4900
376
+ },
377
+ {
378
+ "epoch": 0.47998464049150424,
379
+ "grad_norm": 0.07890176773071289,
380
+ "learning_rate": 3.893724795908331e-05,
381
+ "loss": 0.0688,
382
+ "step": 5000
383
+ },
384
+ {
385
+ "epoch": 0.4895843333013344,
386
+ "grad_norm": 0.07319965213537216,
387
+ "learning_rate": 3.869135438182355e-05,
388
+ "loss": 0.0672,
389
+ "step": 5100
390
+ },
391
+ {
392
+ "epoch": 0.49918402611116447,
393
+ "grad_norm": 0.06573938578367233,
394
+ "learning_rate": 3.8445460804563786e-05,
395
+ "loss": 0.067,
396
+ "step": 5200
397
+ },
398
+ {
399
+ "epoch": 0.5087837189209945,
400
+ "grad_norm": 0.0633859932422638,
401
+ "learning_rate": 3.8199567227304025e-05,
402
+ "loss": 0.0652,
403
+ "step": 5300
404
+ },
405
+ {
406
+ "epoch": 0.5183834117308246,
407
+ "grad_norm": 0.07647623121738434,
408
+ "learning_rate": 3.795367365004426e-05,
409
+ "loss": 0.0644,
410
+ "step": 5400
411
+ },
412
+ {
413
+ "epoch": 0.5279831045406547,
414
+ "grad_norm": 0.052805762737989426,
415
+ "learning_rate": 3.77077800727845e-05,
416
+ "loss": 0.0649,
417
+ "step": 5500
418
+ },
419
+ {
420
+ "epoch": 0.5375827973504848,
421
+ "grad_norm": 0.061066884547472,
422
+ "learning_rate": 3.746188649552474e-05,
423
+ "loss": 0.0654,
424
+ "step": 5600
425
+ },
426
+ {
427
+ "epoch": 0.5471824901603148,
428
+ "grad_norm": 0.06936004012823105,
429
+ "learning_rate": 3.721599291826498e-05,
430
+ "loss": 0.0641,
431
+ "step": 5700
432
+ },
433
+ {
434
+ "epoch": 0.5567821829701449,
435
+ "grad_norm": 0.05201058089733124,
436
+ "learning_rate": 3.6970099341005216e-05,
437
+ "loss": 0.0633,
438
+ "step": 5800
439
+ },
440
+ {
441
+ "epoch": 0.566381875779975,
442
+ "grad_norm": 0.0771355852484703,
443
+ "learning_rate": 3.672420576374545e-05,
444
+ "loss": 0.0644,
445
+ "step": 5900
446
+ },
447
+ {
448
+ "epoch": 0.5759815685898051,
449
+ "grad_norm": 0.0716933161020279,
450
+ "learning_rate": 3.647831218648569e-05,
451
+ "loss": 0.062,
452
+ "step": 6000
453
+ },
454
+ {
455
+ "epoch": 0.5759815685898051,
456
+ "eval_loss": 0.05111023411154747,
457
+ "eval_rouge1": 0.17564387777922563,
458
+ "eval_rouge2": 0.07400287019608529,
459
+ "eval_rougeL": 0.1751142329795809,
460
+ "eval_runtime": 2273.0348,
461
+ "eval_samples_per_second": 4.399,
462
+ "eval_steps_per_second": 0.092,
463
+ "step": 6000
464
  }
465
  ],
466
  "logging_steps": 100,
 
480
  "attributes": {}
481
  }
482
  },
483
+ "total_flos": 5.291200836615168e+17,
484
  "train_batch_size": 48,
485
  "trial_name": null,
486
  "trial_params": null