aghatage commited on
Commit
d13ff70
·
verified ·
1 Parent(s): c6810d0

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a4ad0fda5a25c3ff587c8fc3613dad924a91714c4d11cefa4533c354410f3d2
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e3e9ee8a611e7a57dccff25563a008747ed15810194baa91980ef853c11a0a7
3
  size 12017472
last-checkpoint/global_step1500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:390228104a24054ada0fb3d185b75e1ee48590db75d7869429819a9c9ab05d18
3
+ size 71982309
last-checkpoint/global_step1500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efc5ccbefb5e21781792c66a4a01865f2af9fb2a9db01cede684299d60e76df6
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step1000
 
1
+ global_step1500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:162ea028bad2d13be08d5ca87f053c3d58c23e084a4fa2f7cf1e9194c8781bcb
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27afcafd6ed5692d8873208ba0cf57e46a0701e5eb0aa08cd9750d1e2b88cb5d
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 1000,
3
- "best_metric": 0.7203578948974609,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-1000",
5
- "epoch": 0.7271405199054717,
6
  "eval_steps": 250,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -417,6 +417,206 @@
417
  "eval_samples_per_second": 43.127,
418
  "eval_steps_per_second": 5.397,
419
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  }
421
  ],
422
  "logging_steps": 25,
@@ -436,7 +636,7 @@
436
  "attributes": {}
437
  }
438
  },
439
- "total_flos": 5.564104967821722e+16,
440
  "train_batch_size": 4,
441
  "trial_name": null,
442
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1500,
3
+ "best_metric": 0.6834071278572083,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-1500",
5
+ "epoch": 1.0901654244682786,
6
  "eval_steps": 250,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
417
  "eval_samples_per_second": 43.127,
418
  "eval_steps_per_second": 5.397,
419
  "step": 1000
420
+ },
421
+ {
422
+ "epoch": 0.7453190329031085,
423
+ "grad_norm": 1.012831449508667,
424
+ "learning_rate": 7.937460379745742e-05,
425
+ "loss": 0.7165,
426
+ "mean_token_accuracy": 0.781532918214798,
427
+ "num_tokens": 22623696.0,
428
+ "step": 1025
429
+ },
430
+ {
431
+ "epoch": 0.7634975459007454,
432
+ "grad_norm": 1.1071596145629883,
433
+ "learning_rate": 7.934043623246033e-05,
434
+ "loss": 0.7161,
435
+ "mean_token_accuracy": 0.7818375152349472,
436
+ "num_tokens": 23152552.0,
437
+ "step": 1050
438
+ },
439
+ {
440
+ "epoch": 0.7816760588983821,
441
+ "grad_norm": 0.9036078453063965,
442
+ "learning_rate": 7.930536772908241e-05,
443
+ "loss": 0.7191,
444
+ "mean_token_accuracy": 0.7791232514381409,
445
+ "num_tokens": 23710635.0,
446
+ "step": 1075
447
+ },
448
+ {
449
+ "epoch": 0.7998545718960189,
450
+ "grad_norm": 0.9387621283531189,
451
+ "learning_rate": 7.926939909043018e-05,
452
+ "loss": 0.7201,
453
+ "mean_token_accuracy": 0.7794615784287453,
454
+ "num_tokens": 24277083.0,
455
+ "step": 1100
456
+ },
457
+ {
458
+ "epoch": 0.8180330848936557,
459
+ "grad_norm": 0.9713501930236816,
460
+ "learning_rate": 7.92325311402242e-05,
461
+ "loss": 0.7103,
462
+ "mean_token_accuracy": 0.7832019272446632,
463
+ "num_tokens": 24817217.0,
464
+ "step": 1125
465
+ },
466
+ {
467
+ "epoch": 0.8362115978912925,
468
+ "grad_norm": 0.9385744333267212,
469
+ "learning_rate": 7.919476472278026e-05,
470
+ "loss": 0.7121,
471
+ "mean_token_accuracy": 0.782238809466362,
472
+ "num_tokens": 25368089.0,
473
+ "step": 1150
474
+ },
475
+ {
476
+ "epoch": 0.8543901108889292,
477
+ "grad_norm": 0.9703376293182373,
478
+ "learning_rate": 7.915610070299001e-05,
479
+ "loss": 0.7073,
480
+ "mean_token_accuracy": 0.7827208504080773,
481
+ "num_tokens": 25912613.0,
482
+ "step": 1175
483
+ },
484
+ {
485
+ "epoch": 0.8725686238865661,
486
+ "grad_norm": 1.0707026720046997,
487
+ "learning_rate": 7.911653996630121e-05,
488
+ "loss": 0.7048,
489
+ "mean_token_accuracy": 0.7837258630990982,
490
+ "num_tokens": 26466244.0,
491
+ "step": 1200
492
+ },
493
+ {
494
+ "epoch": 0.8907471368842028,
495
+ "grad_norm": 1.0107282400131226,
496
+ "learning_rate": 7.90760834186973e-05,
497
+ "loss": 0.6945,
498
+ "mean_token_accuracy": 0.7874345901608467,
499
+ "num_tokens": 27009212.0,
500
+ "step": 1225
501
+ },
502
+ {
503
+ "epoch": 0.9089256498818397,
504
+ "grad_norm": 0.9038238525390625,
505
+ "learning_rate": 7.903473198667684e-05,
506
+ "loss": 0.7011,
507
+ "mean_token_accuracy": 0.7847383853793144,
508
+ "num_tokens": 27551777.0,
509
+ "step": 1250
510
+ },
511
+ {
512
+ "epoch": 0.9089256498818397,
513
+ "eval_loss": 0.6989394426345825,
514
+ "eval_mean_token_accuracy": 0.7840953680619694,
515
+ "eval_num_tokens": 27551777.0,
516
+ "eval_runtime": 113.5783,
517
+ "eval_samples_per_second": 43.054,
518
+ "eval_steps_per_second": 5.388,
519
+ "step": 1250
520
+ },
521
+ {
522
+ "epoch": 0.9271041628794765,
523
+ "grad_norm": 1.009592890739441,
524
+ "learning_rate": 7.899248661723218e-05,
525
+ "loss": 0.702,
526
+ "mean_token_accuracy": 0.7847409224510193,
527
+ "num_tokens": 28112347.0,
528
+ "step": 1275
529
+ },
530
+ {
531
+ "epoch": 0.9452826758771132,
532
+ "grad_norm": 1.1236053705215454,
533
+ "learning_rate": 7.894934827782781e-05,
534
+ "loss": 0.6992,
535
+ "mean_token_accuracy": 0.7860203450918197,
536
+ "num_tokens": 28650826.0,
537
+ "step": 1300
538
+ },
539
+ {
540
+ "epoch": 0.9634611888747501,
541
+ "grad_norm": 1.0439739227294922,
542
+ "learning_rate": 7.890531795637816e-05,
543
+ "loss": 0.6937,
544
+ "mean_token_accuracy": 0.7889308288693428,
545
+ "num_tokens": 29182993.0,
546
+ "step": 1325
547
+ },
548
+ {
549
+ "epoch": 0.9816397018723868,
550
+ "grad_norm": 1.2452131509780884,
551
+ "learning_rate": 7.886039666122508e-05,
552
+ "loss": 0.6941,
553
+ "mean_token_accuracy": 0.7871252146363258,
554
+ "num_tokens": 29740267.0,
555
+ "step": 1350
556
+ },
557
+ {
558
+ "epoch": 0.9998182148700236,
559
+ "grad_norm": 1.0504438877105713,
560
+ "learning_rate": 7.881458542111457e-05,
561
+ "loss": 0.6888,
562
+ "mean_token_accuracy": 0.7882778728008271,
563
+ "num_tokens": 30296922.0,
564
+ "step": 1375
565
+ },
566
+ {
567
+ "epoch": 1.0174513724777314,
568
+ "grad_norm": 0.949993908405304,
569
+ "learning_rate": 7.876788528517346e-05,
570
+ "loss": 0.6937,
571
+ "mean_token_accuracy": 0.7869942234349006,
572
+ "num_tokens": 30828543.0,
573
+ "step": 1400
574
+ },
575
+ {
576
+ "epoch": 1.035629885475368,
577
+ "grad_norm": 0.9186608195304871,
578
+ "learning_rate": 7.872029732288515e-05,
579
+ "loss": 0.6912,
580
+ "mean_token_accuracy": 0.7877016308903694,
581
+ "num_tokens": 31382166.0,
582
+ "step": 1425
583
+ },
584
+ {
585
+ "epoch": 1.053808398473005,
586
+ "grad_norm": 0.9520437121391296,
587
+ "learning_rate": 7.867182262406524e-05,
588
+ "loss": 0.6848,
589
+ "mean_token_accuracy": 0.789347026348114,
590
+ "num_tokens": 31936482.0,
591
+ "step": 1450
592
+ },
593
+ {
594
+ "epoch": 1.0719869114706417,
595
+ "grad_norm": 0.9007825255393982,
596
+ "learning_rate": 7.86224622988366e-05,
597
+ "loss": 0.6915,
598
+ "mean_token_accuracy": 0.7874828764796257,
599
+ "num_tokens": 32491054.0,
600
+ "step": 1475
601
+ },
602
+ {
603
+ "epoch": 1.0901654244682786,
604
+ "grad_norm": 0.9098522067070007,
605
+ "learning_rate": 7.857221747760384e-05,
606
+ "loss": 0.6791,
607
+ "mean_token_accuracy": 0.7907387048006058,
608
+ "num_tokens": 33049696.0,
609
+ "step": 1500
610
+ },
611
+ {
612
+ "epoch": 1.0901654244682786,
613
+ "eval_loss": 0.6834071278572083,
614
+ "eval_mean_token_accuracy": 0.788765495509104,
615
+ "eval_num_tokens": 33049696.0,
616
+ "eval_runtime": 113.3067,
617
+ "eval_samples_per_second": 43.157,
618
+ "eval_steps_per_second": 5.401,
619
+ "step": 1500
620
  }
621
  ],
622
  "logging_steps": 25,
 
636
  "attributes": {}
637
  }
638
  },
639
+ "total_flos": 8.33876308084654e+16,
640
  "train_batch_size": 4,
641
  "trial_name": null,
642
  "trial_params": null