MRBSTUDIO committed on
Commit
ab2fbfe
·
verified ·
1 Parent(s): 9a801b0

sft-6900-step

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5afe2a027d395f551dd07fb95d989917901b4fc4d9a3f699bcf96a1a0c52045c
3
  size 698419728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2ffc6552dcc71c20350838f8a219506181ba46c38f659ec852bba2bddda4dfc
3
  size 698419728
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc2f55384d96aef8c36f040c4c76bba8c886457259f6de979d38405706b46bfd
3
  size 1397136587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42b8045c238ff4f74f9e3fe7c94d27857f2fadb3c697f6661bb73fb9bb04a576
3
  size 1397136587
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14f4ff77bbbe606f2785ea0f04a2535a7c54901e7091c0fed26a6e7b85eda9ae
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be73d303d67b9e1d37ae52f58cd2c7c7c5aeb597a44b9c72b8875cd9acb7be14
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08804d0a21d8df191a267fdfad60532380afcf3b04182506e89486e7b012afc5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff36b79323a1d28015f7255a86b9e604fcdba024c3e96bcdca5e4c7054b0293
3
  size 1465
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 6700,
3
- "best_metric": 1.140816330909729,
4
- "best_model_checkpoint": "/workspace/project_2026_1/checkpoints/sft/checkpoint-6700",
5
- "epoch": 1.9711679905854664,
6
  "eval_steps": 100,
7
- "global_step": 6700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7445,6 +7445,228 @@
7445
  "eval_samples_per_second": 26.029,
7446
  "eval_steps_per_second": 3.257,
7447
  "step": 6700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7448
  }
7449
  ],
7450
  "logging_steps": 10,
@@ -7464,7 +7686,7 @@
7464
  "attributes": {}
7465
  }
7466
  },
7467
- "total_flos": 1.2570357073262346e+18,
7468
  "train_batch_size": 8,
7469
  "trial_name": null,
7470
  "trial_params": null
 
1
  {
2
+ "best_global_step": 6800,
3
+ "best_metric": 1.1395292282104492,
4
+ "best_model_checkpoint": "/workspace/project_2026_1/checkpoints/sft/checkpoint-6800",
5
+ "epoch": 2.030008826125331,
6
  "eval_steps": 100,
7
+ "global_step": 6900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7445
  "eval_samples_per_second": 26.029,
7446
  "eval_steps_per_second": 3.257,
7447
  "step": 6700
7448
+ },
7449
+ {
7450
+ "entropy": 0.9509491920471191,
7451
+ "epoch": 1.9741100323624594,
7452
+ "grad_norm": 0.6006605625152588,
7453
+ "learning_rate": 5.744202611276379e-05,
7454
+ "loss": 0.9503057479858399,
7455
+ "mean_token_accuracy": 0.7786516189575196,
7456
+ "num_tokens": 27097949.0,
7457
+ "step": 6710
7458
+ },
7459
+ {
7460
+ "entropy": 1.0138035595417023,
7461
+ "epoch": 1.9770520741394528,
7462
+ "grad_norm": 0.5902991890907288,
7463
+ "learning_rate": 5.7148775369783694e-05,
7464
+ "loss": 1.0296749114990233,
7465
+ "mean_token_accuracy": 0.7590757310390472,
7466
+ "num_tokens": 27138453.0,
7467
+ "step": 6720
7468
+ },
7469
+ {
7470
+ "entropy": 0.933520519733429,
7471
+ "epoch": 1.979994115916446,
7472
+ "grad_norm": 0.5484936833381653,
7473
+ "learning_rate": 5.685597532311455e-05,
7474
+ "loss": 0.957374095916748,
7475
+ "mean_token_accuracy": 0.7793904483318329,
7476
+ "num_tokens": 27178805.0,
7477
+ "step": 6730
7478
+ },
7479
+ {
7480
+ "entropy": 0.9440324783325196,
7481
+ "epoch": 1.9829361576934392,
7482
+ "grad_norm": 0.5826029777526855,
7483
+ "learning_rate": 5.656362905233923e-05,
7484
+ "loss": 0.9262220382690429,
7485
+ "mean_token_accuracy": 0.7845340669155121,
7486
+ "num_tokens": 27219347.0,
7487
+ "step": 6740
7488
+ },
7489
+ {
7490
+ "entropy": 0.9071877419948577,
7491
+ "epoch": 1.9858781994704324,
7492
+ "grad_norm": 0.5721964836120605,
7493
+ "learning_rate": 5.6271739632268094e-05,
7494
+ "loss": 0.9060114860534668,
7495
+ "mean_token_accuracy": 0.7890908360481262,
7496
+ "num_tokens": 27258890.0,
7497
+ "step": 6750
7498
+ },
7499
+ {
7500
+ "entropy": 0.9562793612480164,
7501
+ "epoch": 1.9888202412474256,
7502
+ "grad_norm": 0.614380955696106,
7503
+ "learning_rate": 5.598031013290631e-05,
7504
+ "loss": 0.9876157760620117,
7505
+ "mean_token_accuracy": 0.768429833650589,
7506
+ "num_tokens": 27299053.0,
7507
+ "step": 6760
7508
+ },
7509
+ {
7510
+ "entropy": 0.9924969553947449,
7511
+ "epoch": 1.991762283024419,
7512
+ "grad_norm": 0.6030513644218445,
7513
+ "learning_rate": 5.5689343619421906e-05,
7514
+ "loss": 0.9977625846862793,
7515
+ "mean_token_accuracy": 0.7658666670322418,
7516
+ "num_tokens": 27339515.0,
7517
+ "step": 6770
7518
+ },
7519
+ {
7520
+ "entropy": 0.9534170269966126,
7521
+ "epoch": 1.994704324801412,
7522
+ "grad_norm": 0.5039950609207153,
7523
+ "learning_rate": 5.539884315211321e-05,
7524
+ "loss": 0.9545814514160156,
7525
+ "mean_token_accuracy": 0.7779964745044708,
7526
+ "num_tokens": 27379693.0,
7527
+ "step": 6780
7528
+ },
7529
+ {
7530
+ "entropy": 0.9789716601371765,
7531
+ "epoch": 1.9976463665784054,
7532
+ "grad_norm": 0.5822030305862427,
7533
+ "learning_rate": 5.5108811786376925e-05,
7534
+ "loss": 0.9928366661071777,
7535
+ "mean_token_accuracy": 0.7682704031467438,
7536
+ "num_tokens": 27419734.0,
7537
+ "step": 6790
7538
+ },
7539
+ {
7540
+ "entropy": 0.915216040611267,
7541
+ "epoch": 2.000588408355399,
7542
+ "grad_norm": 0.4654218554496765,
7543
+ "learning_rate": 5.481925257267589e-05,
7544
+ "loss": 0.8871613502502441,
7545
+ "mean_token_accuracy": 0.7920856356620789,
7546
+ "num_tokens": 27458303.0,
7547
+ "step": 6800
7548
+ },
7549
+ {
7550
+ "epoch": 2.000588408355399,
7551
+ "eval_entropy": 0.9942471109663095,
7552
+ "eval_loss": 1.1395292282104492,
7553
+ "eval_mean_token_accuracy": 0.7511763375575148,
7554
+ "eval_num_tokens": 27458303.0,
7555
+ "eval_runtime": 116.8845,
7556
+ "eval_samples_per_second": 26.051,
7557
+ "eval_steps_per_second": 3.26,
7558
+ "step": 6800
7559
+ },
7560
+ {
7561
+ "entropy": 0.7543269693851471,
7562
+ "epoch": 2.003530450132392,
7563
+ "grad_norm": 0.6209985613822937,
7564
+ "learning_rate": 5.4530168556506875e-05,
7565
+ "loss": 0.6749869823455811,
7566
+ "mean_token_accuracy": 0.8347735464572906,
7567
+ "num_tokens": 27498607.0,
7568
+ "step": 6810
7569
+ },
7570
+ {
7571
+ "entropy": 0.6835850536823272,
7572
+ "epoch": 2.0064724919093853,
7573
+ "grad_norm": 0.781541109085083,
7574
+ "learning_rate": 5.424156277836881e-05,
7575
+ "loss": 0.6951170921325683,
7576
+ "mean_token_accuracy": 0.8288436651229858,
7577
+ "num_tokens": 27538904.0,
7578
+ "step": 6820
7579
+ },
7580
+ {
7581
+ "entropy": 0.6437631964683532,
7582
+ "epoch": 2.0094145336863782,
7583
+ "grad_norm": 0.8998324871063232,
7584
+ "learning_rate": 5.395343827373053e-05,
7585
+ "loss": 0.6296420574188233,
7586
+ "mean_token_accuracy": 0.8461188077926636,
7587
+ "num_tokens": 27579223.0,
7588
+ "step": 6830
7589
+ },
7590
+ {
7591
+ "entropy": 0.6127074956893921,
7592
+ "epoch": 2.0123565754633717,
7593
+ "grad_norm": 0.6167740225791931,
7594
+ "learning_rate": 5.366579807299909e-05,
7595
+ "loss": 0.5965664386749268,
7596
+ "mean_token_accuracy": 0.850104957818985,
7597
+ "num_tokens": 27619638.0,
7598
+ "step": 6840
7599
+ },
7600
+ {
7601
+ "entropy": 0.6964607417583466,
7602
+ "epoch": 2.0152986172403646,
7603
+ "grad_norm": 0.637476921081543,
7604
+ "learning_rate": 5.337864520148768e-05,
7605
+ "loss": 0.6968545913696289,
7606
+ "mean_token_accuracy": 0.8300110459327698,
7607
+ "num_tokens": 27660158.0,
7608
+ "step": 6850
7609
+ },
7610
+ {
7611
+ "entropy": 0.6738093435764313,
7612
+ "epoch": 2.018240659017358,
7613
+ "grad_norm": 0.7894798517227173,
7614
+ "learning_rate": 5.309198267938402e-05,
7615
+ "loss": 0.6670093059539794,
7616
+ "mean_token_accuracy": 0.8377935826778412,
7617
+ "num_tokens": 27700212.0,
7618
+ "step": 6860
7619
+ },
7620
+ {
7621
+ "entropy": 0.6280623555183411,
7622
+ "epoch": 2.0211827007943515,
7623
+ "grad_norm": 0.80244380235672,
7624
+ "learning_rate": 5.280581352171836e-05,
7625
+ "loss": 0.6267249107360839,
7626
+ "mean_token_accuracy": 0.8437743067741394,
7627
+ "num_tokens": 27740554.0,
7628
+ "step": 6870
7629
+ },
7630
+ {
7631
+ "entropy": 0.6882079899311065,
7632
+ "epoch": 2.0241247425713444,
7633
+ "grad_norm": 0.7488958835601807,
7634
+ "learning_rate": 5.2520140738332025e-05,
7635
+ "loss": 0.6897297382354737,
7636
+ "mean_token_accuracy": 0.8309988558292389,
7637
+ "num_tokens": 27781034.0,
7638
+ "step": 6880
7639
+ },
7640
+ {
7641
+ "entropy": 0.676528149843216,
7642
+ "epoch": 2.027066784348338,
7643
+ "grad_norm": 0.8301676511764526,
7644
+ "learning_rate": 5.2234967333845466e-05,
7645
+ "loss": 0.6622447490692138,
7646
+ "mean_token_accuracy": 0.8345989942550659,
7647
+ "num_tokens": 27821579.0,
7648
+ "step": 6890
7649
+ },
7650
+ {
7651
+ "entropy": 0.6388787865638733,
7652
+ "epoch": 2.030008826125331,
7653
+ "grad_norm": 0.7029614448547363,
7654
+ "learning_rate": 5.1950296307626956e-05,
7655
+ "loss": 0.6487605571746826,
7656
+ "mean_token_accuracy": 0.8400563955307007,
7657
+ "num_tokens": 27861899.0,
7658
+ "step": 6900
7659
+ },
7660
+ {
7661
+ "epoch": 2.030008826125331,
7662
+ "eval_entropy": 0.8397969613707285,
7663
+ "eval_loss": 1.2236672639846802,
7664
+ "eval_mean_token_accuracy": 0.7469185830101254,
7665
+ "eval_num_tokens": 27861899.0,
7666
+ "eval_runtime": 116.8259,
7667
+ "eval_samples_per_second": 26.064,
7668
+ "eval_steps_per_second": 3.261,
7669
+ "step": 6900
7670
  }
7671
  ],
7672
  "logging_steps": 10,
 
7686
  "attributes": {}
7687
  }
7688
  },
7689
+ "total_flos": 1.2944070017481708e+18,
7690
  "train_batch_size": 8,
7691
  "trial_name": null,
7692
  "trial_params": null