Upload model from /mnt/disk5/gautam/post_training_pipelines/gemma1b_sft_toolcalling/checkpoint-1250
704043d verified | { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 200, | |
| "global_step": 1250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 88.0, | |
| "learning_rate": 3.6e-06, | |
| "loss": 2.6892, | |
| "mean_token_accuracy": 0.7017069637775422, | |
| "num_tokens": 17031.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 27.0, | |
| "learning_rate": 7.6e-06, | |
| "loss": 1.1388, | |
| "mean_token_accuracy": 0.8153024911880493, | |
| "num_tokens": 33010.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 1.16e-05, | |
| "loss": 0.634, | |
| "mean_token_accuracy": 0.8608590424060821, | |
| "num_tokens": 49690.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 1.56e-05, | |
| "loss": 0.4965, | |
| "mean_token_accuracy": 0.8881326198577881, | |
| "num_tokens": 66528.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 0.4615, | |
| "mean_token_accuracy": 0.8937213480472564, | |
| "num_tokens": 83663.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 2.36e-05, | |
| "loss": 0.4348, | |
| "mean_token_accuracy": 0.898058295249939, | |
| "num_tokens": 100010.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 8.75, | |
| "learning_rate": 2.7600000000000003e-05, | |
| "loss": 0.4548, | |
| "mean_token_accuracy": 0.892582792043686, | |
| "num_tokens": 117018.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.16e-05, | |
| "loss": 0.4401, | |
| "mean_token_accuracy": 0.8965379416942596, | |
| "num_tokens": 133104.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 7.25, | |
| "learning_rate": 3.56e-05, | |
| "loss": 0.448, | |
| "mean_token_accuracy": 0.8975786864757538, | |
| "num_tokens": 150048.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.875, | |
| "learning_rate": 3.960000000000001e-05, | |
| "loss": 0.3603, | |
| "mean_token_accuracy": 0.9116077303886414, | |
| "num_tokens": 166440.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 6.5, | |
| "learning_rate": 4.36e-05, | |
| "loss": 0.4385, | |
| "mean_token_accuracy": 0.8982407033443451, | |
| "num_tokens": 182539.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 4.76e-05, | |
| "loss": 0.4103, | |
| "mean_token_accuracy": 0.9048125684261322, | |
| "num_tokens": 200111.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 4.9998440375027166e-05, | |
| "loss": 0.424, | |
| "mean_token_accuracy": 0.8992080807685852, | |
| "num_tokens": 217757.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 10.125, | |
| "learning_rate": 4.998089682880117e-05, | |
| "loss": 0.3576, | |
| "mean_token_accuracy": 0.9198148787021637, | |
| "num_tokens": 234762.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 4.994387393067117e-05, | |
| "loss": 0.3929, | |
| "mean_token_accuracy": 0.9089428544044494, | |
| "num_tokens": 251850.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 8.875, | |
| "learning_rate": 4.988740054997943e-05, | |
| "loss": 0.3589, | |
| "mean_token_accuracy": 0.9174381196498871, | |
| "num_tokens": 268781.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 7.5, | |
| "learning_rate": 4.9811520722969465e-05, | |
| "loss": 0.3923, | |
| "mean_token_accuracy": 0.9071999669075013, | |
| "num_tokens": 285258.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 5.90625, | |
| "learning_rate": 4.971629361844785e-05, | |
| "loss": 0.3634, | |
| "mean_token_accuracy": 0.9199654579162597, | |
| "num_tokens": 301252.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 4.960179349164621e-05, | |
| "loss": 0.3777, | |
| "mean_token_accuracy": 0.9212145566940307, | |
| "num_tokens": 319196.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.625, | |
| "learning_rate": 4.946810962631916e-05, | |
| "loss": 0.3977, | |
| "mean_token_accuracy": 0.9068084478378295, | |
| "num_tokens": 335261.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 0.33250153064727783, | |
| "eval_mean_token_accuracy": 0.9264034070074558, | |
| "eval_num_tokens": 335261.0, | |
| "eval_runtime": 1.5704, | |
| "eval_samples_per_second": 318.394, | |
| "eval_steps_per_second": 20.377, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 6.6875, | |
| "learning_rate": 4.9315346265123594e-05, | |
| "loss": 0.4252, | |
| "mean_token_accuracy": 0.9102438390254974, | |
| "num_tokens": 351763.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 4.914362252833332e-05, | |
| "loss": 0.3439, | |
| "mean_token_accuracy": 0.9167511999607086, | |
| "num_tokens": 368897.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 6.125, | |
| "learning_rate": 4.8953072320952745e-05, | |
| "loss": 0.4426, | |
| "mean_token_accuracy": 0.9001888394355774, | |
| "num_tokens": 385037.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 6.6875, | |
| "learning_rate": 4.874384422830167e-05, | |
| "loss": 0.3259, | |
| "mean_token_accuracy": 0.9194521307945251, | |
| "num_tokens": 401098.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.1875, | |
| "learning_rate": 4.851610140015304e-05, | |
| "loss": 0.344, | |
| "mean_token_accuracy": 0.9198171913623809, | |
| "num_tokens": 417872.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 4.8270021423513554e-05, | |
| "loss": 0.2945, | |
| "mean_token_accuracy": 0.9273608386516571, | |
| "num_tokens": 434393.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 4.800579618414676e-05, | |
| "loss": 0.2993, | |
| "mean_token_accuracy": 0.930290675163269, | |
| "num_tokens": 451159.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 4.772363171694622e-05, | |
| "loss": 0.3248, | |
| "mean_token_accuracy": 0.9235591053962707, | |
| "num_tokens": 467623.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 4.742374804527575e-05, | |
| "loss": 0.3275, | |
| "mean_token_accuracy": 0.9240097522735595, | |
| "num_tokens": 484853.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 4.710637900940181e-05, | |
| "loss": 0.2941, | |
| "mean_token_accuracy": 0.9259559392929078, | |
| "num_tokens": 501511.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 4.6771772084151885e-05, | |
| "loss": 0.2647, | |
| "mean_token_accuracy": 0.9329790830612182, | |
| "num_tokens": 517741.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 4.875, | |
| "learning_rate": 4.642018818594107e-05, | |
| "loss": 0.2797, | |
| "mean_token_accuracy": 0.932785815000534, | |
| "num_tokens": 534206.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 4.375, | |
| "learning_rate": 4.605190146931731e-05, | |
| "loss": 0.3121, | |
| "mean_token_accuracy": 0.9296325027942658, | |
| "num_tokens": 550992.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 4.566719911318389e-05, | |
| "loss": 0.2286, | |
| "mean_token_accuracy": 0.9427693903446197, | |
| "num_tokens": 568295.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 4.5266381096866e-05, | |
| "loss": 0.2682, | |
| "mean_token_accuracy": 0.9327518224716187, | |
| "num_tokens": 584532.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 5.0, | |
| "learning_rate": 4.484975996619589e-05, | |
| "loss": 0.2396, | |
| "mean_token_accuracy": 0.9453687310218811, | |
| "num_tokens": 601813.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 4.441766058979898e-05, | |
| "loss": 0.23, | |
| "mean_token_accuracy": 0.9476732790470124, | |
| "num_tokens": 618224.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 4.3970419905771145e-05, | |
| "loss": 0.2564, | |
| "mean_token_accuracy": 0.9375335276126862, | |
| "num_tokens": 634072.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 4.350838665894446e-05, | |
| "loss": 0.2166, | |
| "mean_token_accuracy": 0.9469930708408356, | |
| "num_tokens": 649869.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 5.5625, | |
| "learning_rate": 4.303192112894652e-05, | |
| "loss": 0.2456, | |
| "mean_token_accuracy": 0.9419237852096558, | |
| "num_tokens": 666571.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 0.18274205923080444, | |
| "eval_mean_token_accuracy": 0.9541891273111105, | |
| "eval_num_tokens": 666571.0, | |
| "eval_runtime": 1.5188, | |
| "eval_samples_per_second": 329.211, | |
| "eval_steps_per_second": 21.07, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 4.254139484926519e-05, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.926877224445343, | |
| "num_tokens": 683304.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 4.2037190317538e-05, | |
| "loss": 0.2254, | |
| "mean_token_accuracy": 0.9466065883636474, | |
| "num_tokens": 700169.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 4.1519700697291944e-05, | |
| "loss": 0.2744, | |
| "mean_token_accuracy": 0.9352014183998107, | |
| "num_tokens": 716946.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 4.098932951136645e-05, | |
| "loss": 0.2672, | |
| "mean_token_accuracy": 0.9392520666122437, | |
| "num_tokens": 734193.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 4.375, | |
| "learning_rate": 4.044649032725836e-05, | |
| "loss": 0.2644, | |
| "mean_token_accuracy": 0.9324163973331452, | |
| "num_tokens": 751666.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.989160643463445e-05, | |
| "loss": 0.2335, | |
| "mean_token_accuracy": 0.9438653469085694, | |
| "num_tokens": 768213.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 4.25, | |
| "learning_rate": 3.932511051526289e-05, | |
| "loss": 0.246, | |
| "mean_token_accuracy": 0.943376499414444, | |
| "num_tokens": 784533.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 3.8747444305621e-05, | |
| "loss": 0.1937, | |
| "mean_token_accuracy": 0.9567820847034454, | |
| "num_tokens": 800972.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 3.8159058252442446e-05, | |
| "loss": 0.2455, | |
| "mean_token_accuracy": 0.9408983290195465, | |
| "num_tokens": 817357.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.7560411161472456e-05, | |
| "loss": 0.2191, | |
| "mean_token_accuracy": 0.9423716247081757, | |
| "num_tokens": 834267.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 3.695196983970481e-05, | |
| "loss": 0.1942, | |
| "mean_token_accuracy": 0.9473359823226929, | |
| "num_tokens": 851214.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 4.5, | |
| "learning_rate": 3.633420873137988e-05, | |
| "loss": 0.2162, | |
| "mean_token_accuracy": 0.9475499153137207, | |
| "num_tokens": 868271.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 3.570760954802726e-05, | |
| "loss": 0.2176, | |
| "mean_token_accuracy": 0.9434495270252228, | |
| "num_tokens": 884946.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 3.507266089284157e-05, | |
| "loss": 0.1941, | |
| "mean_token_accuracy": 0.9531172752380371, | |
| "num_tokens": 901712.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 3.442985787968442e-05, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9569245934486389, | |
| "num_tokens": 918042.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 3.3779701747009504e-05, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9552597820758819, | |
| "num_tokens": 934691.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 2.5, | |
| "learning_rate": 3.312269946701191e-05, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9504463136196136, | |
| "num_tokens": 950782.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.245936335030651e-05, | |
| "loss": 0.205, | |
| "mean_token_accuracy": 0.9523600995540619, | |
| "num_tokens": 967182.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 3.179021064644347e-05, | |
| "loss": 0.2042, | |
| "mean_token_accuracy": 0.9493149757385254, | |
| "num_tokens": 984880.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.111576314057268e-05, | |
| "loss": 0.1604, | |
| "mean_token_accuracy": 0.9593855381011963, | |
| "num_tokens": 1001725.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 0.09306449443101883, | |
| "eval_mean_token_accuracy": 0.9749966748058796, | |
| "eval_num_tokens": 1001725.0, | |
| "eval_runtime": 1.5212, | |
| "eval_samples_per_second": 328.696, | |
| "eval_steps_per_second": 21.037, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 3.0436546746571372e-05, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.9547972202301025, | |
| "num_tokens": 1018394.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 2.75, | |
| "learning_rate": 2.9753091096952255e-05, | |
| "loss": 0.1661, | |
| "mean_token_accuracy": 0.9591395735740662, | |
| "num_tokens": 1035383.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.9065929129872094e-05, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9689013838768006, | |
| "num_tokens": 1051803.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 2.8375596673562482e-05, | |
| "loss": 0.1067, | |
| "mean_token_accuracy": 0.9707406878471374, | |
| "num_tokens": 1068069.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 2.7682632028507167e-05, | |
| "loss": 0.1374, | |
| "mean_token_accuracy": 0.9712687909603119, | |
| "num_tokens": 1084576.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.6987575547691497e-05, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9707829177379608, | |
| "num_tokens": 1101469.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 2.6290969215251416e-05, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9751771628856659, | |
| "num_tokens": 1117983.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.559335622385055e-05, | |
| "loss": 0.0802, | |
| "mean_token_accuracy": 0.9767919540405273, | |
| "num_tokens": 1133632.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.4895280551114907e-05, | |
| "loss": 0.0784, | |
| "mean_token_accuracy": 0.9760872542858123, | |
| "num_tokens": 1150894.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.4197286535455464e-05, | |
| "loss": 0.0734, | |
| "mean_token_accuracy": 0.9789025604724884, | |
| "num_tokens": 1167636.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 2.349991845160949e-05, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9734372615814209, | |
| "num_tokens": 1184462.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.280372008623142e-05, | |
| "loss": 0.0742, | |
| "mean_token_accuracy": 0.9806265234947205, | |
| "num_tokens": 1200572.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.2109234313864465e-05, | |
| "loss": 0.0652, | |
| "mean_token_accuracy": 0.9816279947757721, | |
| "num_tokens": 1217166.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.1417002673623264e-05, | |
| "loss": 0.0823, | |
| "mean_token_accuracy": 0.9744715809822082, | |
| "num_tokens": 1233948.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.0727564946918087e-05, | |
| "loss": 0.0859, | |
| "mean_token_accuracy": 0.9752485632896424, | |
| "num_tokens": 1250399.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 2.004145873654942e-05, | |
| "loss": 0.0712, | |
| "mean_token_accuracy": 0.9803094148635865, | |
| "num_tokens": 1266954.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 1.9359219047501565e-05, | |
| "loss": 0.0824, | |
| "mean_token_accuracy": 0.9807477355003357, | |
| "num_tokens": 1284040.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.868137786976177e-05, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.9721530199050903, | |
| "num_tokens": 1300599.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 1.800846376349051e-05, | |
| "loss": 0.0617, | |
| "mean_token_accuracy": 0.9832390666007995, | |
| "num_tokens": 1318418.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.73410014468661e-05, | |
| "loss": 0.0817, | |
| "mean_token_accuracy": 0.9761280298233033, | |
| "num_tokens": 1334668.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_loss": 0.06361612677574158, | |
| "eval_mean_token_accuracy": 0.9822994004935026, | |
| "eval_num_tokens": 1334668.0, | |
| "eval_runtime": 1.5207, | |
| "eval_samples_per_second": 328.801, | |
| "eval_steps_per_second": 21.043, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 1.6679511386925337e-05, | |
| "loss": 0.0918, | |
| "mean_token_accuracy": 0.9741989850997925, | |
| "num_tokens": 1351685.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.6024509393718844e-05, | |
| "loss": 0.057, | |
| "mean_token_accuracy": 0.9829799175262451, | |
| "num_tokens": 1368068.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.5376506218098015e-05, | |
| "loss": 0.0755, | |
| "mean_token_accuracy": 0.9813410580158234, | |
| "num_tokens": 1384459.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.4736007153446801e-05, | |
| "loss": 0.0774, | |
| "mean_token_accuracy": 0.9792020142078399, | |
| "num_tokens": 1401056.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.4103511641669152e-05, | |
| "loss": 0.0727, | |
| "mean_token_accuracy": 0.9780106008052826, | |
| "num_tokens": 1417652.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.3479512883739232e-05, | |
| "loss": 0.062, | |
| "mean_token_accuracy": 0.983563768863678, | |
| "num_tokens": 1434175.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.2864497455118152e-05, | |
| "loss": 0.0732, | |
| "mean_token_accuracy": 0.9788140952587128, | |
| "num_tokens": 1451444.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.2258944926337057e-05, | |
| "loss": 0.0736, | |
| "mean_token_accuracy": 0.9813082575798034, | |
| "num_tokens": 1468191.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.1663327489042435e-05, | |
| "loss": 0.0739, | |
| "mean_token_accuracy": 0.9783754467964172, | |
| "num_tokens": 1485576.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.107810958779531e-05, | |
| "loss": 0.0825, | |
| "mean_token_accuracy": 0.9774186849594116, | |
| "num_tokens": 1502510.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.050374755791127e-05, | |
| "loss": 0.0785, | |
| "mean_token_accuracy": 0.98012655377388, | |
| "num_tokens": 1519612.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 9.94068926962404e-06, | |
| "loss": 0.0552, | |
| "mean_token_accuracy": 0.9845038235187531, | |
| "num_tokens": 1536102.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.389373778849612e-06, | |
| "loss": 0.0811, | |
| "mean_token_accuracy": 0.9776850759983062, | |
| "num_tokens": 1552779.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.850230984823735e-06, | |
| "loss": 0.0705, | |
| "mean_token_accuracy": 0.9811655461788178, | |
| "num_tokens": 1569305.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 8.323681294879394e-06, | |
| "loss": 0.0866, | |
| "mean_token_accuracy": 0.9734682440757751, | |
| "num_tokens": 1586364.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 7.810135296625818e-06, | |
| "loss": 0.0661, | |
| "mean_token_accuracy": 0.9802649438381195, | |
| "num_tokens": 1603137.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 7.309993437784624e-06, | |
| "loss": 0.0934, | |
| "mean_token_accuracy": 0.9778239965438843, | |
| "num_tokens": 1619134.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 6.823645713932708e-06, | |
| "loss": 0.0684, | |
| "mean_token_accuracy": 0.9809476971626282, | |
| "num_tokens": 1636046.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.3514713643954475e-06, | |
| "loss": 0.107, | |
| "mean_token_accuracy": 0.9713779807090759, | |
| "num_tokens": 1652978.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.893838576527275e-06, | |
| "loss": 0.0816, | |
| "mean_token_accuracy": 0.978422349691391, | |
| "num_tokens": 1669667.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.04640455171465874, | |
| "eval_mean_token_accuracy": 0.9878480397164822, | |
| "eval_num_tokens": 1669667.0, | |
| "eval_runtime": 1.5193, | |
| "eval_samples_per_second": 329.104, | |
| "eval_steps_per_second": 21.063, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.451104198610249e-06, | |
| "loss": 0.0636, | |
| "mean_token_accuracy": 0.9817108273506164, | |
| "num_tokens": 1686097.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.023613461594512e-06, | |
| "loss": 0.0556, | |
| "mean_token_accuracy": 0.9840249598026276, | |
| "num_tokens": 1703096.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.6116997098975465e-06, | |
| "loss": 0.0693, | |
| "mean_token_accuracy": 0.9816091299057007, | |
| "num_tokens": 1719554.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 4.215684141472292e-06, | |
| "loss": 0.1045, | |
| "mean_token_accuracy": 0.9720765173435211, | |
| "num_tokens": 1736865.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 2.0, | |
| "learning_rate": 3.835875557346552e-06, | |
| "loss": 0.0697, | |
| "mean_token_accuracy": 0.9806360006332397, | |
| "num_tokens": 1753684.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.4725701208293435e-06, | |
| "loss": 0.0571, | |
| "mean_token_accuracy": 0.9828337907791138, | |
| "num_tokens": 1770452.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 3.126051126571561e-06, | |
| "loss": 0.0824, | |
| "mean_token_accuracy": 0.9765742599964142, | |
| "num_tokens": 1786679.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 2.7965887796613884e-06, | |
| "loss": 0.0821, | |
| "mean_token_accuracy": 0.9753875434398651, | |
| "num_tokens": 1803322.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 2.4844399849264928e-06, | |
| "loss": 0.0374, | |
| "mean_token_accuracy": 0.9889423012733459, | |
| "num_tokens": 1819913.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 2.189848146607348e-06, | |
| "loss": 0.0505, | |
| "mean_token_accuracy": 0.9857592463493348, | |
| "num_tokens": 1836798.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.913042978557944e-06, | |
| "loss": 0.077, | |
| "mean_token_accuracy": 0.9813314735889435, | |
| "num_tokens": 1853781.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.654240325121831e-06, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9749953150749207, | |
| "num_tokens": 1870349.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 1.4136419928231892e-06, | |
| "loss": 0.0548, | |
| "mean_token_accuracy": 0.982354861497879, | |
| "num_tokens": 1886606.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.1914355930041837e-06, | |
| "loss": 0.0547, | |
| "mean_token_accuracy": 0.9842985808849335, | |
| "num_tokens": 1903278.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 9.877943955312552e-07, | |
| "loss": 0.0692, | |
| "mean_token_accuracy": 0.9810509204864502, | |
| "num_tokens": 1919800.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 8.028771936845342e-07, | |
| "loss": 0.084, | |
| "mean_token_accuracy": 0.9769620358943939, | |
| "num_tokens": 1935929.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 6.368281803355691e-07, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.9759830415248871, | |
| "num_tokens": 1952544.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 4.897768355101084e-07, | |
| "loss": 0.0994, | |
| "mean_token_accuracy": 0.9729955196380615, | |
| "num_tokens": 1969130.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.6183782542343057e-07, | |
| "loss": 0.0607, | |
| "mean_token_accuracy": 0.9826790690422058, | |
| "num_tokens": 1985369.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 2.531109130671061e-07, | |
| "loss": 0.0704, | |
| "mean_token_accuracy": 0.9832096040248871, | |
| "num_tokens": 2001651.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_loss": 0.044618841260671616, | |
| "eval_mean_token_accuracy": 0.988185340538621, | |
| "eval_num_tokens": 2001651.0, | |
| "eval_runtime": 1.552, | |
| "eval_samples_per_second": 322.163, | |
| "eval_steps_per_second": 20.618, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.6368088041681108e-07, | |
| "loss": 0.0618, | |
| "mean_token_accuracy": 0.983083826303482, | |
| "num_tokens": 2017844.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 9.361746232188495e-08, | |
| "loss": 0.0713, | |
| "mean_token_accuracy": 0.9801405131816864, | |
| "num_tokens": 2034789.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 4.2975292128200064e-08, | |
| "loss": 0.06, | |
| "mean_token_accuracy": 0.9813045263290405, | |
| "num_tokens": 2051869.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.179385907672248e-08, | |
| "loss": 0.0773, | |
| "mean_token_accuracy": 0.9785664975643158, | |
| "num_tokens": 2069158.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.747751098521107e-11, | |
| "loss": 0.0512, | |
| "mean_token_accuracy": 0.9852560698986054, | |
| "num_tokens": 2085734.0, | |
| "step": 1250 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6043078390513664e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |