{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.998185117967331, "eval_steps": 100, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14519056261343014, "grad_norm": 20.98495864868164, "learning_rate": 5.882352941176471e-07, "loss": 9.4496, "step": 10 }, { "epoch": 0.29038112522686027, "grad_norm": 16.1613712310791, "learning_rate": 1.1764705882352942e-06, "loss": 8.8125, "step": 20 }, { "epoch": 0.4355716878402904, "grad_norm": 6.7023138999938965, "learning_rate": 1.7647058823529414e-06, "loss": 7.7501, "step": 30 }, { "epoch": 0.5807622504537205, "grad_norm": 4.253345489501953, "learning_rate": 2.3529411764705885e-06, "loss": 7.0378, "step": 40 }, { "epoch": 0.7259528130671506, "grad_norm": 2.4256012439727783, "learning_rate": 2.9411764705882355e-06, "loss": 6.6558, "step": 50 }, { "epoch": 0.8711433756805808, "grad_norm": 1.664031982421875, "learning_rate": 3.529411764705883e-06, "loss": 6.3545, "step": 60 }, { "epoch": 1.0181488203266789, "grad_norm": 1.206116795539856, "learning_rate": 4.11764705882353e-06, "loss": 6.2306, "step": 70 }, { "epoch": 1.1633393829401089, "grad_norm": 1.1027638912200928, "learning_rate": 4.705882352941177e-06, "loss": 5.8555, "step": 80 }, { "epoch": 1.308529945553539, "grad_norm": 1.2014143466949463, "learning_rate": 5.294117647058824e-06, "loss": 5.6277, "step": 90 }, { "epoch": 1.453720508166969, "grad_norm": 1.3407199382781982, "learning_rate": 5.882352941176471e-06, "loss": 5.4611, "step": 100 }, { "epoch": 1.453720508166969, "eval_loss": 5.380606174468994, "eval_runtime": 14.5497, "eval_samples_per_second": 195.88, "eval_steps_per_second": 6.186, "step": 100 }, { "epoch": 1.5989110707803993, "grad_norm": 1.6456304788589478, "learning_rate": 6.470588235294119e-06, "loss": 5.3117, "step": 110 }, { "epoch": 1.7441016333938295, "grad_norm": 2.4888839721679688, "learning_rate": 7.058823529411766e-06, "loss": 5.1868, "step": 120 }, { "epoch": 1.8892921960072595, "grad_norm": 2.137110471725464, "learning_rate": 7.647058823529411e-06, "loss": 5.0505, "step": 130 }, { "epoch": 2.0362976406533577, "grad_norm": 2.2031850814819336, "learning_rate": 8.23529411764706e-06, "loss": 5.0571, "step": 140 }, { "epoch": 2.1814882032667877, "grad_norm": 2.2017667293548584, "learning_rate": 8.823529411764707e-06, "loss": 4.8034, "step": 150 }, { "epoch": 2.3266787658802177, "grad_norm": 2.2998626232147217, "learning_rate": 9.411764705882354e-06, "loss": 4.6938, "step": 160 }, { "epoch": 2.471869328493648, "grad_norm": 2.088256359100342, "learning_rate": 1e-05, "loss": 4.5817, "step": 170 }, { "epoch": 2.617059891107078, "grad_norm": 2.1210474967956543, "learning_rate": 1.0588235294117648e-05, "loss": 4.4561, "step": 180 }, { "epoch": 2.762250453720508, "grad_norm": 2.330993175506592, "learning_rate": 1.1176470588235295e-05, "loss": 4.368, "step": 190 }, { "epoch": 2.907441016333938, "grad_norm": 2.2989509105682373, "learning_rate": 1.1764705882352942e-05, "loss": 4.268, "step": 200 }, { "epoch": 2.907441016333938, "eval_loss": 4.2127814292907715, "eval_runtime": 14.464, "eval_samples_per_second": 197.041, "eval_steps_per_second": 6.222, "step": 200 }, { "epoch": 3.0544464609800364, "grad_norm": 1.6547863483428955, "learning_rate": 1.235294117647059e-05, "loss": 4.2584, "step": 210 }, { "epoch": 3.1996370235934664, "grad_norm": 2.2462234497070312, "learning_rate": 1.2941176470588238e-05, "loss": 4.0844, "step": 220 }, { "epoch": 3.344827586206897, "grad_norm": 2.176753044128418, "learning_rate": 1.3529411764705885e-05, "loss": 3.9943, "step": 230 }, { "epoch": 3.490018148820327, "grad_norm": 2.2812905311584473, "learning_rate": 1.4117647058823532e-05, "loss": 3.9237, "step": 240 }, { "epoch": 3.635208711433757, "grad_norm": 2.2799627780914307, "learning_rate": 1.4705882352941179e-05, "loss": 3.849, "step": 250 }, { "epoch": 3.780399274047187, "grad_norm": 2.3910913467407227, "learning_rate": 1.5294117647058822e-05, "loss": 3.7829, "step": 260 }, { "epoch": 3.925589836660617, "grad_norm": 2.793339490890503, "learning_rate": 1.5882352941176473e-05, "loss": 3.7159, "step": 270 }, { "epoch": 4.072595281306715, "grad_norm": 4.1607465744018555, "learning_rate": 1.647058823529412e-05, "loss": 3.7605, "step": 280 }, { "epoch": 4.217785843920145, "grad_norm": 2.6675713062286377, "learning_rate": 1.7058823529411767e-05, "loss": 3.6045, "step": 290 }, { "epoch": 4.362976406533575, "grad_norm": 3.1864140033721924, "learning_rate": 1.7647058823529414e-05, "loss": 3.5499, "step": 300 }, { "epoch": 4.362976406533575, "eval_loss": 3.5333378314971924, "eval_runtime": 14.5283, "eval_samples_per_second": 196.168, "eval_steps_per_second": 6.195, "step": 300 }, { "epoch": 4.508166969147005, "grad_norm": 3.361107110977173, "learning_rate": 1.823529411764706e-05, "loss": 3.4898, "step": 310 }, { "epoch": 4.653357531760435, "grad_norm": 3.826258897781372, "learning_rate": 1.8823529411764708e-05, "loss": 3.4595, "step": 320 }, { "epoch": 4.798548094373865, "grad_norm": 3.7704880237579346, "learning_rate": 1.9411764705882355e-05, "loss": 3.4113, "step": 330 }, { "epoch": 4.943738656987296, "grad_norm": 3.5223851203918457, "learning_rate": 2e-05, "loss": 3.3734, "step": 340 }, { "epoch": 5.090744101633394, "grad_norm": 3.332577705383301, "learning_rate": 1.9999472984871734e-05, "loss": 3.4133, "step": 350 }, { "epoch": 5.235934664246824, "grad_norm": 4.306556701660156, "learning_rate": 1.9997891995035914e-05, "loss": 3.2955, "step": 360 }, { "epoch": 5.381125226860254, "grad_norm": 4.148169040679932, "learning_rate": 1.999525719713366e-05, "loss": 3.2456, "step": 370 }, { "epoch": 5.526315789473684, "grad_norm": 4.137167930603027, "learning_rate": 1.999156886888064e-05, "loss": 3.2064, "step": 380 }, { "epoch": 5.671506352087114, "grad_norm": 3.123608350753784, "learning_rate": 1.998682739903781e-05, "loss": 3.1841, "step": 390 }, { "epoch": 5.816696914700545, "grad_norm": 4.209367752075195, "learning_rate": 1.9981033287370443e-05, "loss": 3.1453, "step": 400 }, { "epoch": 5.816696914700545, "eval_loss": 3.1412713527679443, "eval_runtime": 14.4671, "eval_samples_per_second": 196.998, "eval_steps_per_second": 6.221, "step": 400 }, { "epoch": 5.961887477313975, "grad_norm": 3.708157539367676, "learning_rate": 1.9974187144595433e-05, "loss": 3.1183, "step": 410 }, { "epoch": 6.108892921960073, "grad_norm": 6.567568778991699, "learning_rate": 1.9966289692316944e-05, "loss": 3.166, "step": 420 }, { "epoch": 6.254083484573503, "grad_norm": 4.073953151702881, "learning_rate": 1.9957341762950346e-05, "loss": 3.0523, "step": 430 }, { "epoch": 6.399274047186933, "grad_norm": 4.078774452209473, "learning_rate": 1.9947344299634464e-05, "loss": 3.018, "step": 440 }, { "epoch": 6.544464609800363, "grad_norm": 2.7505741119384766, "learning_rate": 1.993629835613218e-05, "loss": 2.9874, "step": 450 }, { "epoch": 6.689655172413794, "grad_norm": 4.441661834716797, "learning_rate": 1.992420509671936e-05, "loss": 2.9679, "step": 460 }, { "epoch": 6.834845735027224, "grad_norm": 3.656827211380005, "learning_rate": 1.9911065796062137e-05, "loss": 2.9333, "step": 470 }, { "epoch": 6.980036297640654, "grad_norm": 3.519759178161621, "learning_rate": 1.9896881839082554e-05, "loss": 2.9003, "step": 480 }, { "epoch": 7.127041742286751, "grad_norm": 2.58138108253479, "learning_rate": 1.9881654720812594e-05, "loss": 2.9466, "step": 490 }, { "epoch": 7.272232304900181, "grad_norm": 3.1261541843414307, "learning_rate": 1.9865386046236597e-05, "loss": 2.865, "step": 500 }, { "epoch": 7.272232304900181, "eval_loss": 2.8578507900238037, "eval_runtime": 14.4738, "eval_samples_per_second": 196.908, "eval_steps_per_second": 6.218, "step": 500 }, { "epoch": 7.417422867513611, "grad_norm": 4.163350582122803, "learning_rate": 1.9848077530122083e-05, "loss": 2.8359, "step": 510 }, { "epoch": 7.562613430127042, "grad_norm": 2.9917635917663574, "learning_rate": 1.982973099683902e-05, "loss": 2.7953, "step": 520 }, { "epoch": 7.707803992740472, "grad_norm": 3.293595790863037, "learning_rate": 1.9810348380167527e-05, "loss": 2.7838, "step": 530 }, { "epoch": 7.852994555353902, "grad_norm": 3.7709453105926514, "learning_rate": 1.9789931723094046e-05, "loss": 2.7497, "step": 540 }, { "epoch": 7.998185117967332, "grad_norm": 3.3971333503723145, "learning_rate": 1.9768483177596008e-05, "loss": 2.8238, "step": 550 }, { "epoch": 8.14519056261343, "grad_norm": 4.206657886505127, "learning_rate": 1.9746005004415004e-05, "loss": 2.7141, "step": 560 }, { "epoch": 8.29038112522686, "grad_norm": 3.42154598236084, "learning_rate": 1.9722499572818496e-05, "loss": 2.7061, "step": 570 }, { "epoch": 8.43557168784029, "grad_norm": 2.6466500759124756, "learning_rate": 1.9697969360350098e-05, "loss": 2.6849, "step": 580 }, { "epoch": 8.58076225045372, "grad_norm": 3.4602091312408447, "learning_rate": 1.9672416952568416e-05, "loss": 2.6546, "step": 590 }, { "epoch": 8.72595281306715, "grad_norm": 2.882288694381714, "learning_rate": 1.9645845042774555e-05, "loss": 2.6592, "step": 600 }, { "epoch": 8.72595281306715, "eval_loss": 2.655930280685425, "eval_runtime": 14.4429, "eval_samples_per_second": 197.329, "eval_steps_per_second": 6.231, "step": 600 }, { "epoch": 8.87114337568058, "grad_norm": 2.86531662940979, "learning_rate": 1.961825643172819e-05, "loss": 2.6246, "step": 610 }, { "epoch": 9.01814882032668, "grad_norm": 2.0800743103027344, "learning_rate": 1.9589654027352412e-05, "loss": 2.6887, "step": 620 }, { "epoch": 9.163339382940109, "grad_norm": 2.1428987979888916, "learning_rate": 1.956004084442718e-05, "loss": 2.6034, "step": 630 }, { "epoch": 9.30852994555354, "grad_norm": 2.0337836742401123, "learning_rate": 1.9529420004271568e-05, "loss": 2.6018, "step": 640 }, { "epoch": 9.453720508166969, "grad_norm": 2.25555419921875, "learning_rate": 1.9497794734414782e-05, "loss": 2.5723, "step": 650 }, { "epoch": 9.5989110707804, "grad_norm": 2.281365156173706, "learning_rate": 1.9465168368255946e-05, "loss": 2.5639, "step": 660 }, { "epoch": 9.744101633393829, "grad_norm": 1.9673478603363037, "learning_rate": 1.9431544344712776e-05, "loss": 2.5486, "step": 670 }, { "epoch": 9.88929219600726, "grad_norm": 2.3862695693969727, "learning_rate": 1.9396926207859085e-05, "loss": 2.5319, "step": 680 }, { "epoch": 10.036297640653357, "grad_norm": 2.3560924530029297, "learning_rate": 1.936131760655124e-05, "loss": 2.5827, "step": 690 }, { "epoch": 10.181488203266788, "grad_norm": 3.1034605503082275, "learning_rate": 1.932472229404356e-05, "loss": 2.514, "step": 700 }, { "epoch": 10.181488203266788, "eval_loss": 2.5237531661987305, "eval_runtime": 14.6638, "eval_samples_per_second": 194.356, "eval_steps_per_second": 6.138, "step": 700 }, { "epoch": 10.326678765880217, "grad_norm": 2.507720470428467, "learning_rate": 1.9287144127592704e-05, "loss": 2.5069, "step": 710 }, { "epoch": 10.471869328493648, "grad_norm": 2.375530481338501, "learning_rate": 1.924858706805112e-05, "loss": 2.4914, "step": 720 }, { "epoch": 10.617059891107077, "grad_norm": 2.781869649887085, "learning_rate": 1.920905517944954e-05, "loss": 2.4731, "step": 730 }, { "epoch": 10.762250453720508, "grad_norm": 2.3014352321624756, "learning_rate": 1.9168552628568632e-05, "loss": 2.4679, "step": 740 }, { "epoch": 10.907441016333939, "grad_norm": 2.277211904525757, "learning_rate": 1.9127083684499805e-05, "loss": 2.4708, "step": 750 }, { "epoch": 11.054446460980037, "grad_norm": 2.731947660446167, "learning_rate": 1.9084652718195237e-05, "loss": 2.5219, "step": 760 }, { "epoch": 11.199637023593466, "grad_norm": 2.070516347885132, "learning_rate": 1.9041264202007158e-05, "loss": 2.4407, "step": 770 }, { "epoch": 11.344827586206897, "grad_norm": 2.526477098464966, "learning_rate": 1.8996922709216456e-05, "loss": 2.4408, "step": 780 }, { "epoch": 11.490018148820326, "grad_norm": 2.280230760574341, "learning_rate": 1.8951632913550625e-05, "loss": 2.4217, "step": 790 }, { "epoch": 11.635208711433757, "grad_norm": 2.5102462768554688, "learning_rate": 1.8905399588691165e-05, "loss": 2.4129, "step": 800 }, { "epoch": 11.635208711433757, "eval_loss": 2.4271934032440186, "eval_runtime": 14.4721, "eval_samples_per_second": 196.93, "eval_steps_per_second": 6.219, "step": 800 }, { "epoch": 11.780399274047188, "grad_norm": 2.3329319953918457, "learning_rate": 1.8858227607770398e-05, "loss": 2.4068, "step": 810 }, { "epoch": 11.925589836660617, "grad_norm": 2.9200258255004883, "learning_rate": 1.8810121942857848e-05, "loss": 2.393, "step": 820 }, { "epoch": 12.072595281306715, "grad_norm": 2.835029363632202, "learning_rate": 1.8761087664436137e-05, "loss": 2.4508, "step": 830 }, { "epoch": 12.217785843920145, "grad_norm": 2.7595760822296143, "learning_rate": 1.8711129940866577e-05, "loss": 2.3873, "step": 840 }, { "epoch": 12.362976406533575, "grad_norm": 2.940290689468384, "learning_rate": 1.866025403784439e-05, "loss": 2.3817, "step": 850 }, { "epoch": 12.508166969147005, "grad_norm": 2.936760902404785, "learning_rate": 1.860846531784368e-05, "loss": 2.3642, "step": 860 }, { "epoch": 12.653357531760436, "grad_norm": 3.160423994064331, "learning_rate": 1.8555769239552232e-05, "loss": 2.3586, "step": 870 }, { "epoch": 12.798548094373865, "grad_norm": 2.8737099170684814, "learning_rate": 1.8502171357296144e-05, "loss": 2.3481, "step": 880 }, { "epoch": 12.943738656987296, "grad_norm": 2.706122398376465, "learning_rate": 1.8447677320454367e-05, "loss": 2.3496, "step": 890 }, { "epoch": 13.090744101633394, "grad_norm": 2.1658377647399902, "learning_rate": 1.839229287286327e-05, "loss": 2.3982, "step": 900 }, { "epoch": 13.090744101633394, "eval_loss": 2.3612313270568848, "eval_runtime": 14.4663, "eval_samples_per_second": 197.009, "eval_steps_per_second": 6.221, "step": 900 }, { "epoch": 13.235934664246823, "grad_norm": 2.8399763107299805, "learning_rate": 1.8336023852211197e-05, "loss": 2.3385, "step": 910 }, { "epoch": 13.381125226860254, "grad_norm": 2.8449740409851074, "learning_rate": 1.827887618942318e-05, "loss": 2.329, "step": 920 }, { "epoch": 13.526315789473685, "grad_norm": 3.4475836753845215, "learning_rate": 1.8220855908035783e-05, "loss": 2.3102, "step": 930 }, { "epoch": 13.671506352087114, "grad_norm": 2.820624589920044, "learning_rate": 1.816196912356222e-05, "loss": 2.3118, "step": 940 }, { "epoch": 13.816696914700545, "grad_norm": 2.9867615699768066, "learning_rate": 1.8102222042847735e-05, "loss": 2.3077, "step": 950 }, { "epoch": 13.961887477313974, "grad_norm": 4.323665142059326, "learning_rate": 1.8041620963415418e-05, "loss": 2.3013, "step": 960 }, { "epoch": 14.108892921960072, "grad_norm": 4.4349236488342285, "learning_rate": 1.7980172272802398e-05, "loss": 2.3514, "step": 970 }, { "epoch": 14.254083484573503, "grad_norm": 4.404689311981201, "learning_rate": 1.7917882447886585e-05, "loss": 2.2881, "step": 980 }, { "epoch": 14.399274047186934, "grad_norm": 4.489727020263672, "learning_rate": 1.785475805420399e-05, "loss": 2.2839, "step": 990 }, { "epoch": 14.544464609800363, "grad_norm": 3.8734374046325684, "learning_rate": 1.7790805745256703e-05, "loss": 2.2785, "step": 1000 }, { "epoch": 14.544464609800363, "eval_loss": 2.2907564640045166, "eval_runtime": 14.4981, "eval_samples_per_second": 196.577, "eval_steps_per_second": 6.208, "step": 1000 }, { "epoch": 14.689655172413794, "grad_norm": 4.450066089630127, "learning_rate": 1.772603226181159e-05, "loss": 2.2566, "step": 1010 }, { "epoch": 14.834845735027223, "grad_norm": 4.106595993041992, "learning_rate": 1.766044443118978e-05, "loss": 2.261, "step": 1020 }, { "epoch": 14.980036297640654, "grad_norm": 3.8136234283447266, "learning_rate": 1.7594049166547073e-05, "loss": 2.2535, "step": 1030 }, { "epoch": 15.127041742286751, "grad_norm": 5.098197937011719, "learning_rate": 1.7526853466145248e-05, "loss": 2.3093, "step": 1040 }, { "epoch": 15.272232304900182, "grad_norm": 3.752929210662842, "learning_rate": 1.7458864412614436e-05, "loss": 2.2377, "step": 1050 }, { "epoch": 15.417422867513611, "grad_norm": 3.992673397064209, "learning_rate": 1.7390089172206594e-05, "loss": 2.2416, "step": 1060 }, { "epoch": 15.562613430127042, "grad_norm": 3.857272148132324, "learning_rate": 1.7320534994040148e-05, "loss": 2.2251, "step": 1070 }, { "epoch": 15.707803992740471, "grad_norm": 4.833571434020996, "learning_rate": 1.725020920933593e-05, "loss": 2.2262, "step": 1080 }, { "epoch": 15.852994555353902, "grad_norm": 2.919546127319336, "learning_rate": 1.717911923064442e-05, "loss": 2.2204, "step": 1090 }, { "epoch": 15.998185117967331, "grad_norm": 4.030925273895264, "learning_rate": 1.710727255106447e-05, "loss": 2.274, "step": 1100 }, { "epoch": 15.998185117967331, "eval_loss": 2.2391433715820312, "eval_runtime": 14.4613, "eval_samples_per_second": 197.077, "eval_steps_per_second": 6.223, "step": 1100 } ], "logging_steps": 10, "max_steps": 3400, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.414959764411515e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }