| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 15.998185117967331, | |
| "eval_steps": 100, | |
| "global_step": 1100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.14519056261343014, | |
| "grad_norm": 20.98495864868164, | |
| "learning_rate": 5.882352941176471e-07, | |
| "loss": 9.4496, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.29038112522686027, | |
| "grad_norm": 16.1613712310791, | |
| "learning_rate": 1.1764705882352942e-06, | |
| "loss": 8.8125, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.4355716878402904, | |
| "grad_norm": 6.7023138999938965, | |
| "learning_rate": 1.7647058823529414e-06, | |
| "loss": 7.7501, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5807622504537205, | |
| "grad_norm": 4.253345489501953, | |
| "learning_rate": 2.3529411764705885e-06, | |
| "loss": 7.0378, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.7259528130671506, | |
| "grad_norm": 2.4256012439727783, | |
| "learning_rate": 2.9411764705882355e-06, | |
| "loss": 6.6558, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.8711433756805808, | |
| "grad_norm": 1.664031982421875, | |
| "learning_rate": 3.529411764705883e-06, | |
| "loss": 6.3545, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.0181488203266789, | |
| "grad_norm": 1.206116795539856, | |
| "learning_rate": 4.11764705882353e-06, | |
| "loss": 6.2306, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.1633393829401089, | |
| "grad_norm": 1.1027638912200928, | |
| "learning_rate": 4.705882352941177e-06, | |
| "loss": 5.8555, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.308529945553539, | |
| "grad_norm": 1.2014143466949463, | |
| "learning_rate": 5.294117647058824e-06, | |
| "loss": 5.6277, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.453720508166969, | |
| "grad_norm": 1.3407199382781982, | |
| "learning_rate": 5.882352941176471e-06, | |
| "loss": 5.4611, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.453720508166969, | |
| "eval_loss": 5.380606174468994, | |
| "eval_runtime": 14.5497, | |
| "eval_samples_per_second": 195.88, | |
| "eval_steps_per_second": 6.186, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.5989110707803993, | |
| "grad_norm": 1.6456304788589478, | |
| "learning_rate": 6.470588235294119e-06, | |
| "loss": 5.3117, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.7441016333938295, | |
| "grad_norm": 2.4888839721679688, | |
| "learning_rate": 7.058823529411766e-06, | |
| "loss": 5.1868, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.8892921960072595, | |
| "grad_norm": 2.137110471725464, | |
| "learning_rate": 7.647058823529411e-06, | |
| "loss": 5.0505, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.0362976406533577, | |
| "grad_norm": 2.2031850814819336, | |
| "learning_rate": 8.23529411764706e-06, | |
| "loss": 5.0571, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.1814882032667877, | |
| "grad_norm": 2.2017667293548584, | |
| "learning_rate": 8.823529411764707e-06, | |
| "loss": 4.8034, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.3266787658802177, | |
| "grad_norm": 2.2998626232147217, | |
| "learning_rate": 9.411764705882354e-06, | |
| "loss": 4.6938, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.471869328493648, | |
| "grad_norm": 2.088256359100342, | |
| "learning_rate": 1e-05, | |
| "loss": 4.5817, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.617059891107078, | |
| "grad_norm": 2.1210474967956543, | |
| "learning_rate": 1.0588235294117648e-05, | |
| "loss": 4.4561, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.762250453720508, | |
| "grad_norm": 2.330993175506592, | |
| "learning_rate": 1.1176470588235295e-05, | |
| "loss": 4.368, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.907441016333938, | |
| "grad_norm": 2.2989509105682373, | |
| "learning_rate": 1.1764705882352942e-05, | |
| "loss": 4.268, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.907441016333938, | |
| "eval_loss": 4.2127814292907715, | |
| "eval_runtime": 14.464, | |
| "eval_samples_per_second": 197.041, | |
| "eval_steps_per_second": 6.222, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 3.0544464609800364, | |
| "grad_norm": 1.6547863483428955, | |
| "learning_rate": 1.235294117647059e-05, | |
| "loss": 4.2584, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 3.1996370235934664, | |
| "grad_norm": 2.2462234497070312, | |
| "learning_rate": 1.2941176470588238e-05, | |
| "loss": 4.0844, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 3.344827586206897, | |
| "grad_norm": 2.176753044128418, | |
| "learning_rate": 1.3529411764705885e-05, | |
| "loss": 3.9943, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 3.490018148820327, | |
| "grad_norm": 2.2812905311584473, | |
| "learning_rate": 1.4117647058823532e-05, | |
| "loss": 3.9237, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.635208711433757, | |
| "grad_norm": 2.2799627780914307, | |
| "learning_rate": 1.4705882352941179e-05, | |
| "loss": 3.849, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.780399274047187, | |
| "grad_norm": 2.3910913467407227, | |
| "learning_rate": 1.5294117647058822e-05, | |
| "loss": 3.7829, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.925589836660617, | |
| "grad_norm": 2.793339490890503, | |
| "learning_rate": 1.5882352941176473e-05, | |
| "loss": 3.7159, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 4.072595281306715, | |
| "grad_norm": 4.1607465744018555, | |
| "learning_rate": 1.647058823529412e-05, | |
| "loss": 3.7605, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 4.217785843920145, | |
| "grad_norm": 2.6675713062286377, | |
| "learning_rate": 1.7058823529411767e-05, | |
| "loss": 3.6045, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 4.362976406533575, | |
| "grad_norm": 3.1864140033721924, | |
| "learning_rate": 1.7647058823529414e-05, | |
| "loss": 3.5499, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.362976406533575, | |
| "eval_loss": 3.5333378314971924, | |
| "eval_runtime": 14.5283, | |
| "eval_samples_per_second": 196.168, | |
| "eval_steps_per_second": 6.195, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.508166969147005, | |
| "grad_norm": 3.361107110977173, | |
| "learning_rate": 1.823529411764706e-05, | |
| "loss": 3.4898, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 4.653357531760435, | |
| "grad_norm": 3.826258897781372, | |
| "learning_rate": 1.8823529411764708e-05, | |
| "loss": 3.4595, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.798548094373865, | |
| "grad_norm": 3.7704880237579346, | |
| "learning_rate": 1.9411764705882355e-05, | |
| "loss": 3.4113, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.943738656987296, | |
| "grad_norm": 3.5223851203918457, | |
| "learning_rate": 2e-05, | |
| "loss": 3.3734, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 5.090744101633394, | |
| "grad_norm": 3.332577705383301, | |
| "learning_rate": 1.9999472984871734e-05, | |
| "loss": 3.4133, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 5.235934664246824, | |
| "grad_norm": 4.306556701660156, | |
| "learning_rate": 1.9997891995035914e-05, | |
| "loss": 3.2955, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 5.381125226860254, | |
| "grad_norm": 4.148169040679932, | |
| "learning_rate": 1.999525719713366e-05, | |
| "loss": 3.2456, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 5.526315789473684, | |
| "grad_norm": 4.137167930603027, | |
| "learning_rate": 1.999156886888064e-05, | |
| "loss": 3.2064, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 5.671506352087114, | |
| "grad_norm": 3.123608350753784, | |
| "learning_rate": 1.998682739903781e-05, | |
| "loss": 3.1841, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.816696914700545, | |
| "grad_norm": 4.209367752075195, | |
| "learning_rate": 1.9981033287370443e-05, | |
| "loss": 3.1453, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.816696914700545, | |
| "eval_loss": 3.1412713527679443, | |
| "eval_runtime": 14.4671, | |
| "eval_samples_per_second": 196.998, | |
| "eval_steps_per_second": 6.221, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.961887477313975, | |
| "grad_norm": 3.708157539367676, | |
| "learning_rate": 1.9974187144595433e-05, | |
| "loss": 3.1183, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 6.108892921960073, | |
| "grad_norm": 6.567568778991699, | |
| "learning_rate": 1.9966289692316944e-05, | |
| "loss": 3.166, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 6.254083484573503, | |
| "grad_norm": 4.073953151702881, | |
| "learning_rate": 1.9957341762950346e-05, | |
| "loss": 3.0523, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 6.399274047186933, | |
| "grad_norm": 4.078774452209473, | |
| "learning_rate": 1.9947344299634464e-05, | |
| "loss": 3.018, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 6.544464609800363, | |
| "grad_norm": 2.7505741119384766, | |
| "learning_rate": 1.993629835613218e-05, | |
| "loss": 2.9874, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.689655172413794, | |
| "grad_norm": 4.441661834716797, | |
| "learning_rate": 1.992420509671936e-05, | |
| "loss": 2.9679, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 6.834845735027224, | |
| "grad_norm": 3.656827211380005, | |
| "learning_rate": 1.9911065796062137e-05, | |
| "loss": 2.9333, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 6.980036297640654, | |
| "grad_norm": 3.519759178161621, | |
| "learning_rate": 1.9896881839082554e-05, | |
| "loss": 2.9003, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 7.127041742286751, | |
| "grad_norm": 2.58138108253479, | |
| "learning_rate": 1.9881654720812594e-05, | |
| "loss": 2.9466, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 7.272232304900181, | |
| "grad_norm": 3.1261541843414307, | |
| "learning_rate": 1.9865386046236597e-05, | |
| "loss": 2.865, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 7.272232304900181, | |
| "eval_loss": 2.8578507900238037, | |
| "eval_runtime": 14.4738, | |
| "eval_samples_per_second": 196.908, | |
| "eval_steps_per_second": 6.218, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 7.417422867513611, | |
| "grad_norm": 4.163350582122803, | |
| "learning_rate": 1.9848077530122083e-05, | |
| "loss": 2.8359, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 7.562613430127042, | |
| "grad_norm": 2.9917635917663574, | |
| "learning_rate": 1.982973099683902e-05, | |
| "loss": 2.7953, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 7.707803992740472, | |
| "grad_norm": 3.293595790863037, | |
| "learning_rate": 1.9810348380167527e-05, | |
| "loss": 2.7838, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 7.852994555353902, | |
| "grad_norm": 3.7709453105926514, | |
| "learning_rate": 1.9789931723094046e-05, | |
| "loss": 2.7497, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 7.998185117967332, | |
| "grad_norm": 3.3971333503723145, | |
| "learning_rate": 1.9768483177596008e-05, | |
| "loss": 2.8238, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 8.14519056261343, | |
| "grad_norm": 4.206657886505127, | |
| "learning_rate": 1.9746005004415004e-05, | |
| "loss": 2.7141, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 8.29038112522686, | |
| "grad_norm": 3.42154598236084, | |
| "learning_rate": 1.9722499572818496e-05, | |
| "loss": 2.7061, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 8.43557168784029, | |
| "grad_norm": 2.6466500759124756, | |
| "learning_rate": 1.9697969360350098e-05, | |
| "loss": 2.6849, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 8.58076225045372, | |
| "grad_norm": 3.4602091312408447, | |
| "learning_rate": 1.9672416952568416e-05, | |
| "loss": 2.6546, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 8.72595281306715, | |
| "grad_norm": 2.882288694381714, | |
| "learning_rate": 1.9645845042774555e-05, | |
| "loss": 2.6592, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 8.72595281306715, | |
| "eval_loss": 2.655930280685425, | |
| "eval_runtime": 14.4429, | |
| "eval_samples_per_second": 197.329, | |
| "eval_steps_per_second": 6.231, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 8.87114337568058, | |
| "grad_norm": 2.86531662940979, | |
| "learning_rate": 1.961825643172819e-05, | |
| "loss": 2.6246, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 9.01814882032668, | |
| "grad_norm": 2.0800743103027344, | |
| "learning_rate": 1.9589654027352412e-05, | |
| "loss": 2.6887, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 9.163339382940109, | |
| "grad_norm": 2.1428987979888916, | |
| "learning_rate": 1.956004084442718e-05, | |
| "loss": 2.6034, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 9.30852994555354, | |
| "grad_norm": 2.0337836742401123, | |
| "learning_rate": 1.9529420004271568e-05, | |
| "loss": 2.6018, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 9.453720508166969, | |
| "grad_norm": 2.25555419921875, | |
| "learning_rate": 1.9497794734414782e-05, | |
| "loss": 2.5723, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 9.5989110707804, | |
| "grad_norm": 2.281365156173706, | |
| "learning_rate": 1.9465168368255946e-05, | |
| "loss": 2.5639, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 9.744101633393829, | |
| "grad_norm": 1.9673478603363037, | |
| "learning_rate": 1.9431544344712776e-05, | |
| "loss": 2.5486, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 9.88929219600726, | |
| "grad_norm": 2.3862695693969727, | |
| "learning_rate": 1.9396926207859085e-05, | |
| "loss": 2.5319, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 10.036297640653357, | |
| "grad_norm": 2.3560924530029297, | |
| "learning_rate": 1.936131760655124e-05, | |
| "loss": 2.5827, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 10.181488203266788, | |
| "grad_norm": 3.1034605503082275, | |
| "learning_rate": 1.932472229404356e-05, | |
| "loss": 2.514, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 10.181488203266788, | |
| "eval_loss": 2.5237531661987305, | |
| "eval_runtime": 14.6638, | |
| "eval_samples_per_second": 194.356, | |
| "eval_steps_per_second": 6.138, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 10.326678765880217, | |
| "grad_norm": 2.507720470428467, | |
| "learning_rate": 1.9287144127592704e-05, | |
| "loss": 2.5069, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 10.471869328493648, | |
| "grad_norm": 2.375530481338501, | |
| "learning_rate": 1.924858706805112e-05, | |
| "loss": 2.4914, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 10.617059891107077, | |
| "grad_norm": 2.781869649887085, | |
| "learning_rate": 1.920905517944954e-05, | |
| "loss": 2.4731, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 10.762250453720508, | |
| "grad_norm": 2.3014352321624756, | |
| "learning_rate": 1.9168552628568632e-05, | |
| "loss": 2.4679, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 10.907441016333939, | |
| "grad_norm": 2.277211904525757, | |
| "learning_rate": 1.9127083684499805e-05, | |
| "loss": 2.4708, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 11.054446460980037, | |
| "grad_norm": 2.731947660446167, | |
| "learning_rate": 1.9084652718195237e-05, | |
| "loss": 2.5219, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 11.199637023593466, | |
| "grad_norm": 2.070516347885132, | |
| "learning_rate": 1.9041264202007158e-05, | |
| "loss": 2.4407, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 11.344827586206897, | |
| "grad_norm": 2.526477098464966, | |
| "learning_rate": 1.8996922709216456e-05, | |
| "loss": 2.4408, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 11.490018148820326, | |
| "grad_norm": 2.280230760574341, | |
| "learning_rate": 1.8951632913550625e-05, | |
| "loss": 2.4217, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 11.635208711433757, | |
| "grad_norm": 2.5102462768554688, | |
| "learning_rate": 1.8905399588691165e-05, | |
| "loss": 2.4129, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 11.635208711433757, | |
| "eval_loss": 2.4271934032440186, | |
| "eval_runtime": 14.4721, | |
| "eval_samples_per_second": 196.93, | |
| "eval_steps_per_second": 6.219, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 11.780399274047188, | |
| "grad_norm": 2.3329319953918457, | |
| "learning_rate": 1.8858227607770398e-05, | |
| "loss": 2.4068, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 11.925589836660617, | |
| "grad_norm": 2.9200258255004883, | |
| "learning_rate": 1.8810121942857848e-05, | |
| "loss": 2.393, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 12.072595281306715, | |
| "grad_norm": 2.835029363632202, | |
| "learning_rate": 1.8761087664436137e-05, | |
| "loss": 2.4508, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 12.217785843920145, | |
| "grad_norm": 2.7595760822296143, | |
| "learning_rate": 1.8711129940866577e-05, | |
| "loss": 2.3873, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 12.362976406533575, | |
| "grad_norm": 2.940290689468384, | |
| "learning_rate": 1.866025403784439e-05, | |
| "loss": 2.3817, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 12.508166969147005, | |
| "grad_norm": 2.936760902404785, | |
| "learning_rate": 1.860846531784368e-05, | |
| "loss": 2.3642, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 12.653357531760436, | |
| "grad_norm": 3.160423994064331, | |
| "learning_rate": 1.8555769239552232e-05, | |
| "loss": 2.3586, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 12.798548094373865, | |
| "grad_norm": 2.8737099170684814, | |
| "learning_rate": 1.8502171357296144e-05, | |
| "loss": 2.3481, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 12.943738656987296, | |
| "grad_norm": 2.706122398376465, | |
| "learning_rate": 1.8447677320454367e-05, | |
| "loss": 2.3496, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 13.090744101633394, | |
| "grad_norm": 2.1658377647399902, | |
| "learning_rate": 1.839229287286327e-05, | |
| "loss": 2.3982, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 13.090744101633394, | |
| "eval_loss": 2.3612313270568848, | |
| "eval_runtime": 14.4663, | |
| "eval_samples_per_second": 197.009, | |
| "eval_steps_per_second": 6.221, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 13.235934664246823, | |
| "grad_norm": 2.8399763107299805, | |
| "learning_rate": 1.8336023852211197e-05, | |
| "loss": 2.3385, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 13.381125226860254, | |
| "grad_norm": 2.8449740409851074, | |
| "learning_rate": 1.827887618942318e-05, | |
| "loss": 2.329, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 13.526315789473685, | |
| "grad_norm": 3.4475836753845215, | |
| "learning_rate": 1.8220855908035783e-05, | |
| "loss": 2.3102, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 13.671506352087114, | |
| "grad_norm": 2.820624589920044, | |
| "learning_rate": 1.816196912356222e-05, | |
| "loss": 2.3118, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 13.816696914700545, | |
| "grad_norm": 2.9867615699768066, | |
| "learning_rate": 1.8102222042847735e-05, | |
| "loss": 2.3077, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 13.961887477313974, | |
| "grad_norm": 4.323665142059326, | |
| "learning_rate": 1.8041620963415418e-05, | |
| "loss": 2.3013, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 14.108892921960072, | |
| "grad_norm": 4.4349236488342285, | |
| "learning_rate": 1.7980172272802398e-05, | |
| "loss": 2.3514, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 14.254083484573503, | |
| "grad_norm": 4.404689311981201, | |
| "learning_rate": 1.7917882447886585e-05, | |
| "loss": 2.2881, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 14.399274047186934, | |
| "grad_norm": 4.489727020263672, | |
| "learning_rate": 1.785475805420399e-05, | |
| "loss": 2.2839, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 14.544464609800363, | |
| "grad_norm": 3.8734374046325684, | |
| "learning_rate": 1.7790805745256703e-05, | |
| "loss": 2.2785, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 14.544464609800363, | |
| "eval_loss": 2.2907564640045166, | |
| "eval_runtime": 14.4981, | |
| "eval_samples_per_second": 196.577, | |
| "eval_steps_per_second": 6.208, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 14.689655172413794, | |
| "grad_norm": 4.450066089630127, | |
| "learning_rate": 1.772603226181159e-05, | |
| "loss": 2.2566, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 14.834845735027223, | |
| "grad_norm": 4.106595993041992, | |
| "learning_rate": 1.766044443118978e-05, | |
| "loss": 2.261, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 14.980036297640654, | |
| "grad_norm": 3.8136234283447266, | |
| "learning_rate": 1.7594049166547073e-05, | |
| "loss": 2.2535, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 15.127041742286751, | |
| "grad_norm": 5.098197937011719, | |
| "learning_rate": 1.7526853466145248e-05, | |
| "loss": 2.3093, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 15.272232304900182, | |
| "grad_norm": 3.752929210662842, | |
| "learning_rate": 1.7458864412614436e-05, | |
| "loss": 2.2377, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 15.417422867513611, | |
| "grad_norm": 3.992673397064209, | |
| "learning_rate": 1.7390089172206594e-05, | |
| "loss": 2.2416, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 15.562613430127042, | |
| "grad_norm": 3.857272148132324, | |
| "learning_rate": 1.7320534994040148e-05, | |
| "loss": 2.2251, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 15.707803992740471, | |
| "grad_norm": 4.833571434020996, | |
| "learning_rate": 1.725020920933593e-05, | |
| "loss": 2.2262, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 15.852994555353902, | |
| "grad_norm": 2.919546127319336, | |
| "learning_rate": 1.717911923064442e-05, | |
| "loss": 2.2204, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 15.998185117967331, | |
| "grad_norm": 4.030925273895264, | |
| "learning_rate": 1.710727255106447e-05, | |
| "loss": 2.274, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 15.998185117967331, | |
| "eval_loss": 2.2391433715820312, | |
| "eval_runtime": 14.4613, | |
| "eval_samples_per_second": 197.077, | |
| "eval_steps_per_second": 6.223, | |
| "step": 1100 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.414959764411515e+19, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |