| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 1480, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013527223537368955, |
| "grad_norm": 110.5, |
| "learning_rate": 1.2162162162162163e-07, |
| "loss": 2.9783, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02705444707473791, |
| "grad_norm": 113.0, |
| "learning_rate": 2.567567567567567e-07, |
| "loss": 3.0679, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.040581670612106865, |
| "grad_norm": 106.0, |
| "learning_rate": 3.918918918918919e-07, |
| "loss": 3.0966, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05410889414947582, |
| "grad_norm": 102.5, |
| "learning_rate": 5.270270270270269e-07, |
| "loss": 2.9748, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06763611768684477, |
| "grad_norm": 95.5, |
| "learning_rate": 6.621621621621622e-07, |
| "loss": 2.9264, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08116334122421373, |
| "grad_norm": 85.5, |
| "learning_rate": 7.972972972972972e-07, |
| "loss": 2.7277, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09469056476158269, |
| "grad_norm": 85.5, |
| "learning_rate": 9.324324324324324e-07, |
| "loss": 2.6271, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10821778829895164, |
| "grad_norm": 96.0, |
| "learning_rate": 1.0675675675675675e-06, |
| "loss": 2.4285, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1217450118363206, |
| "grad_norm": 70.0, |
| "learning_rate": 1.2027027027027026e-06, |
| "loss": 2.0623, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.13527223537368954, |
| "grad_norm": 54.5, |
| "learning_rate": 1.3378378378378378e-06, |
| "loss": 1.7022, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1487994589110585, |
| "grad_norm": 31.5, |
| "learning_rate": 1.472972972972973e-06, |
| "loss": 1.4259, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.16232668244842746, |
| "grad_norm": 19.375, |
| "learning_rate": 1.608108108108108e-06, |
| "loss": 1.1437, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.17585390598579642, |
| "grad_norm": 13.0, |
| "learning_rate": 1.743243243243243e-06, |
| "loss": 0.9886, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.18938112952316538, |
| "grad_norm": 9.625, |
| "learning_rate": 1.8783783783783783e-06, |
| "loss": 0.8594, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2029083530605343, |
| "grad_norm": 8.5, |
| "learning_rate": 1.9999972186150605e-06, |
| "loss": 0.7673, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.21643557659790327, |
| "grad_norm": 5.78125, |
| "learning_rate": 1.9996634711432784e-06, |
| "loss": 0.7322, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.22996280013527223, |
| "grad_norm": 6.09375, |
| "learning_rate": 1.9987736594069414e-06, |
| "loss": 0.6586, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2434900236726412, |
| "grad_norm": 5.40625, |
| "learning_rate": 1.997328278365126e-06, |
| "loss": 0.6214, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2570172472100101, |
| "grad_norm": 7.15625, |
| "learning_rate": 1.9953281320131465e-06, |
| "loss": 0.5825, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2705444707473791, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.992774332935329e-06, |
| "loss": 0.5826, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28407169428474804, |
| "grad_norm": 4.875, |
| "learning_rate": 1.989668301686138e-06, |
| "loss": 0.5562, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.297598917822117, |
| "grad_norm": 6.0625, |
| "learning_rate": 1.9860117659999875e-06, |
| "loss": 0.5323, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.31112614135948596, |
| "grad_norm": 5.40625, |
| "learning_rate": 1.981806759830189e-06, |
| "loss": 0.5352, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3246533648968549, |
| "grad_norm": 5.3125, |
| "learning_rate": 1.9770556222175607e-06, |
| "loss": 0.5354, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3381805884342239, |
| "grad_norm": 4.125, |
| "learning_rate": 1.9717609959893315e-06, |
| "loss": 0.4923, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.35170781197159284, |
| "grad_norm": 4.6875, |
| "learning_rate": 1.965925826289068e-06, |
| "loss": 0.4907, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3652350355089618, |
| "grad_norm": 5.34375, |
| "learning_rate": 1.9595533589384306e-06, |
| "loss": 0.4875, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.37876225904633076, |
| "grad_norm": 5.09375, |
| "learning_rate": 1.952647138631682e-06, |
| "loss": 0.4771, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3922894825836997, |
| "grad_norm": 6.1875, |
| "learning_rate": 1.945211006963945e-06, |
| "loss": 0.4683, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4058167061210686, |
| "grad_norm": 4.96875, |
| "learning_rate": 1.937249100294311e-06, |
| "loss": 0.4597, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4193439296584376, |
| "grad_norm": 5.6875, |
| "learning_rate": 1.9287658474449836e-06, |
| "loss": 0.4553, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.43287115319580655, |
| "grad_norm": 4.46875, |
| "learning_rate": 1.9197659672377386e-06, |
| "loss": 0.4499, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4463983767331755, |
| "grad_norm": 5.25, |
| "learning_rate": 1.9102544658690745e-06, |
| "loss": 0.4559, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.45992560027054447, |
| "grad_norm": 4.625, |
| "learning_rate": 1.9002366341255067e-06, |
| "loss": 0.4354, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4734528238079134, |
| "grad_norm": 5.8125, |
| "learning_rate": 1.8897180444405612e-06, |
| "loss": 0.4524, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4869800473452824, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.8787045477950988e-06, |
| "loss": 0.4259, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5005072708826513, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.8672022704627e-06, |
| "loss": 0.4254, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5140344944200202, |
| "grad_norm": 5.875, |
| "learning_rate": 1.8552176106019153e-06, |
| "loss": 0.43, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5275617179573893, |
| "grad_norm": 6.65625, |
| "learning_rate": 1.8427572346972803e-06, |
| "loss": 0.415, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5410889414947582, |
| "grad_norm": 5.4375, |
| "learning_rate": 1.829828073851075e-06, |
| "loss": 0.4145, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5546161650321272, |
| "grad_norm": 6.90625, |
| "learning_rate": 1.8164373199278855e-06, |
| "loss": 0.4049, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5681433885694961, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.8025924215541229e-06, |
| "loss": 0.4087, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5816706121068651, |
| "grad_norm": 5.8125, |
| "learning_rate": 1.7883010799747095e-06, |
| "loss": 0.4131, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.595197835644234, |
| "grad_norm": 8.0, |
| "learning_rate": 1.7735712447692536e-06, |
| "loss": 0.4282, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.608725059181603, |
| "grad_norm": 4.46875, |
| "learning_rate": 1.7584111094300824e-06, |
| "loss": 0.4094, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6222522827189719, |
| "grad_norm": 4.84375, |
| "learning_rate": 1.7428291068045998e-06, |
| "loss": 0.3987, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6357795062563408, |
| "grad_norm": 5.125, |
| "learning_rate": 1.726833904404504e-06, |
| "loss": 0.3932, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6493067297937098, |
| "grad_norm": 6.25, |
| "learning_rate": 1.7104343995844715e-06, |
| "loss": 0.4026, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6628339533310788, |
| "grad_norm": 5.25, |
| "learning_rate": 1.6936397145929877e-06, |
| "loss": 0.4307, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6763611768684478, |
| "grad_norm": 5.53125, |
| "learning_rate": 1.676459191498087e-06, |
| "loss": 0.3874, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6898884004058167, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.6589023869908107e-06, |
| "loss": 0.3754, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7034156239431857, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.6409790670692858e-06, |
| "loss": 0.3837, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7169428474805546, |
| "grad_norm": 6.3125, |
| "learning_rate": 1.6226992016063724e-06, |
| "loss": 0.3889, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7304700710179236, |
| "grad_norm": 6.125, |
| "learning_rate": 1.6040729588039088e-06, |
| "loss": 0.3778, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7439972945552925, |
| "grad_norm": 7.5, |
| "learning_rate": 1.5851106995366337e-06, |
| "loss": 0.3654, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7575245180926615, |
| "grad_norm": 7.125, |
| "learning_rate": 1.5658229715889345e-06, |
| "loss": 0.3725, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7710517416300304, |
| "grad_norm": 5.84375, |
| "learning_rate": 1.5462205037876272e-06, |
| "loss": 0.3743, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7845789651673994, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.526314200034031e-06, |
| "loss": 0.39, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7981061887047683, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.5061151332386564e-06, |
| "loss": 0.3562, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8116334122421373, |
| "grad_norm": 6.625, |
| "learning_rate": 1.4856345391618827e-06, |
| "loss": 0.3658, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8251606357795063, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.4648838101640517e-06, |
| "loss": 0.3547, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8386878593168752, |
| "grad_norm": 5.75, |
| "learning_rate": 1.443874488868448e-06, |
| "loss": 0.3692, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8522150828542442, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.4226182617406994e-06, |
| "loss": 0.3878, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8657423063916131, |
| "grad_norm": 6.875, |
| "learning_rate": 1.4011269525881635e-06, |
| "loss": 0.371, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.8792695299289821, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.379412515982917e-06, |
| "loss": 0.3665, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.892796753466351, |
| "grad_norm": 9.6875, |
| "learning_rate": 1.3574870306120077e-06, |
| "loss": 0.3801, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.90632397700372, |
| "grad_norm": 9.5, |
| "learning_rate": 1.335362692558667e-06, |
| "loss": 0.3746, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.9198512005410889, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.3130518085182222e-06, |
| "loss": 0.3675, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.933378424078458, |
| "grad_norm": 7.21875, |
| "learning_rate": 1.2905667889524769e-06, |
| "loss": 0.3494, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9469056476158269, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.2679201411863749e-06, |
| "loss": 0.3706, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9604328711531958, |
| "grad_norm": 6.0, |
| "learning_rate": 1.245124462450783e-06, |
| "loss": 0.3664, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.9739600946905648, |
| "grad_norm": 5.8125, |
| "learning_rate": 1.2221924328752616e-06, |
| "loss": 0.3694, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.9874873182279337, |
| "grad_norm": 7.65625, |
| "learning_rate": 1.199136808434725e-06, |
| "loss": 0.3601, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 13.0625, |
| "learning_rate": 1.1759704138539119e-06, |
| "loss": 0.3654, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.013527223537369, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.1527061354736127e-06, |
| "loss": 0.3354, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.0270544470747378, |
| "grad_norm": 6.21875, |
| "learning_rate": 1.1293569140826237e-06, |
| "loss": 0.3584, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.040581670612107, |
| "grad_norm": 7.625, |
| "learning_rate": 1.105935737719416e-06, |
| "loss": 0.3522, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0541088941494758, |
| "grad_norm": 8.125, |
| "learning_rate": 1.082455634447518e-06, |
| "loss": 0.3658, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.0676361176868447, |
| "grad_norm": 6.46875, |
| "learning_rate": 1.0589296651086376e-06, |
| "loss": 0.3669, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.0811633412242136, |
| "grad_norm": 6.0, |
| "learning_rate": 1.0353709160575487e-06, |
| "loss": 0.3513, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.0946905647615828, |
| "grad_norm": 5.59375, |
| "learning_rate": 1.011792491882789e-06, |
| "loss": 0.3563, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.1082177882989517, |
| "grad_norm": 6.3125, |
| "learning_rate": 9.882075081172112e-07, |
| "loss": 0.3527, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.1217450118363206, |
| "grad_norm": 6.09375, |
| "learning_rate": 9.646290839424514e-07, |
| "loss": 0.351, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.1352722353736895, |
| "grad_norm": 7.40625, |
| "learning_rate": 9.410703348913626e-07, |
| "loss": 0.363, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1487994589110584, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.17544365552482e-07, |
| "loss": 0.3461, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1623266824484275, |
| "grad_norm": 9.1875, |
| "learning_rate": 8.94064262280584e-07, |
| "loss": 0.341, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.1758539059857964, |
| "grad_norm": 6.75, |
| "learning_rate": 8.706430859173763e-07, |
| "loss": 0.3629, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.1893811295231653, |
| "grad_norm": 5.84375, |
| "learning_rate": 8.472938645263874e-07, |
| "loss": 0.3517, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.2029083530605342, |
| "grad_norm": 6.5625, |
| "learning_rate": 8.24029586146088e-07, |
| "loss": 0.3583, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.2164355765979034, |
| "grad_norm": 6.28125, |
| "learning_rate": 8.00863191565275e-07, |
| "loss": 0.3587, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.2299628001352723, |
| "grad_norm": 5.625, |
| "learning_rate": 7.778075671247385e-07, |
| "loss": 0.3413, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.2434900236726412, |
| "grad_norm": 6.5625, |
| "learning_rate": 7.548755375492172e-07, |
| "loss": 0.3496, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.25701724721001, |
| "grad_norm": 5.9375, |
| "learning_rate": 7.320798588136253e-07, |
| "loss": 0.346, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.270544470747379, |
| "grad_norm": 6.0625, |
| "learning_rate": 7.094332110475234e-07, |
| "loss": 0.3389, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.284071694284748, |
| "grad_norm": 7.34375, |
| "learning_rate": 6.869481914817779e-07, |
| "loss": 0.3491, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.297598917822117, |
| "grad_norm": 7.21875, |
| "learning_rate": 6.646373074413329e-07, |
| "loss": 0.3394, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.311126141359486, |
| "grad_norm": 9.5, |
| "learning_rate": 6.425129693879925e-07, |
| "loss": 0.3489, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.324653364896855, |
| "grad_norm": 6.375, |
| "learning_rate": 6.205874840170832e-07, |
| "loss": 0.3424, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.338180588434224, |
| "grad_norm": 6.96875, |
| "learning_rate": 5.988730474118367e-07, |
| "loss": 0.3559, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.3517078119715928, |
| "grad_norm": 5.5625, |
| "learning_rate": 5.773817382593007e-07, |
| "loss": 0.3402, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.3652350355089617, |
| "grad_norm": 8.25, |
| "learning_rate": 5.561255111315523e-07, |
| "loss": 0.3494, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.3787622590463307, |
| "grad_norm": 6.40625, |
| "learning_rate": 5.351161898359484e-07, |
| "loss": 0.3321, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.3922894825836998, |
| "grad_norm": 9.5625, |
| "learning_rate": 5.143654608381171e-07, |
| "loss": 0.3432, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.4058167061210687, |
| "grad_norm": 6.90625, |
| "learning_rate": 4.938848667613436e-07, |
| "loss": 0.3378, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.4193439296584376, |
| "grad_norm": 7.6875, |
| "learning_rate": 4.7368579996596903e-07, |
| "loss": 0.3529, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.4328711531958065, |
| "grad_norm": 6.3125, |
| "learning_rate": 4.5377949621237255e-07, |
| "loss": 0.3541, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.4463983767331756, |
| "grad_norm": 6.0, |
| "learning_rate": 4.341770284110655e-07, |
| "loss": 0.3264, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.4599256002705445, |
| "grad_norm": 6.3125, |
| "learning_rate": 4.1488930046336623e-07, |
| "loss": 0.3408, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.4734528238079134, |
| "grad_norm": 8.25, |
| "learning_rate": 3.9592704119609124e-07, |
| "loss": 0.363, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.4869800473452823, |
| "grad_norm": 6.59375, |
| "learning_rate": 3.773007983936275e-07, |
| "loss": 0.3512, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.5005072708826512, |
| "grad_norm": 9.3125, |
| "learning_rate": 3.5902093293071423e-07, |
| "loss": 0.335, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.5140344944200201, |
| "grad_norm": 6.3125, |
| "learning_rate": 3.4109761300918917e-07, |
| "loss": 0.3461, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.5275617179573893, |
| "grad_norm": 6.53125, |
| "learning_rate": 3.2354080850191325e-07, |
| "loss": 0.3425, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.5410889414947582, |
| "grad_norm": 6.625, |
| "learning_rate": 3.0636028540701233e-07, |
| "loss": 0.34, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.5546161650321273, |
| "grad_norm": 6.75, |
| "learning_rate": 2.895656004155288e-07, |
| "loss": 0.3464, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.5681433885694962, |
| "grad_norm": 5.875, |
| "learning_rate": 2.7316609559549563e-07, |
| "loss": 0.341, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.581670612106865, |
| "grad_norm": 8.1875, |
| "learning_rate": 2.571708931954e-07, |
| "loss": 0.3426, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.595197835644234, |
| "grad_norm": 6.375, |
| "learning_rate": 2.4158889056991773e-07, |
| "loss": 0.3494, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.608725059181603, |
| "grad_norm": 6.625, |
| "learning_rate": 2.2642875523074613e-07, |
| "loss": 0.3509, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.6222522827189718, |
| "grad_norm": 5.9375, |
| "learning_rate": 2.1169892002529044e-07, |
| "loss": 0.3508, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.6357795062563407, |
| "grad_norm": 7.75, |
| "learning_rate": 1.9740757844587708e-07, |
| "loss": 0.3542, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.6493067297937098, |
| "grad_norm": 5.875, |
| "learning_rate": 1.835626800721144e-07, |
| "loss": 0.3559, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.6628339533310788, |
| "grad_norm": 5.75, |
| "learning_rate": 1.7017192614892506e-07, |
| "loss": 0.33, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.6763611768684479, |
| "grad_norm": 6.8125, |
| "learning_rate": 1.5724276530271962e-07, |
| "loss": 0.3553, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.6898884004058168, |
| "grad_norm": 6.625, |
| "learning_rate": 1.447823893980845e-07, |
| "loss": 0.3394, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.7034156239431857, |
| "grad_norm": 6.75, |
| "learning_rate": 1.3279772953729983e-07, |
| "loss": 0.3379, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.7169428474805546, |
| "grad_norm": 8.25, |
| "learning_rate": 1.21295452204901e-07, |
| "loss": 0.3532, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.7304700710179235, |
| "grad_norm": 6.5625, |
| "learning_rate": 1.1028195555943876e-07, |
| "loss": 0.3352, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.7439972945552924, |
| "grad_norm": 5.5, |
| "learning_rate": 9.976336587449307e-08, |
| "loss": 0.3532, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.7575245180926615, |
| "grad_norm": 8.75, |
| "learning_rate": 8.974553413092556e-08, |
| "loss": 0.3405, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.7710517416300304, |
| "grad_norm": 6.4375, |
| "learning_rate": 8.023403276226127e-08, |
| "loss": 0.3612, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.7845789651673996, |
| "grad_norm": 6.46875, |
| "learning_rate": 7.123415255501652e-08, |
| "loss": 0.3513, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.7981061887047685, |
| "grad_norm": 7.625, |
| "learning_rate": 6.27508997056888e-08, |
| "loss": 0.3525, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.8116334122421374, |
| "grad_norm": 6.78125, |
| "learning_rate": 5.478899303605511e-08, |
| "loss": 0.3466, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.8251606357795063, |
| "grad_norm": 6.15625, |
| "learning_rate": 4.735286136831806e-08, |
| "loss": 0.3486, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.8386878593168752, |
| "grad_norm": 6.65625, |
| "learning_rate": 4.0446641061569144e-08, |
| "loss": 0.3363, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.852215082854244, |
| "grad_norm": 7.84375, |
| "learning_rate": 3.4074173710931796e-08, |
| "loss": 0.3437, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.865742306391613, |
| "grad_norm": 8.0625, |
| "learning_rate": 2.8239004010668367e-08, |
| "loss": 0.3358, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.879269529928982, |
| "grad_norm": 7.09375, |
| "learning_rate": 2.294437778243963e-08, |
| "loss": 0.3632, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.892796753466351, |
| "grad_norm": 6.375, |
| "learning_rate": 1.8193240169810942e-08, |
| "loss": 0.381, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.9063239770037201, |
| "grad_norm": 6.3125, |
| "learning_rate": 1.3988234000012367e-08, |
| "loss": 0.3476, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.919851200541089, |
| "grad_norm": 6.71875, |
| "learning_rate": 1.0331698313861937e-08, |
| "loss": 0.3403, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.933378424078458, |
| "grad_norm": 6.1875, |
| "learning_rate": 7.22566706467076e-09, |
| "loss": 0.345, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.9469056476158269, |
| "grad_norm": 6.5625, |
| "learning_rate": 4.6718679868533725e-09, |
| "loss": 0.3618, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.9604328711531958, |
| "grad_norm": 7.78125, |
| "learning_rate": 2.671721634873725e-09, |
| "loss": 0.3297, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.9739600946905647, |
| "grad_norm": 7.46875, |
| "learning_rate": 1.2263405930585947e-09, |
| "loss": 0.3393, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.9874873182279336, |
| "grad_norm": 7.125, |
| "learning_rate": 3.365288567216407e-10, |
| "loss": 0.362, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 11.3125, |
| "learning_rate": 2.7813849394764386e-12, |
| "loss": 0.3342, |
| "step": 1480 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1480, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.144769262871982e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|