| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 2385, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.020964360587002098, |
| "grad_norm": 3.998500347137451, |
| "learning_rate": 9.958071278825997e-05, |
| "loss": 0.1821, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.041928721174004195, |
| "grad_norm": 4.23317813873291, |
| "learning_rate": 9.916142557651992e-05, |
| "loss": 0.1617, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06289308176100629, |
| "grad_norm": 4.348649024963379, |
| "learning_rate": 9.874213836477988e-05, |
| "loss": 0.1799, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08385744234800839, |
| "grad_norm": 4.8091020584106445, |
| "learning_rate": 9.832285115303984e-05, |
| "loss": 0.2127, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10482180293501048, |
| "grad_norm": 4.523597240447998, |
| "learning_rate": 9.790356394129979e-05, |
| "loss": 0.2195, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12578616352201258, |
| "grad_norm": 6.083479404449463, |
| "learning_rate": 9.748427672955975e-05, |
| "loss": 0.2142, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.14675052410901468, |
| "grad_norm": 5.268312454223633, |
| "learning_rate": 9.706498951781971e-05, |
| "loss": 0.2293, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.16771488469601678, |
| "grad_norm": 5.908782482147217, |
| "learning_rate": 9.664570230607967e-05, |
| "loss": 0.2391, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.18867924528301888, |
| "grad_norm": 5.333123207092285, |
| "learning_rate": 9.622641509433963e-05, |
| "loss": 0.2519, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.20964360587002095, |
| "grad_norm": 4.541295051574707, |
| "learning_rate": 9.58071278825996e-05, |
| "loss": 0.241, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.23060796645702306, |
| "grad_norm": 5.367836952209473, |
| "learning_rate": 9.538784067085954e-05, |
| "loss": 0.2516, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.25157232704402516, |
| "grad_norm": 4.476315975189209, |
| "learning_rate": 9.496855345911951e-05, |
| "loss": 0.2581, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.27253668763102723, |
| "grad_norm": 4.104060173034668, |
| "learning_rate": 9.454926624737947e-05, |
| "loss": 0.2612, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.29350104821802936, |
| "grad_norm": 5.152798652648926, |
| "learning_rate": 9.412997903563942e-05, |
| "loss": 0.2736, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.31446540880503143, |
| "grad_norm": 4.785223960876465, |
| "learning_rate": 9.371069182389938e-05, |
| "loss": 0.267, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.33542976939203356, |
| "grad_norm": 4.769683837890625, |
| "learning_rate": 9.329140461215934e-05, |
| "loss": 0.2766, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.35639412997903563, |
| "grad_norm": 4.8373942375183105, |
| "learning_rate": 9.287211740041929e-05, |
| "loss": 0.2878, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 4.6672515869140625, |
| "learning_rate": 9.245283018867925e-05, |
| "loss": 0.2748, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.39832285115303984, |
| "grad_norm": 7.549396514892578, |
| "learning_rate": 9.203354297693921e-05, |
| "loss": 0.264, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4192872117400419, |
| "grad_norm": 6.70684289932251, |
| "learning_rate": 9.161425576519916e-05, |
| "loss": 0.2843, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.44025157232704404, |
| "grad_norm": 5.79678201675415, |
| "learning_rate": 9.119496855345912e-05, |
| "loss": 0.2907, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.4612159329140461, |
| "grad_norm": 6.699160575866699, |
| "learning_rate": 9.077568134171907e-05, |
| "loss": 0.2678, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.48218029350104824, |
| "grad_norm": 4.357872486114502, |
| "learning_rate": 9.035639412997903e-05, |
| "loss": 0.2965, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5031446540880503, |
| "grad_norm": 5.521259307861328, |
| "learning_rate": 8.9937106918239e-05, |
| "loss": 0.2876, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5241090146750524, |
| "grad_norm": 6.627674579620361, |
| "learning_rate": 8.951781970649896e-05, |
| "loss": 0.2825, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5450733752620545, |
| "grad_norm": 5.5945634841918945, |
| "learning_rate": 8.909853249475892e-05, |
| "loss": 0.2855, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5660377358490566, |
| "grad_norm": 4.596319198608398, |
| "learning_rate": 8.867924528301888e-05, |
| "loss": 0.2753, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5870020964360587, |
| "grad_norm": 5.92698860168457, |
| "learning_rate": 8.825995807127883e-05, |
| "loss": 0.3076, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6079664570230608, |
| "grad_norm": 4.971231937408447, |
| "learning_rate": 8.784067085953879e-05, |
| "loss": 0.2945, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6289308176100629, |
| "grad_norm": 5.803814888000488, |
| "learning_rate": 8.742138364779875e-05, |
| "loss": 0.3121, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.649895178197065, |
| "grad_norm": 4.640927314758301, |
| "learning_rate": 8.70020964360587e-05, |
| "loss": 0.2965, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6708595387840671, |
| "grad_norm": 4.943049430847168, |
| "learning_rate": 8.662473794549267e-05, |
| "loss": 0.2814, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6918238993710691, |
| "grad_norm": 18.59223175048828, |
| "learning_rate": 8.620545073375263e-05, |
| "loss": 0.2998, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7127882599580713, |
| "grad_norm": 4.966686725616455, |
| "learning_rate": 8.578616352201259e-05, |
| "loss": 0.282, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7337526205450734, |
| "grad_norm": 6.274560451507568, |
| "learning_rate": 8.536687631027254e-05, |
| "loss": 0.3123, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 5.680440902709961, |
| "learning_rate": 8.49475890985325e-05, |
| "loss": 0.2915, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7756813417190775, |
| "grad_norm": 5.426858425140381, |
| "learning_rate": 8.452830188679246e-05, |
| "loss": 0.3192, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.7966457023060797, |
| "grad_norm": 5.293428421020508, |
| "learning_rate": 8.410901467505241e-05, |
| "loss": 0.3207, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8176100628930818, |
| "grad_norm": 5.778203964233398, |
| "learning_rate": 8.368972746331237e-05, |
| "loss": 0.3202, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8385744234800838, |
| "grad_norm": 5.326625347137451, |
| "learning_rate": 8.327044025157233e-05, |
| "loss": 0.3088, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.859538784067086, |
| "grad_norm": 5.26389217376709, |
| "learning_rate": 8.285115303983228e-05, |
| "loss": 0.3084, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8805031446540881, |
| "grad_norm": 5.999328136444092, |
| "learning_rate": 8.243186582809224e-05, |
| "loss": 0.2822, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9014675052410901, |
| "grad_norm": 5.0098958015441895, |
| "learning_rate": 8.20125786163522e-05, |
| "loss": 0.307, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9224318658280922, |
| "grad_norm": 6.384605407714844, |
| "learning_rate": 8.159329140461215e-05, |
| "loss": 0.2976, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9433962264150944, |
| "grad_norm": 9.32125186920166, |
| "learning_rate": 8.117400419287212e-05, |
| "loss": 0.3198, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9643605870020965, |
| "grad_norm": 4.630229949951172, |
| "learning_rate": 8.075471698113208e-05, |
| "loss": 0.3031, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9853249475890985, |
| "grad_norm": 4.775624752044678, |
| "learning_rate": 8.033542976939204e-05, |
| "loss": 0.2676, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6731473207473755, |
| "eval_runtime": 20.7096, |
| "eval_samples_per_second": 72.43, |
| "eval_steps_per_second": 9.078, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.0062893081761006, |
| "grad_norm": 1.9550973176956177, |
| "learning_rate": 7.9916142557652e-05, |
| "loss": 0.2451, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0272536687631026, |
| "grad_norm": 3.6781623363494873, |
| "learning_rate": 7.949685534591196e-05, |
| "loss": 0.1127, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0482180293501049, |
| "grad_norm": 3.6007821559906006, |
| "learning_rate": 7.907756813417191e-05, |
| "loss": 0.1089, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.069182389937107, |
| "grad_norm": 3.0092623233795166, |
| "learning_rate": 7.865828092243187e-05, |
| "loss": 0.1203, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.090146750524109, |
| "grad_norm": 2.852006196975708, |
| "learning_rate": 7.823899371069184e-05, |
| "loss": 0.1126, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 3.5420737266540527, |
| "learning_rate": 7.781970649895178e-05, |
| "loss": 0.1254, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1320754716981132, |
| "grad_norm": 3.2206077575683594, |
| "learning_rate": 7.740041928721175e-05, |
| "loss": 0.1328, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1530398322851152, |
| "grad_norm": 3.895273447036743, |
| "learning_rate": 7.698113207547171e-05, |
| "loss": 0.125, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.1740041928721174, |
| "grad_norm": 3.2703685760498047, |
| "learning_rate": 7.656184486373166e-05, |
| "loss": 0.1306, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.1949685534591195, |
| "grad_norm": 2.977438449859619, |
| "learning_rate": 7.614255765199162e-05, |
| "loss": 0.1261, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.2159329140461215, |
| "grad_norm": 3.917140007019043, |
| "learning_rate": 7.572327044025158e-05, |
| "loss": 0.1232, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.2368972746331237, |
| "grad_norm": 4.090723514556885, |
| "learning_rate": 7.530398322851153e-05, |
| "loss": 0.1374, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.2578616352201257, |
| "grad_norm": 3.666203737258911, |
| "learning_rate": 7.488469601677149e-05, |
| "loss": 0.1317, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2788259958071277, |
| "grad_norm": 4.610381126403809, |
| "learning_rate": 7.446540880503144e-05, |
| "loss": 0.1276, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.29979035639413, |
| "grad_norm": 5.928828716278076, |
| "learning_rate": 7.40461215932914e-05, |
| "loss": 0.1258, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.320754716981132, |
| "grad_norm": 4.312713623046875, |
| "learning_rate": 7.362683438155136e-05, |
| "loss": 0.1318, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.3417190775681342, |
| "grad_norm": 3.831658363342285, |
| "learning_rate": 7.320754716981132e-05, |
| "loss": 0.1217, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.3626834381551363, |
| "grad_norm": 2.7465438842773438, |
| "learning_rate": 7.278825995807129e-05, |
| "loss": 0.1157, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3836477987421385, |
| "grad_norm": 2.915738582611084, |
| "learning_rate": 7.236897274633125e-05, |
| "loss": 0.1414, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.4046121593291405, |
| "grad_norm": 3.2107832431793213, |
| "learning_rate": 7.19496855345912e-05, |
| "loss": 0.1253, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.4255765199161425, |
| "grad_norm": 3.7944188117980957, |
| "learning_rate": 7.153039832285116e-05, |
| "loss": 0.1249, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.4465408805031448, |
| "grad_norm": 3.0400636196136475, |
| "learning_rate": 7.111111111111112e-05, |
| "loss": 0.1323, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.4675052410901468, |
| "grad_norm": 3.911773443222046, |
| "learning_rate": 7.069182389937107e-05, |
| "loss": 0.1265, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.4884696016771488, |
| "grad_norm": 4.3404765129089355, |
| "learning_rate": 7.027253668763103e-05, |
| "loss": 0.1328, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.509433962264151, |
| "grad_norm": 3.5862231254577637, |
| "learning_rate": 6.985324947589099e-05, |
| "loss": 0.1336, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.530398322851153, |
| "grad_norm": 4.099681377410889, |
| "learning_rate": 6.943396226415094e-05, |
| "loss": 0.1345, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.551362683438155, |
| "grad_norm": 4.485490322113037, |
| "learning_rate": 6.90146750524109e-05, |
| "loss": 0.1368, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.5723270440251573, |
| "grad_norm": 3.4771294593811035, |
| "learning_rate": 6.859538784067086e-05, |
| "loss": 0.1197, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.5932914046121593, |
| "grad_norm": 3.602762460708618, |
| "learning_rate": 6.817610062893081e-05, |
| "loss": 0.125, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.6142557651991614, |
| "grad_norm": 3.6131410598754883, |
| "learning_rate": 6.775681341719077e-05, |
| "loss": 0.1363, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6352201257861636, |
| "grad_norm": 4.117647647857666, |
| "learning_rate": 6.733752620545074e-05, |
| "loss": 0.1289, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.6561844863731656, |
| "grad_norm": 3.723130941390991, |
| "learning_rate": 6.691823899371068e-05, |
| "loss": 0.1319, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.6771488469601676, |
| "grad_norm": 3.42946195602417, |
| "learning_rate": 6.649895178197065e-05, |
| "loss": 0.1449, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.6981132075471699, |
| "grad_norm": 3.705895185470581, |
| "learning_rate": 6.607966457023061e-05, |
| "loss": 0.1446, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.719077568134172, |
| "grad_norm": 4.407285213470459, |
| "learning_rate": 6.566037735849057e-05, |
| "loss": 0.1417, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.740041928721174, |
| "grad_norm": 3.557760000228882, |
| "learning_rate": 6.524109014675053e-05, |
| "loss": 0.1338, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.7610062893081762, |
| "grad_norm": 3.342160224914551, |
| "learning_rate": 6.48218029350105e-05, |
| "loss": 0.1439, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.7819706498951782, |
| "grad_norm": 3.4098939895629883, |
| "learning_rate": 6.440251572327044e-05, |
| "loss": 0.1444, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.8029350104821802, |
| "grad_norm": 3.757795572280884, |
| "learning_rate": 6.39832285115304e-05, |
| "loss": 0.1327, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.8238993710691824, |
| "grad_norm": 3.3094663619995117, |
| "learning_rate": 6.356394129979037e-05, |
| "loss": 0.1338, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.8448637316561844, |
| "grad_norm": 3.925997495651245, |
| "learning_rate": 6.314465408805031e-05, |
| "loss": 0.1391, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.8658280922431865, |
| "grad_norm": 3.38191819190979, |
| "learning_rate": 6.272536687631028e-05, |
| "loss": 0.1396, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "grad_norm": 3.194735288619995, |
| "learning_rate": 6.230607966457024e-05, |
| "loss": 0.1304, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.9077568134171907, |
| "grad_norm": 3.6266653537750244, |
| "learning_rate": 6.188679245283019e-05, |
| "loss": 0.1259, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.9287211740041927, |
| "grad_norm": 4.087035655975342, |
| "learning_rate": 6.146750524109015e-05, |
| "loss": 0.145, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.949685534591195, |
| "grad_norm": 3.725505828857422, |
| "learning_rate": 6.104821802935011e-05, |
| "loss": 0.1291, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.9706498951781972, |
| "grad_norm": 3.440652847290039, |
| "learning_rate": 6.0628930817610065e-05, |
| "loss": 0.1348, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.991614255765199, |
| "grad_norm": 3.5756237506866455, |
| "learning_rate": 6.020964360587003e-05, |
| "loss": 0.1535, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.6702887415885925, |
| "eval_runtime": 20.8625, |
| "eval_samples_per_second": 71.9, |
| "eval_steps_per_second": 9.011, |
| "step": 954 |
| }, |
| { |
| "epoch": 2.0125786163522013, |
| "grad_norm": 1.9692338705062866, |
| "learning_rate": 5.979035639412999e-05, |
| "loss": 0.0812, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.0335429769392035, |
| "grad_norm": 2.1989524364471436, |
| "learning_rate": 5.937106918238994e-05, |
| "loss": 0.0489, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.0545073375262053, |
| "grad_norm": 1.9158607721328735, |
| "learning_rate": 5.89517819706499e-05, |
| "loss": 0.0465, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.0754716981132075, |
| "grad_norm": 1.7822378873825073, |
| "learning_rate": 5.853249475890986e-05, |
| "loss": 0.049, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.0964360587002098, |
| "grad_norm": 2.644869089126587, |
| "learning_rate": 5.811320754716981e-05, |
| "loss": 0.0545, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.1174004192872116, |
| "grad_norm": 2.888030529022217, |
| "learning_rate": 5.769392033542977e-05, |
| "loss": 0.0563, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.138364779874214, |
| "grad_norm": 2.034641742706299, |
| "learning_rate": 5.727463312368973e-05, |
| "loss": 0.0527, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.159329140461216, |
| "grad_norm": 1.8326114416122437, |
| "learning_rate": 5.685534591194969e-05, |
| "loss": 0.0509, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.180293501048218, |
| "grad_norm": 2.1253228187561035, |
| "learning_rate": 5.643605870020965e-05, |
| "loss": 0.0533, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.20125786163522, |
| "grad_norm": 2.369814872741699, |
| "learning_rate": 5.60167714884696e-05, |
| "loss": 0.0535, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 2.15043568611145, |
| "learning_rate": 5.559748427672956e-05, |
| "loss": 0.0525, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.243186582809224, |
| "grad_norm": 2.138932943344116, |
| "learning_rate": 5.517819706498952e-05, |
| "loss": 0.052, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.2641509433962264, |
| "grad_norm": 2.2840254306793213, |
| "learning_rate": 5.475890985324947e-05, |
| "loss": 0.0582, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.2851153039832286, |
| "grad_norm": 1.516945481300354, |
| "learning_rate": 5.433962264150943e-05, |
| "loss": 0.0557, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.3060796645702304, |
| "grad_norm": 2.2309811115264893, |
| "learning_rate": 5.3920335429769395e-05, |
| "loss": 0.0563, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.3270440251572326, |
| "grad_norm": 2.9257094860076904, |
| "learning_rate": 5.350104821802935e-05, |
| "loss": 0.056, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.348008385744235, |
| "grad_norm": 2.112194538116455, |
| "learning_rate": 5.308176100628931e-05, |
| "loss": 0.0567, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.368972746331237, |
| "grad_norm": 2.411298990249634, |
| "learning_rate": 5.2662473794549274e-05, |
| "loss": 0.0558, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.389937106918239, |
| "grad_norm": 2.410545825958252, |
| "learning_rate": 5.224318658280922e-05, |
| "loss": 0.0557, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.410901467505241, |
| "grad_norm": 2.1554999351501465, |
| "learning_rate": 5.1823899371069184e-05, |
| "loss": 0.0589, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.431865828092243, |
| "grad_norm": 2.1737091541290283, |
| "learning_rate": 5.1404612159329146e-05, |
| "loss": 0.0673, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.452830188679245, |
| "grad_norm": 3.798069477081299, |
| "learning_rate": 5.0985324947589094e-05, |
| "loss": 0.0546, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.4737945492662474, |
| "grad_norm": 2.5850603580474854, |
| "learning_rate": 5.0566037735849056e-05, |
| "loss": 0.0546, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.4947589098532497, |
| "grad_norm": 2.899369239807129, |
| "learning_rate": 5.014675052410902e-05, |
| "loss": 0.0576, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.5157232704402515, |
| "grad_norm": 2.534893035888672, |
| "learning_rate": 4.972746331236898e-05, |
| "loss": 0.0529, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.5366876310272537, |
| "grad_norm": 4.3690924644470215, |
| "learning_rate": 4.9308176100628935e-05, |
| "loss": 0.0494, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.5576519916142555, |
| "grad_norm": 1.8138058185577393, |
| "learning_rate": 4.888888888888889e-05, |
| "loss": 0.0559, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.5786163522012577, |
| "grad_norm": 1.9707310199737549, |
| "learning_rate": 4.846960167714885e-05, |
| "loss": 0.0544, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.59958071278826, |
| "grad_norm": 2.480036973953247, |
| "learning_rate": 4.805031446540881e-05, |
| "loss": 0.0591, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.620545073375262, |
| "grad_norm": 3.3198599815368652, |
| "learning_rate": 4.763102725366876e-05, |
| "loss": 0.0519, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.641509433962264, |
| "grad_norm": 4.302845478057861, |
| "learning_rate": 4.7211740041928724e-05, |
| "loss": 0.0588, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.6624737945492662, |
| "grad_norm": 2.769380807876587, |
| "learning_rate": 4.679245283018868e-05, |
| "loss": 0.0561, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.6834381551362685, |
| "grad_norm": 2.0273640155792236, |
| "learning_rate": 4.637316561844864e-05, |
| "loss": 0.0531, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.7044025157232703, |
| "grad_norm": 3.1066033840179443, |
| "learning_rate": 4.59538784067086e-05, |
| "loss": 0.0547, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.7253668763102725, |
| "grad_norm": 3.7857868671417236, |
| "learning_rate": 4.553459119496856e-05, |
| "loss": 0.0552, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.7463312368972748, |
| "grad_norm": 2.8125133514404297, |
| "learning_rate": 4.511530398322851e-05, |
| "loss": 0.0592, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.767295597484277, |
| "grad_norm": 1.8335483074188232, |
| "learning_rate": 4.469601677148847e-05, |
| "loss": 0.0576, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.788259958071279, |
| "grad_norm": 2.8290367126464844, |
| "learning_rate": 4.427672955974843e-05, |
| "loss": 0.0566, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.809224318658281, |
| "grad_norm": 2.31840443611145, |
| "learning_rate": 4.3857442348008385e-05, |
| "loss": 0.0504, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.830188679245283, |
| "grad_norm": 2.372063159942627, |
| "learning_rate": 4.343815513626835e-05, |
| "loss": 0.0591, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.851153039832285, |
| "grad_norm": 2.2945821285247803, |
| "learning_rate": 4.301886792452831e-05, |
| "loss": 0.0571, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.8721174004192873, |
| "grad_norm": 1.8437625169754028, |
| "learning_rate": 4.2599580712788264e-05, |
| "loss": 0.0544, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.8930817610062896, |
| "grad_norm": 2.612316608428955, |
| "learning_rate": 4.218029350104822e-05, |
| "loss": 0.0517, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.9140461215932913, |
| "grad_norm": 2.7713685035705566, |
| "learning_rate": 4.176100628930818e-05, |
| "loss": 0.0565, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.9350104821802936, |
| "grad_norm": 2.0892138481140137, |
| "learning_rate": 4.1341719077568136e-05, |
| "loss": 0.0591, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.9559748427672954, |
| "grad_norm": 1.9455903768539429, |
| "learning_rate": 4.092243186582809e-05, |
| "loss": 0.0494, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.9769392033542976, |
| "grad_norm": 3.191281318664551, |
| "learning_rate": 4.050314465408805e-05, |
| "loss": 0.0562, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.9979035639413, |
| "grad_norm": 2.252624034881592, |
| "learning_rate": 4.008385744234801e-05, |
| "loss": 0.053, |
| "step": 1430 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.667631983757019, |
| "eval_runtime": 22.0522, |
| "eval_samples_per_second": 68.02, |
| "eval_steps_per_second": 8.525, |
| "step": 1431 |
| }, |
| { |
| "epoch": 3.018867924528302, |
| "grad_norm": 1.2633923292160034, |
| "learning_rate": 3.966457023060797e-05, |
| "loss": 0.025, |
| "step": 1440 |
| }, |
| { |
| "epoch": 3.039832285115304, |
| "grad_norm": 1.2608442306518555, |
| "learning_rate": 3.924528301886793e-05, |
| "loss": 0.0191, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.060796645702306, |
| "grad_norm": 3.670048475265503, |
| "learning_rate": 3.882599580712789e-05, |
| "loss": 0.0231, |
| "step": 1460 |
| }, |
| { |
| "epoch": 3.0817610062893084, |
| "grad_norm": 3.2211225032806396, |
| "learning_rate": 3.840670859538784e-05, |
| "loss": 0.0225, |
| "step": 1470 |
| }, |
| { |
| "epoch": 3.10272536687631, |
| "grad_norm": 1.0529983043670654, |
| "learning_rate": 3.7987421383647804e-05, |
| "loss": 0.0242, |
| "step": 1480 |
| }, |
| { |
| "epoch": 3.1236897274633124, |
| "grad_norm": 1.5843929052352905, |
| "learning_rate": 3.756813417190776e-05, |
| "loss": 0.0199, |
| "step": 1490 |
| }, |
| { |
| "epoch": 3.1446540880503147, |
| "grad_norm": 1.28718900680542, |
| "learning_rate": 3.7148846960167714e-05, |
| "loss": 0.022, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.1656184486373165, |
| "grad_norm": 1.1010375022888184, |
| "learning_rate": 3.672955974842767e-05, |
| "loss": 0.0185, |
| "step": 1510 |
| }, |
| { |
| "epoch": 3.1865828092243187, |
| "grad_norm": 1.0108301639556885, |
| "learning_rate": 3.631027253668763e-05, |
| "loss": 0.0203, |
| "step": 1520 |
| }, |
| { |
| "epoch": 3.207547169811321, |
| "grad_norm": 1.856557846069336, |
| "learning_rate": 3.589098532494759e-05, |
| "loss": 0.0188, |
| "step": 1530 |
| }, |
| { |
| "epoch": 3.2285115303983227, |
| "grad_norm": 1.2554458379745483, |
| "learning_rate": 3.547169811320755e-05, |
| "loss": 0.0217, |
| "step": 1540 |
| }, |
| { |
| "epoch": 3.249475890985325, |
| "grad_norm": 1.5458911657333374, |
| "learning_rate": 3.505241090146751e-05, |
| "loss": 0.0217, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.270440251572327, |
| "grad_norm": 1.8719966411590576, |
| "learning_rate": 3.4633123689727465e-05, |
| "loss": 0.0239, |
| "step": 1560 |
| }, |
| { |
| "epoch": 3.291404612159329, |
| "grad_norm": 0.8294363617897034, |
| "learning_rate": 3.421383647798742e-05, |
| "loss": 0.0174, |
| "step": 1570 |
| }, |
| { |
| "epoch": 3.3123689727463312, |
| "grad_norm": 1.013922095298767, |
| "learning_rate": 3.379454926624738e-05, |
| "loss": 0.0191, |
| "step": 1580 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 2.7998366355895996, |
| "learning_rate": 3.337526205450734e-05, |
| "loss": 0.0211, |
| "step": 1590 |
| }, |
| { |
| "epoch": 3.3542976939203353, |
| "grad_norm": 1.3313286304473877, |
| "learning_rate": 3.295597484276729e-05, |
| "loss": 0.0214, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.3752620545073375, |
| "grad_norm": 1.3300907611846924, |
| "learning_rate": 3.2536687631027254e-05, |
| "loss": 0.0229, |
| "step": 1610 |
| }, |
| { |
| "epoch": 3.3962264150943398, |
| "grad_norm": 1.3926533460617065, |
| "learning_rate": 3.2117400419287216e-05, |
| "loss": 0.0192, |
| "step": 1620 |
| }, |
| { |
| "epoch": 3.4171907756813416, |
| "grad_norm": 1.8201309442520142, |
| "learning_rate": 3.169811320754717e-05, |
| "loss": 0.0194, |
| "step": 1630 |
| }, |
| { |
| "epoch": 3.438155136268344, |
| "grad_norm": 1.1363880634307861, |
| "learning_rate": 3.127882599580713e-05, |
| "loss": 0.0196, |
| "step": 1640 |
| }, |
| { |
| "epoch": 3.459119496855346, |
| "grad_norm": 1.1997497081756592, |
| "learning_rate": 3.085953878406709e-05, |
| "loss": 0.0192, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.480083857442348, |
| "grad_norm": 1.1632133722305298, |
| "learning_rate": 3.0440251572327043e-05, |
| "loss": 0.0165, |
| "step": 1660 |
| }, |
| { |
| "epoch": 3.50104821802935, |
| "grad_norm": 1.141566514968872, |
| "learning_rate": 3.0020964360587005e-05, |
| "loss": 0.0182, |
| "step": 1670 |
| }, |
| { |
| "epoch": 3.5220125786163523, |
| "grad_norm": 1.4071044921875, |
| "learning_rate": 2.9601677148846964e-05, |
| "loss": 0.0177, |
| "step": 1680 |
| }, |
| { |
| "epoch": 3.5429769392033545, |
| "grad_norm": 0.6424669623374939, |
| "learning_rate": 2.918238993710692e-05, |
| "loss": 0.018, |
| "step": 1690 |
| }, |
| { |
| "epoch": 3.5639412997903563, |
| "grad_norm": 1.240339756011963, |
| "learning_rate": 2.8763102725366874e-05, |
| "loss": 0.0181, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.5849056603773586, |
| "grad_norm": 1.8148901462554932, |
| "learning_rate": 2.8343815513626836e-05, |
| "loss": 0.0203, |
| "step": 1710 |
| }, |
| { |
| "epoch": 3.6058700209643604, |
| "grad_norm": 2.1308140754699707, |
| "learning_rate": 2.7924528301886794e-05, |
| "loss": 0.0208, |
| "step": 1720 |
| }, |
| { |
| "epoch": 3.6268343815513626, |
| "grad_norm": 1.0319384336471558, |
| "learning_rate": 2.750524109014675e-05, |
| "loss": 0.0171, |
| "step": 1730 |
| }, |
| { |
| "epoch": 3.647798742138365, |
| "grad_norm": 1.056301236152649, |
| "learning_rate": 2.708595387840671e-05, |
| "loss": 0.0193, |
| "step": 1740 |
| }, |
| { |
| "epoch": 3.668763102725367, |
| "grad_norm": 1.0502632856369019, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 0.0181, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.689727463312369, |
| "grad_norm": 1.5024515390396118, |
| "learning_rate": 2.6247379454926625e-05, |
| "loss": 0.0194, |
| "step": 1760 |
| }, |
| { |
| "epoch": 3.710691823899371, |
| "grad_norm": 1.1332629919052124, |
| "learning_rate": 2.5828092243186587e-05, |
| "loss": 0.0154, |
| "step": 1770 |
| }, |
| { |
| "epoch": 3.731656184486373, |
| "grad_norm": 1.3538895845413208, |
| "learning_rate": 2.5408805031446542e-05, |
| "loss": 0.0174, |
| "step": 1780 |
| }, |
| { |
| "epoch": 3.752620545073375, |
| "grad_norm": 3.9446728229522705, |
| "learning_rate": 2.49895178197065e-05, |
| "loss": 0.0174, |
| "step": 1790 |
| }, |
| { |
| "epoch": 3.7735849056603774, |
| "grad_norm": 1.6350576877593994, |
| "learning_rate": 2.4570230607966456e-05, |
| "loss": 0.0211, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.7945492662473796, |
| "grad_norm": 0.998974621295929, |
| "learning_rate": 2.4150943396226418e-05, |
| "loss": 0.0185, |
| "step": 1810 |
| }, |
| { |
| "epoch": 3.8155136268343814, |
| "grad_norm": 1.1701918840408325, |
| "learning_rate": 2.3731656184486376e-05, |
| "loss": 0.019, |
| "step": 1820 |
| }, |
| { |
| "epoch": 3.8364779874213837, |
| "grad_norm": 1.005288004875183, |
| "learning_rate": 2.331236897274633e-05, |
| "loss": 0.0169, |
| "step": 1830 |
| }, |
| { |
| "epoch": 3.8574423480083855, |
| "grad_norm": 1.4519301652908325, |
| "learning_rate": 2.289308176100629e-05, |
| "loss": 0.0191, |
| "step": 1840 |
| }, |
| { |
| "epoch": 3.8784067085953877, |
| "grad_norm": 0.8834218382835388, |
| "learning_rate": 2.2473794549266248e-05, |
| "loss": 0.0171, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.89937106918239, |
| "grad_norm": 1.3449293375015259, |
| "learning_rate": 2.2054507337526207e-05, |
| "loss": 0.0149, |
| "step": 1860 |
| }, |
| { |
| "epoch": 3.920335429769392, |
| "grad_norm": 1.1032202243804932, |
| "learning_rate": 2.1635220125786165e-05, |
| "loss": 0.0175, |
| "step": 1870 |
| }, |
| { |
| "epoch": 3.941299790356394, |
| "grad_norm": 0.9004182815551758, |
| "learning_rate": 2.121593291404612e-05, |
| "loss": 0.0152, |
| "step": 1880 |
| }, |
| { |
| "epoch": 3.9622641509433962, |
| "grad_norm": 1.1716896295547485, |
| "learning_rate": 2.0796645702306082e-05, |
| "loss": 0.0189, |
| "step": 1890 |
| }, |
| { |
| "epoch": 3.9832285115303985, |
| "grad_norm": 1.6147270202636719, |
| "learning_rate": 2.037735849056604e-05, |
| "loss": 0.0183, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.6632634401321411, |
| "eval_runtime": 22.7298, |
| "eval_samples_per_second": 65.993, |
| "eval_steps_per_second": 8.271, |
| "step": 1908 |
| }, |
| { |
| "epoch": 4.0041928721174, |
| "grad_norm": 0.37951168417930603, |
| "learning_rate": 1.9958071278825996e-05, |
| "loss": 0.0163, |
| "step": 1910 |
| }, |
| { |
| "epoch": 4.0251572327044025, |
| "grad_norm": 0.3050394654273987, |
| "learning_rate": 1.9538784067085954e-05, |
| "loss": 0.007, |
| "step": 1920 |
| }, |
| { |
| "epoch": 4.046121593291405, |
| "grad_norm": 0.34913668036460876, |
| "learning_rate": 1.9119496855345913e-05, |
| "loss": 0.0064, |
| "step": 1930 |
| }, |
| { |
| "epoch": 4.067085953878407, |
| "grad_norm": 0.36414623260498047, |
| "learning_rate": 1.870020964360587e-05, |
| "loss": 0.0067, |
| "step": 1940 |
| }, |
| { |
| "epoch": 4.088050314465409, |
| "grad_norm": 0.5772935152053833, |
| "learning_rate": 1.828092243186583e-05, |
| "loss": 0.0061, |
| "step": 1950 |
| }, |
| { |
| "epoch": 4.109014675052411, |
| "grad_norm": 0.2230810970067978, |
| "learning_rate": 1.7861635220125785e-05, |
| "loss": 0.0059, |
| "step": 1960 |
| }, |
| { |
| "epoch": 4.129979035639413, |
| "grad_norm": 0.46709367632865906, |
| "learning_rate": 1.7442348008385743e-05, |
| "loss": 0.0063, |
| "step": 1970 |
| }, |
| { |
| "epoch": 4.150943396226415, |
| "grad_norm": 0.40340206027030945, |
| "learning_rate": 1.7023060796645705e-05, |
| "loss": 0.006, |
| "step": 1980 |
| }, |
| { |
| "epoch": 4.171907756813417, |
| "grad_norm": 0.22066733241081238, |
| "learning_rate": 1.660377358490566e-05, |
| "loss": 0.0055, |
| "step": 1990 |
| }, |
| { |
| "epoch": 4.1928721174004195, |
| "grad_norm": 0.22712306678295135, |
| "learning_rate": 1.618448637316562e-05, |
| "loss": 0.0075, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.213836477987422, |
| "grad_norm": 0.3246201276779175, |
| "learning_rate": 1.5765199161425577e-05, |
| "loss": 0.0059, |
| "step": 2010 |
| }, |
| { |
| "epoch": 4.234800838574423, |
| "grad_norm": 0.6034521460533142, |
| "learning_rate": 1.5345911949685536e-05, |
| "loss": 0.0058, |
| "step": 2020 |
| }, |
| { |
| "epoch": 4.255765199161425, |
| "grad_norm": 0.3003758490085602, |
| "learning_rate": 1.4926624737945494e-05, |
| "loss": 0.0059, |
| "step": 2030 |
| }, |
| { |
| "epoch": 4.276729559748428, |
| "grad_norm": 0.38291364908218384, |
| "learning_rate": 1.450733752620545e-05, |
| "loss": 0.0053, |
| "step": 2040 |
| }, |
| { |
| "epoch": 4.29769392033543, |
| "grad_norm": 0.24918225407600403, |
| "learning_rate": 1.408805031446541e-05, |
| "loss": 0.0055, |
| "step": 2050 |
| }, |
| { |
| "epoch": 4.318658280922432, |
| "grad_norm": 0.29115813970565796, |
| "learning_rate": 1.3668763102725368e-05, |
| "loss": 0.0057, |
| "step": 2060 |
| }, |
| { |
| "epoch": 4.339622641509434, |
| "grad_norm": 0.6018015146255493, |
| "learning_rate": 1.3249475890985325e-05, |
| "loss": 0.0056, |
| "step": 2070 |
| }, |
| { |
| "epoch": 4.360587002096436, |
| "grad_norm": 0.4561573565006256, |
| "learning_rate": 1.2830188679245283e-05, |
| "loss": 0.0053, |
| "step": 2080 |
| }, |
| { |
| "epoch": 4.381551362683438, |
| "grad_norm": 0.2961859703063965, |
| "learning_rate": 1.2410901467505242e-05, |
| "loss": 0.0058, |
| "step": 2090 |
| }, |
| { |
| "epoch": 4.40251572327044, |
| "grad_norm": 0.7806200385093689, |
| "learning_rate": 1.19916142557652e-05, |
| "loss": 0.0051, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.423480083857442, |
| "grad_norm": 0.2288259118795395, |
| "learning_rate": 1.1572327044025157e-05, |
| "loss": 0.0062, |
| "step": 2110 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.5480152368545532, |
| "learning_rate": 1.1153039832285116e-05, |
| "loss": 0.0054, |
| "step": 2120 |
| }, |
| { |
| "epoch": 4.465408805031447, |
| "grad_norm": 0.3903510272502899, |
| "learning_rate": 1.0733752620545073e-05, |
| "loss": 0.0068, |
| "step": 2130 |
| }, |
| { |
| "epoch": 4.486373165618448, |
| "grad_norm": 0.26396581530570984, |
| "learning_rate": 1.0314465408805033e-05, |
| "loss": 0.0051, |
| "step": 2140 |
| }, |
| { |
| "epoch": 4.5073375262054505, |
| "grad_norm": 0.3533041477203369, |
| "learning_rate": 9.89517819706499e-06, |
| "loss": 0.0062, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.528301886792453, |
| "grad_norm": 0.23736946284770966, |
| "learning_rate": 9.475890985324948e-06, |
| "loss": 0.0056, |
| "step": 2160 |
| }, |
| { |
| "epoch": 4.549266247379455, |
| "grad_norm": 0.7073323726654053, |
| "learning_rate": 9.056603773584905e-06, |
| "loss": 0.0054, |
| "step": 2170 |
| }, |
| { |
| "epoch": 4.570230607966457, |
| "grad_norm": 0.22164012491703033, |
| "learning_rate": 8.637316561844865e-06, |
| "loss": 0.0055, |
| "step": 2180 |
| }, |
| { |
| "epoch": 4.591194968553459, |
| "grad_norm": 0.27965155243873596, |
| "learning_rate": 8.218029350104822e-06, |
| "loss": 0.0048, |
| "step": 2190 |
| }, |
| { |
| "epoch": 4.612159329140461, |
| "grad_norm": 0.7329670786857605, |
| "learning_rate": 7.79874213836478e-06, |
| "loss": 0.0057, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.633123689727463, |
| "grad_norm": 0.310923308134079, |
| "learning_rate": 7.379454926624739e-06, |
| "loss": 0.0054, |
| "step": 2210 |
| }, |
| { |
| "epoch": 4.654088050314465, |
| "grad_norm": 0.19455011188983917, |
| "learning_rate": 6.9601677148846965e-06, |
| "loss": 0.0072, |
| "step": 2220 |
| }, |
| { |
| "epoch": 4.6750524109014675, |
| "grad_norm": 0.257545530796051, |
| "learning_rate": 6.540880503144654e-06, |
| "loss": 0.0057, |
| "step": 2230 |
| }, |
| { |
| "epoch": 4.69601677148847, |
| "grad_norm": 1.0093029737472534, |
| "learning_rate": 6.121593291404613e-06, |
| "loss": 0.006, |
| "step": 2240 |
| }, |
| { |
| "epoch": 4.716981132075472, |
| "grad_norm": 0.4301735758781433, |
| "learning_rate": 5.70230607966457e-06, |
| "loss": 0.0056, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.737945492662474, |
| "grad_norm": 0.18497976660728455, |
| "learning_rate": 5.283018867924529e-06, |
| "loss": 0.0052, |
| "step": 2260 |
| }, |
| { |
| "epoch": 4.758909853249476, |
| "grad_norm": 0.21521350741386414, |
| "learning_rate": 4.8637316561844865e-06, |
| "loss": 0.0055, |
| "step": 2270 |
| }, |
| { |
| "epoch": 4.779874213836478, |
| "grad_norm": 0.20319782197475433, |
| "learning_rate": 4.444444444444445e-06, |
| "loss": 0.0058, |
| "step": 2280 |
| }, |
| { |
| "epoch": 4.80083857442348, |
| "grad_norm": 0.2288380116224289, |
| "learning_rate": 4.025157232704403e-06, |
| "loss": 0.005, |
| "step": 2290 |
| }, |
| { |
| "epoch": 4.821802935010482, |
| "grad_norm": 1.407477855682373, |
| "learning_rate": 3.6058700209643607e-06, |
| "loss": 0.0073, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.8427672955974845, |
| "grad_norm": 0.26325657963752747, |
| "learning_rate": 3.1865828092243184e-06, |
| "loss": 0.0057, |
| "step": 2310 |
| }, |
| { |
| "epoch": 4.863731656184486, |
| "grad_norm": 0.4354378283023834, |
| "learning_rate": 2.767295597484277e-06, |
| "loss": 0.006, |
| "step": 2320 |
| }, |
| { |
| "epoch": 4.884696016771488, |
| "grad_norm": 0.7161938548088074, |
| "learning_rate": 2.348008385744235e-06, |
| "loss": 0.0055, |
| "step": 2330 |
| }, |
| { |
| "epoch": 4.90566037735849, |
| "grad_norm": 1.1639270782470703, |
| "learning_rate": 1.928721174004193e-06, |
| "loss": 0.0056, |
| "step": 2340 |
| }, |
| { |
| "epoch": 4.926624737945493, |
| "grad_norm": 0.3333396017551422, |
| "learning_rate": 1.509433962264151e-06, |
| "loss": 0.0061, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.947589098532495, |
| "grad_norm": 0.3744851052761078, |
| "learning_rate": 1.090146750524109e-06, |
| "loss": 0.0056, |
| "step": 2360 |
| }, |
| { |
| "epoch": 4.968553459119497, |
| "grad_norm": 0.1569022685289383, |
| "learning_rate": 6.70859538784067e-07, |
| "loss": 0.0052, |
| "step": 2370 |
| }, |
| { |
| "epoch": 4.989517819706499, |
| "grad_norm": 0.30756279826164246, |
| "learning_rate": 2.5157232704402517e-07, |
| "loss": 0.0056, |
| "step": 2380 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.6631835699081421, |
| "eval_runtime": 22.601, |
| "eval_samples_per_second": 66.369, |
| "eval_steps_per_second": 8.318, |
| "step": 2385 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2385, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8785440290816e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|