{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.13175905660515472,
"eval_steps": 2000,
"global_step": 18000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007319947589175262,
"grad_norm": 800.0,
"learning_rate": 4.879238838741157e-07,
"loss": 0.952,
"step": 100
},
{
"epoch": 0.0014639895178350524,
"grad_norm": 768.0,
"learning_rate": 9.758477677482314e-07,
"loss": 0.8458,
"step": 200
},
{
"epoch": 0.0021959842767525785,
"grad_norm": 1600.0,
"learning_rate": 1.4637716516223471e-06,
"loss": 0.8841,
"step": 300
},
{
"epoch": 0.0029279790356701047,
"grad_norm": 616.0,
"learning_rate": 1.951695535496463e-06,
"loss": 0.9277,
"step": 400
},
{
"epoch": 0.003659973794587631,
"grad_norm": 436.0,
"learning_rate": 2.4396194193705783e-06,
"loss": 0.6913,
"step": 500
},
{
"epoch": 0.004391968553505157,
"grad_norm": 1256.0,
"learning_rate": 2.9275433032446943e-06,
"loss": 0.6372,
"step": 600
},
{
"epoch": 0.005123963312422683,
"grad_norm": 764.0,
"learning_rate": 3.41546718711881e-06,
"loss": 1.0322,
"step": 700
},
{
"epoch": 0.005855958071340209,
"grad_norm": 1408.0,
"learning_rate": 3.903391070992926e-06,
"loss": 0.7853,
"step": 800
},
{
"epoch": 0.006587952830257735,
"grad_norm": 43.5,
"learning_rate": 4.391314954867041e-06,
"loss": 0.9377,
"step": 900
},
{
"epoch": 0.007319947589175262,
"grad_norm": 684.0,
"learning_rate": 4.879238838741157e-06,
"loss": 1.4695,
"step": 1000
},
{
"epoch": 0.008051942348092788,
"grad_norm": 524.0,
"learning_rate": 5.367162722615272e-06,
"loss": 1.4889,
"step": 1100
},
{
"epoch": 0.008783937107010314,
"grad_norm": 33.25,
"learning_rate": 5.8550866064893885e-06,
"loss": 1.2154,
"step": 1200
},
{
"epoch": 0.00951593186592784,
"grad_norm": 1.171875,
"learning_rate": 6.343010490363504e-06,
"loss": 1.3154,
"step": 1300
},
{
"epoch": 0.010247926624845366,
"grad_norm": 388.0,
"learning_rate": 6.83093437423762e-06,
"loss": 1.5739,
"step": 1400
},
{
"epoch": 0.010979921383762893,
"grad_norm": 251.0,
"learning_rate": 7.318858258111735e-06,
"loss": 0.7903,
"step": 1500
},
{
"epoch": 0.011711916142680419,
"grad_norm": 95.5,
"learning_rate": 7.806782141985851e-06,
"loss": 0.6962,
"step": 1600
},
{
"epoch": 0.012443910901597945,
"grad_norm": 516.0,
"learning_rate": 8.294706025859967e-06,
"loss": 1.2352,
"step": 1700
},
{
"epoch": 0.01317590566051547,
"grad_norm": 41.0,
"learning_rate": 8.782629909734082e-06,
"loss": 0.924,
"step": 1800
},
{
"epoch": 0.013907900419432996,
"grad_norm": 0.91015625,
"learning_rate": 9.270553793608198e-06,
"loss": 1.467,
"step": 1900
},
{
"epoch": 0.014639895178350524,
"grad_norm": 1.15625,
"learning_rate": 9.758477677482313e-06,
"loss": 0.9323,
"step": 2000
},
{
"epoch": 0.014639895178350524,
"eval_loss": 1.4799224138259888,
"eval_runtime": 27.9405,
"eval_samples_per_second": 17.895,
"eval_steps_per_second": 17.895,
"step": 2000
},
{
"epoch": 0.01537188993726805,
"grad_norm": 39.5,
"learning_rate": 1.0246401561356429e-05,
"loss": 1.2229,
"step": 2100
},
{
"epoch": 0.016103884696185577,
"grad_norm": 984.0,
"learning_rate": 1.0734325445230544e-05,
"loss": 1.7086,
"step": 2200
},
{
"epoch": 0.016835879455103103,
"grad_norm": 684.0,
"learning_rate": 1.122224932910466e-05,
"loss": 0.9654,
"step": 2300
},
{
"epoch": 0.01756787421402063,
"grad_norm": 840.0,
"learning_rate": 1.1710173212978777e-05,
"loss": 1.3702,
"step": 2400
},
{
"epoch": 0.018299868972938154,
"grad_norm": 388.0,
"learning_rate": 1.2198097096852893e-05,
"loss": 1.0888,
"step": 2500
},
{
"epoch": 0.01903186373185568,
"grad_norm": 832.0,
"learning_rate": 1.2686020980727008e-05,
"loss": 0.9989,
"step": 2600
},
{
"epoch": 0.019763858490773206,
"grad_norm": 0.3984375,
"learning_rate": 1.3173944864601122e-05,
"loss": 1.3161,
"step": 2700
},
{
"epoch": 0.02049585324969073,
"grad_norm": 644.0,
"learning_rate": 1.366186874847524e-05,
"loss": 1.1279,
"step": 2800
},
{
"epoch": 0.021227848008608257,
"grad_norm": 414.0,
"learning_rate": 1.4149792632349354e-05,
"loss": 1.2044,
"step": 2900
},
{
"epoch": 0.021959842767525786,
"grad_norm": 12.1875,
"learning_rate": 1.463771651622347e-05,
"loss": 1.0637,
"step": 3000
},
{
"epoch": 0.022691837526443312,
"grad_norm": 0.95703125,
"learning_rate": 1.5125640400097585e-05,
"loss": 1.2628,
"step": 3100
},
{
"epoch": 0.023423832285360838,
"grad_norm": 10.875,
"learning_rate": 1.5613564283971703e-05,
"loss": 1.3439,
"step": 3200
},
{
"epoch": 0.024155827044278363,
"grad_norm": 256.0,
"learning_rate": 1.6101488167845818e-05,
"loss": 1.2094,
"step": 3300
},
{
"epoch": 0.02488782180319589,
"grad_norm": 0.267578125,
"learning_rate": 1.6589412051719934e-05,
"loss": 1.4852,
"step": 3400
},
{
"epoch": 0.025619816562113415,
"grad_norm": 54.25,
"learning_rate": 1.707733593559405e-05,
"loss": 1.1689,
"step": 3500
},
{
"epoch": 0.02635181132103094,
"grad_norm": 308.0,
"learning_rate": 1.7565259819468165e-05,
"loss": 1.0845,
"step": 3600
},
{
"epoch": 0.027083806079948466,
"grad_norm": 264.0,
"learning_rate": 1.805318370334228e-05,
"loss": 1.2785,
"step": 3700
},
{
"epoch": 0.027815800838865992,
"grad_norm": 81.5,
"learning_rate": 1.8541107587216396e-05,
"loss": 1.0887,
"step": 3800
},
{
"epoch": 0.02854779559778352,
"grad_norm": 696.0,
"learning_rate": 1.902903147109051e-05,
"loss": 1.2968,
"step": 3900
},
{
"epoch": 0.029279790356701047,
"grad_norm": 186.0,
"learning_rate": 1.9516955354964627e-05,
"loss": 1.3166,
"step": 4000
},
{
"epoch": 0.029279790356701047,
"eval_loss": 0.8257483839988708,
"eval_runtime": 27.9567,
"eval_samples_per_second": 17.885,
"eval_steps_per_second": 17.885,
"step": 4000
},
{
"epoch": 0.030011785115618573,
"grad_norm": 1.9140625,
"learning_rate": 1.9999999997189743e-05,
"loss": 1.171,
"step": 4100
},
{
"epoch": 0.0307437798745361,
"grad_norm": 211.0,
"learning_rate": 1.9999971332569874e-05,
"loss": 1.1772,
"step": 4200
},
{
"epoch": 0.03147577463345363,
"grad_norm": 0.1416015625,
"learning_rate": 1.9999886462973602e-05,
"loss": 1.0697,
"step": 4300
},
{
"epoch": 0.032207769392371154,
"grad_norm": 0.220703125,
"learning_rate": 1.9999745388877933e-05,
"loss": 1.2177,
"step": 4400
},
{
"epoch": 0.03293976415128868,
"grad_norm": 0.283203125,
"learning_rate": 1.999954811107578e-05,
"loss": 1.1959,
"step": 4500
},
{
"epoch": 0.033671758910206205,
"grad_norm": 0.490234375,
"learning_rate": 1.9999294630675945e-05,
"loss": 1.1617,
"step": 4600
},
{
"epoch": 0.03440375366912373,
"grad_norm": 390.0,
"learning_rate": 1.999898494910312e-05,
"loss": 1.1348,
"step": 4700
},
{
"epoch": 0.03513574842804126,
"grad_norm": 0.279296875,
"learning_rate": 1.999861906809787e-05,
"loss": 1.1857,
"step": 4800
},
{
"epoch": 0.03586774318695878,
"grad_norm": 620.0,
"learning_rate": 1.9998196989716637e-05,
"loss": 1.1041,
"step": 4900
},
{
"epoch": 0.03659973794587631,
"grad_norm": 7.9375,
"learning_rate": 1.999771871633172e-05,
"loss": 1.2604,
"step": 5000
},
{
"epoch": 0.037331732704793834,
"grad_norm": 0.1328125,
"learning_rate": 1.9997184250631257e-05,
"loss": 1.1525,
"step": 5100
},
{
"epoch": 0.03806372746371136,
"grad_norm": 988.0,
"learning_rate": 1.999659359561922e-05,
"loss": 1.1125,
"step": 5200
},
{
"epoch": 0.038795722222628885,
"grad_norm": 528.0,
"learning_rate": 1.99959467546154e-05,
"loss": 1.0241,
"step": 5300
},
{
"epoch": 0.03952771698154641,
"grad_norm": 0.08203125,
"learning_rate": 1.999524373125537e-05,
"loss": 1.0007,
"step": 5400
},
{
"epoch": 0.04025971174046394,
"grad_norm": 0.06494140625,
"learning_rate": 1.9994484529490483e-05,
"loss": 1.7392,
"step": 5500
},
{
"epoch": 0.04099170649938146,
"grad_norm": 155.0,
"learning_rate": 1.9993669153587842e-05,
"loss": 1.6975,
"step": 5600
},
{
"epoch": 0.04172370125829899,
"grad_norm": 0.1787109375,
"learning_rate": 1.9992797608130284e-05,
"loss": 1.3126,
"step": 5700
},
{
"epoch": 0.042455696017216514,
"grad_norm": 102.5,
"learning_rate": 1.9991869898016337e-05,
"loss": 1.0694,
"step": 5800
},
{
"epoch": 0.04318769077613404,
"grad_norm": 282.0,
"learning_rate": 1.999088602846021e-05,
"loss": 1.1731,
"step": 5900
},
{
"epoch": 0.04391968553505157,
"grad_norm": 756.0,
"learning_rate": 1.998984600499175e-05,
"loss": 0.9569,
"step": 6000
},
{
"epoch": 0.04391968553505157,
"eval_loss": 1.0243369340896606,
"eval_runtime": 27.9367,
"eval_samples_per_second": 17.898,
"eval_steps_per_second": 17.898,
"step": 6000
},
{
"epoch": 0.0446516802939691,
"grad_norm": 0.08935546875,
"learning_rate": 1.9988749833456433e-05,
"loss": 0.8217,
"step": 6100
},
{
"epoch": 0.045383675052886624,
"grad_norm": 0.1650390625,
"learning_rate": 1.9987597520015302e-05,
"loss": 0.9041,
"step": 6200
},
{
"epoch": 0.04611566981180415,
"grad_norm": 70.0,
"learning_rate": 1.998638907114495e-05,
"loss": 1.0699,
"step": 6300
},
{
"epoch": 0.046847664570721675,
"grad_norm": 178.0,
"learning_rate": 1.998512449363748e-05,
"loss": 0.9322,
"step": 6400
},
{
"epoch": 0.0475796593296392,
"grad_norm": 0.1533203125,
"learning_rate": 1.9983803794600468e-05,
"loss": 0.9877,
"step": 6500
},
{
"epoch": 0.04831165408855673,
"grad_norm": 368.0,
"learning_rate": 1.998242698145692e-05,
"loss": 1.0714,
"step": 6600
},
{
"epoch": 0.04904364884747425,
"grad_norm": 0.279296875,
"learning_rate": 1.9980994061945238e-05,
"loss": 0.9344,
"step": 6700
},
{
"epoch": 0.04977564360639178,
"grad_norm": 2800.0,
"learning_rate": 1.997950504411916e-05,
"loss": 1.2076,
"step": 6800
},
{
"epoch": 0.050507638365309304,
"grad_norm": 0.31640625,
"learning_rate": 1.9977959936347732e-05,
"loss": 1.0685,
"step": 6900
},
{
"epoch": 0.05123963312422683,
"grad_norm": 29.75,
"learning_rate": 1.9976358747315254e-05,
"loss": 1.1026,
"step": 7000
},
{
"epoch": 0.051971627883144356,
"grad_norm": 2112.0,
"learning_rate": 1.9974701486021233e-05,
"loss": 1.0783,
"step": 7100
},
{
"epoch": 0.05270362264206188,
"grad_norm": 0.111328125,
"learning_rate": 1.997298816178033e-05,
"loss": 0.8777,
"step": 7200
},
{
"epoch": 0.05343561740097941,
"grad_norm": 0.07080078125,
"learning_rate": 1.9971218784222302e-05,
"loss": 0.9701,
"step": 7300
},
{
"epoch": 0.05416761215989693,
"grad_norm": 132.0,
"learning_rate": 1.9969393363291963e-05,
"loss": 0.9978,
"step": 7400
},
{
"epoch": 0.05489960691881446,
"grad_norm": 2.03125,
"learning_rate": 1.9967511909249118e-05,
"loss": 1.2451,
"step": 7500
},
{
"epoch": 0.055631601677731984,
"grad_norm": 912.0,
"learning_rate": 1.99655744326685e-05,
"loss": 0.8866,
"step": 7600
},
{
"epoch": 0.05636359643664951,
"grad_norm": 0.10986328125,
"learning_rate": 1.9963580944439732e-05,
"loss": 0.9139,
"step": 7700
},
{
"epoch": 0.05709559119556704,
"grad_norm": 0.1796875,
"learning_rate": 1.9961531455767233e-05,
"loss": 1.0991,
"step": 7800
},
{
"epoch": 0.05782758595448457,
"grad_norm": 0.45703125,
"learning_rate": 1.9959425978170187e-05,
"loss": 1.0318,
"step": 7900
},
{
"epoch": 0.058559580713402094,
"grad_norm": 161.0,
"learning_rate": 1.995726452348246e-05,
"loss": 1.0115,
"step": 8000
},
{
"epoch": 0.058559580713402094,
"eval_loss": 1.2017102241516113,
"eval_runtime": 27.9424,
"eval_samples_per_second": 17.894,
"eval_steps_per_second": 17.894,
"step": 8000
},
{
"epoch": 0.05929157547231962,
"grad_norm": 94.5,
"learning_rate": 1.9955047103852534e-05,
"loss": 1.3752,
"step": 8100
},
{
"epoch": 0.060023570231237146,
"grad_norm": 83.0,
"learning_rate": 1.995277373174345e-05,
"loss": 1.0333,
"step": 8200
},
{
"epoch": 0.06075556499015467,
"grad_norm": 0.1708984375,
"learning_rate": 1.9950444419932723e-05,
"loss": 1.0582,
"step": 8300
},
{
"epoch": 0.0614875597490722,
"grad_norm": 8.5,
"learning_rate": 1.994805918151229e-05,
"loss": 0.9273,
"step": 8400
},
{
"epoch": 0.06221955450798972,
"grad_norm": 0.1376953125,
"learning_rate": 1.9945618029888408e-05,
"loss": 0.8619,
"step": 8500
},
{
"epoch": 0.06295154926690726,
"grad_norm": 552.0,
"learning_rate": 1.994312097878161e-05,
"loss": 1.2394,
"step": 8600
},
{
"epoch": 0.06368354402582478,
"grad_norm": 0.62890625,
"learning_rate": 1.99405680422266e-05,
"loss": 0.8713,
"step": 8700
},
{
"epoch": 0.06441553878474231,
"grad_norm": 152.0,
"learning_rate": 1.9937959234572198e-05,
"loss": 0.9949,
"step": 8800
},
{
"epoch": 0.06514753354365983,
"grad_norm": 99.0,
"learning_rate": 1.993529457048124e-05,
"loss": 1.0313,
"step": 8900
},
{
"epoch": 0.06587952830257736,
"grad_norm": 1004.0,
"learning_rate": 1.993257406493051e-05,
"loss": 1.0299,
"step": 9000
},
{
"epoch": 0.06661152306149488,
"grad_norm": 0.16796875,
"learning_rate": 1.9929797733210644e-05,
"loss": 0.9293,
"step": 9100
},
{
"epoch": 0.06734351782041241,
"grad_norm": 0.75,
"learning_rate": 1.992696559092605e-05,
"loss": 1.04,
"step": 9200
},
{
"epoch": 0.06807551257932994,
"grad_norm": 5.15625,
"learning_rate": 1.992407765399483e-05,
"loss": 1.072,
"step": 9300
},
{
"epoch": 0.06880750733824746,
"grad_norm": 0.12890625,
"learning_rate": 1.992113393864867e-05,
"loss": 1.102,
"step": 9400
},
{
"epoch": 0.06953950209716499,
"grad_norm": 0.66796875,
"learning_rate": 1.9918134461432763e-05,
"loss": 1.0206,
"step": 9500
},
{
"epoch": 0.07027149685608251,
"grad_norm": 0.158203125,
"learning_rate": 1.991507923920571e-05,
"loss": 0.7945,
"step": 9600
},
{
"epoch": 0.07100349161500004,
"grad_norm": 4.75,
"learning_rate": 1.991196828913943e-05,
"loss": 1.1373,
"step": 9700
},
{
"epoch": 0.07173548637391756,
"grad_norm": 88.0,
"learning_rate": 1.9908801628719063e-05,
"loss": 1.0789,
"step": 9800
},
{
"epoch": 0.07246748113283509,
"grad_norm": 0.283203125,
"learning_rate": 1.9905579275742866e-05,
"loss": 0.9591,
"step": 9900
},
{
"epoch": 0.07319947589175262,
"grad_norm": 484.0,
"learning_rate": 1.990230124832212e-05,
"loss": 1.1461,
"step": 10000
},
{
"epoch": 0.07319947589175262,
"eval_loss": 0.7712569832801819,
"eval_runtime": 28.1186,
"eval_samples_per_second": 17.782,
"eval_steps_per_second": 17.782,
"step": 10000
},
{
"epoch": 0.07393147065067014,
"grad_norm": 696.0,
"learning_rate": 1.9898967564881014e-05,
"loss": 1.0556,
"step": 10100
},
{
"epoch": 0.07466346540958767,
"grad_norm": 9.4375,
"learning_rate": 1.9895578244156576e-05,
"loss": 1.1493,
"step": 10200
},
{
"epoch": 0.0753954601685052,
"grad_norm": 1.34375,
"learning_rate": 1.989213330519852e-05,
"loss": 0.8955,
"step": 10300
},
{
"epoch": 0.07612745492742272,
"grad_norm": 146.0,
"learning_rate": 1.988863276736918e-05,
"loss": 1.2152,
"step": 10400
},
{
"epoch": 0.07685944968634024,
"grad_norm": 756.0,
"learning_rate": 1.9885076650343364e-05,
"loss": 1.0884,
"step": 10500
},
{
"epoch": 0.07759144444525777,
"grad_norm": 151.0,
"learning_rate": 1.988146497410829e-05,
"loss": 1.1883,
"step": 10600
},
{
"epoch": 0.0783234392041753,
"grad_norm": 100.5,
"learning_rate": 1.987779775896343e-05,
"loss": 0.9924,
"step": 10700
},
{
"epoch": 0.07905543396309282,
"grad_norm": 0.77734375,
"learning_rate": 1.9874075025520417e-05,
"loss": 0.7545,
"step": 10800
},
{
"epoch": 0.07978742872201035,
"grad_norm": 2.0625,
"learning_rate": 1.987029679470292e-05,
"loss": 0.7715,
"step": 10900
},
{
"epoch": 0.08051942348092787,
"grad_norm": 0.33203125,
"learning_rate": 1.9866463087746544e-05,
"loss": 0.7923,
"step": 11000
},
{
"epoch": 0.0812514182398454,
"grad_norm": 0.53125,
"learning_rate": 1.986257392619869e-05,
"loss": 1.122,
"step": 11100
},
{
"epoch": 0.08198341299876293,
"grad_norm": 0.1845703125,
"learning_rate": 1.9858629331918445e-05,
"loss": 0.9972,
"step": 11200
},
{
"epoch": 0.08271540775768045,
"grad_norm": 130.0,
"learning_rate": 1.9854629327076454e-05,
"loss": 1.0698,
"step": 11300
},
{
"epoch": 0.08344740251659798,
"grad_norm": 119.5,
"learning_rate": 1.9850573934154798e-05,
"loss": 1.163,
"step": 11400
},
{
"epoch": 0.0841793972755155,
"grad_norm": 0.17578125,
"learning_rate": 1.9846463175946872e-05,
"loss": 0.8634,
"step": 11500
},
{
"epoch": 0.08491139203443303,
"grad_norm": 0.1806640625,
"learning_rate": 1.9842297075557243e-05,
"loss": 1.0536,
"step": 11600
},
{
"epoch": 0.08564338679335055,
"grad_norm": 0.12353515625,
"learning_rate": 1.9838075656401546e-05,
"loss": 0.826,
"step": 11700
},
{
"epoch": 0.08637538155226808,
"grad_norm": 3.5625,
"learning_rate": 1.9833798942206312e-05,
"loss": 0.9368,
"step": 11800
},
{
"epoch": 0.0871073763111856,
"grad_norm": 3712.0,
"learning_rate": 1.9829466957008884e-05,
"loss": 0.9388,
"step": 11900
},
{
"epoch": 0.08783937107010314,
"grad_norm": 352.0,
"learning_rate": 1.9825079725157236e-05,
"loss": 1.0504,
"step": 12000
},
{
"epoch": 0.08783937107010314,
"eval_loss": 0.770910382270813,
"eval_runtime": 27.9411,
"eval_samples_per_second": 17.895,
"eval_steps_per_second": 17.895,
"step": 12000
},
{
"epoch": 0.08857136582902067,
"grad_norm": 211.0,
"learning_rate": 1.982063727130987e-05,
"loss": 0.9014,
"step": 12100
},
{
"epoch": 0.0893033605879382,
"grad_norm": 258.0,
"learning_rate": 1.9816139620435657e-05,
"loss": 0.9101,
"step": 12200
},
{
"epoch": 0.09003535534685572,
"grad_norm": 0.765625,
"learning_rate": 1.9811586797813706e-05,
"loss": 1.0403,
"step": 12300
},
{
"epoch": 0.09076735010577325,
"grad_norm": 8.1875,
"learning_rate": 1.9806978829033218e-05,
"loss": 0.9556,
"step": 12400
},
{
"epoch": 0.09149934486469077,
"grad_norm": 0.232421875,
"learning_rate": 1.9802315739993346e-05,
"loss": 0.8063,
"step": 12500
},
{
"epoch": 0.0922313396236083,
"grad_norm": 0.396484375,
"learning_rate": 1.9797597556903048e-05,
"loss": 0.8704,
"step": 12600
},
{
"epoch": 0.09296333438252583,
"grad_norm": 94.5,
"learning_rate": 1.9792824306280934e-05,
"loss": 1.0443,
"step": 12700
},
{
"epoch": 0.09369532914144335,
"grad_norm": 0.09912109375,
"learning_rate": 1.9787996014955126e-05,
"loss": 0.9383,
"step": 12800
},
{
"epoch": 0.09442732390036088,
"grad_norm": 0.2216796875,
"learning_rate": 1.9783112710063098e-05,
"loss": 0.9516,
"step": 12900
},
{
"epoch": 0.0951593186592784,
"grad_norm": 0.1005859375,
"learning_rate": 1.9778174419051538e-05,
"loss": 0.9241,
"step": 13000
},
{
"epoch": 0.09589131341819593,
"grad_norm": 106.0,
"learning_rate": 1.977318116967618e-05,
"loss": 0.9661,
"step": 13100
},
{
"epoch": 0.09662330817711345,
"grad_norm": 0.314453125,
"learning_rate": 1.976813299000164e-05,
"loss": 1.0954,
"step": 13200
},
{
"epoch": 0.09735530293603098,
"grad_norm": 114.5,
"learning_rate": 1.9763029908401294e-05,
"loss": 0.9344,
"step": 13300
},
{
"epoch": 0.0980872976949485,
"grad_norm": 0.447265625,
"learning_rate": 1.9757871953557078e-05,
"loss": 1.0499,
"step": 13400
},
{
"epoch": 0.09881929245386603,
"grad_norm": 78.5,
"learning_rate": 1.975265915445934e-05,
"loss": 0.9215,
"step": 13500
},
{
"epoch": 0.09955128721278356,
"grad_norm": 0.1767578125,
"learning_rate": 1.97473915404067e-05,
"loss": 1.048,
"step": 13600
},
{
"epoch": 0.10028328197170108,
"grad_norm": 0.490234375,
"learning_rate": 1.9742069141005853e-05,
"loss": 1.0092,
"step": 13700
},
{
"epoch": 0.10101527673061861,
"grad_norm": 1.4296875,
"learning_rate": 1.9736691986171413e-05,
"loss": 0.964,
"step": 13800
},
{
"epoch": 0.10174727148953613,
"grad_norm": 0.2421875,
"learning_rate": 1.9731260106125757e-05,
"loss": 0.8828,
"step": 13900
},
{
"epoch": 0.10247926624845366,
"grad_norm": 88.0,
"learning_rate": 1.972577353139884e-05,
"loss": 1.0908,
"step": 14000
},
{
"epoch": 0.10247926624845366,
"eval_loss": 0.8814056515693665,
"eval_runtime": 27.9852,
"eval_samples_per_second": 17.867,
"eval_steps_per_second": 17.867,
"step": 14000
},
{
"epoch": 0.10321126100737119,
"grad_norm": 0.18359375,
"learning_rate": 1.9720232292828033e-05,
"loss": 0.9781,
"step": 14100
},
{
"epoch": 0.10394325576628871,
"grad_norm": 0.31640625,
"learning_rate": 1.971463642155794e-05,
"loss": 0.9888,
"step": 14200
},
{
"epoch": 0.10467525052520624,
"grad_norm": 0.0966796875,
"learning_rate": 1.9708985949040237e-05,
"loss": 1.0119,
"step": 14300
},
{
"epoch": 0.10540724528412376,
"grad_norm": 1.203125,
"learning_rate": 1.9703280907033475e-05,
"loss": 1.0127,
"step": 14400
},
{
"epoch": 0.10613924004304129,
"grad_norm": 0.53125,
"learning_rate": 1.9697521327602928e-05,
"loss": 1.0275,
"step": 14500
},
{
"epoch": 0.10687123480195881,
"grad_norm": 0.28515625,
"learning_rate": 1.9691707243120386e-05,
"loss": 0.869,
"step": 14600
},
{
"epoch": 0.10760322956087634,
"grad_norm": 1.3125,
"learning_rate": 1.9685838686263998e-05,
"loss": 0.8713,
"step": 14700
},
{
"epoch": 0.10833522431979387,
"grad_norm": 244.0,
"learning_rate": 1.9679915690018062e-05,
"loss": 1.0574,
"step": 14800
},
{
"epoch": 0.10906721907871139,
"grad_norm": 0.298828125,
"learning_rate": 1.9673938287672865e-05,
"loss": 0.8997,
"step": 14900
},
{
"epoch": 0.10979921383762892,
"grad_norm": 0.76171875,
"learning_rate": 1.966790651282447e-05,
"loss": 1.2234,
"step": 15000
},
{
"epoch": 0.11053120859654644,
"grad_norm": 0.2060546875,
"learning_rate": 1.9661820399374564e-05,
"loss": 0.8861,
"step": 15100
},
{
"epoch": 0.11126320335546397,
"grad_norm": 9.0,
"learning_rate": 1.9655679981530224e-05,
"loss": 0.9659,
"step": 15200
},
{
"epoch": 0.1119951981143815,
"grad_norm": 142.0,
"learning_rate": 1.964948529380375e-05,
"loss": 1.0234,
"step": 15300
},
{
"epoch": 0.11272719287329902,
"grad_norm": 0.171875,
"learning_rate": 1.964323637101247e-05,
"loss": 1.011,
"step": 15400
},
{
"epoch": 0.11345918763221656,
"grad_norm": 0.65625,
"learning_rate": 1.9636933248278545e-05,
"loss": 0.9565,
"step": 15500
},
{
"epoch": 0.11419118239113409,
"grad_norm": 76.0,
"learning_rate": 1.9630575961028765e-05,
"loss": 0.9768,
"step": 15600
},
{
"epoch": 0.11492317715005161,
"grad_norm": 242.0,
"learning_rate": 1.9624164544994343e-05,
"loss": 0.7916,
"step": 15700
},
{
"epoch": 0.11565517190896914,
"grad_norm": 160.0,
"learning_rate": 1.9617699036210737e-05,
"loss": 0.8392,
"step": 15800
},
{
"epoch": 0.11638716666788666,
"grad_norm": 0.5390625,
"learning_rate": 1.9611179471017423e-05,
"loss": 0.8403,
"step": 15900
},
{
"epoch": 0.11711916142680419,
"grad_norm": 3.453125,
"learning_rate": 1.9604605886057712e-05,
"loss": 0.7843,
"step": 16000
},
{
"epoch": 0.11711916142680419,
"eval_loss": 0.9412841796875,
"eval_runtime": 27.963,
"eval_samples_per_second": 17.881,
"eval_steps_per_second": 17.881,
"step": 16000
},
{
"epoch": 0.11785115618572171,
"grad_norm": 0.67578125,
"learning_rate": 1.9597978318278523e-05,
"loss": 1.0179,
"step": 16100
},
{
"epoch": 0.11858315094463924,
"grad_norm": 116.0,
"learning_rate": 1.9591296804930198e-05,
"loss": 0.9158,
"step": 16200
},
{
"epoch": 0.11931514570355677,
"grad_norm": 9.875,
"learning_rate": 1.958456138356627e-05,
"loss": 0.9174,
"step": 16300
},
{
"epoch": 0.12004714046247429,
"grad_norm": 1.1953125,
"learning_rate": 1.957777209204327e-05,
"loss": 0.942,
"step": 16400
},
{
"epoch": 0.12077913522139182,
"grad_norm": 288.0,
"learning_rate": 1.95709289685205e-05,
"loss": 0.8128,
"step": 16500
},
{
"epoch": 0.12151112998030934,
"grad_norm": 117.0,
"learning_rate": 1.956403205145984e-05,
"loss": 1.0152,
"step": 16600
},
{
"epoch": 0.12224312473922687,
"grad_norm": 90.0,
"learning_rate": 1.9557081379625494e-05,
"loss": 0.809,
"step": 16700
},
{
"epoch": 0.1229751194981444,
"grad_norm": 0.050537109375,
"learning_rate": 1.9550076992083818e-05,
"loss": 0.7162,
"step": 16800
},
{
"epoch": 0.12370711425706192,
"grad_norm": 0.18359375,
"learning_rate": 1.9543018928203066e-05,
"loss": 0.7201,
"step": 16900
},
{
"epoch": 0.12443910901597945,
"grad_norm": 0.1748046875,
"learning_rate": 1.9535907227653182e-05,
"loss": 1.279,
"step": 17000
},
{
"epoch": 0.12517110377489696,
"grad_norm": 0.1708984375,
"learning_rate": 1.952874193040558e-05,
"loss": 0.5654,
"step": 17100
},
{
"epoch": 0.1259030985338145,
"grad_norm": 0.0673828125,
"learning_rate": 1.9521523076732903e-05,
"loss": 0.7602,
"step": 17200
},
{
"epoch": 0.12663509329273204,
"grad_norm": 3760.0,
"learning_rate": 1.951425070720883e-05,
"loss": 0.9334,
"step": 17300
},
{
"epoch": 0.12736708805164956,
"grad_norm": 93.0,
"learning_rate": 1.9506924862707804e-05,
"loss": 1.1316,
"step": 17400
},
{
"epoch": 0.1280990828105671,
"grad_norm": 0.28125,
"learning_rate": 1.949954558440484e-05,
"loss": 1.0999,
"step": 17500
},
{
"epoch": 0.12883107756948461,
"grad_norm": 0.1806640625,
"learning_rate": 1.9492112913775273e-05,
"loss": 0.929,
"step": 17600
},
{
"epoch": 0.12956307232840214,
"grad_norm": 3904.0,
"learning_rate": 1.9484626892594525e-05,
"loss": 0.7699,
"step": 17700
},
{
"epoch": 0.13029506708731967,
"grad_norm": 0.447265625,
"learning_rate": 1.9477087562937888e-05,
"loss": 1.0148,
"step": 17800
},
{
"epoch": 0.1310270618462372,
"grad_norm": 8.25,
"learning_rate": 1.9469494967180262e-05,
"loss": 0.8446,
"step": 17900
},
{
"epoch": 0.13175905660515472,
"grad_norm": 148.0,
"learning_rate": 1.9461849147995942e-05,
"loss": 0.8187,
"step": 18000
},
{
"epoch": 0.13175905660515472,
"eval_loss": 0.8699705600738525,
"eval_runtime": 27.9351,
"eval_samples_per_second": 17.899,
"eval_steps_per_second": 17.899,
"step": 18000
}
],
"logging_steps": 100,
"max_steps": 136613,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.742241467994931e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}