| { | |
| "best_global_step": 68838, | |
| "best_metric": 0.8680973729027178, | |
| "best_model_checkpoint": "./results/checkpoint-68838", | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 68838, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.021790290246666087, | |
| "grad_norm": 21.07741355895996, | |
| "learning_rate": 1.985502193555885e-05, | |
| "loss": 0.656, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.043580580493332174, | |
| "grad_norm": 24.045154571533203, | |
| "learning_rate": 1.9709753333914408e-05, | |
| "loss": 0.5404, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06537087073999825, | |
| "grad_norm": 11.716601371765137, | |
| "learning_rate": 1.956448473226997e-05, | |
| "loss": 0.5397, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.08716116098666435, | |
| "grad_norm": 12.603126525878906, | |
| "learning_rate": 1.9419216130625527e-05, | |
| "loss": 0.5234, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.10895145123333043, | |
| "grad_norm": 7.865340232849121, | |
| "learning_rate": 1.927394752898109e-05, | |
| "loss": 0.5091, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1307417414799965, | |
| "grad_norm": 48.59724807739258, | |
| "learning_rate": 1.912867892733665e-05, | |
| "loss": 0.5202, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.1525320317266626, | |
| "grad_norm": 40.003543853759766, | |
| "learning_rate": 1.8983410325692207e-05, | |
| "loss": 0.4991, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.1743223219733287, | |
| "grad_norm": 36.06184768676758, | |
| "learning_rate": 1.8838141724047765e-05, | |
| "loss": 0.4751, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.19611261221999476, | |
| "grad_norm": 8.7717924118042, | |
| "learning_rate": 1.8692873122403327e-05, | |
| "loss": 0.4807, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.21790290246666086, | |
| "grad_norm": 31.210308074951172, | |
| "learning_rate": 1.8547604520758884e-05, | |
| "loss": 0.4816, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.23969319271332695, | |
| "grad_norm": 32.92966079711914, | |
| "learning_rate": 1.8402335919114446e-05, | |
| "loss": 0.4734, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.261483482959993, | |
| "grad_norm": 16.130868911743164, | |
| "learning_rate": 1.8257067317470003e-05, | |
| "loss": 0.4671, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.2832737732066591, | |
| "grad_norm": 24.621110916137695, | |
| "learning_rate": 1.8111798715825565e-05, | |
| "loss": 0.4803, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.3050640634533252, | |
| "grad_norm": 38.10552978515625, | |
| "learning_rate": 1.7966530114181122e-05, | |
| "loss": 0.4649, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.3268543536999913, | |
| "grad_norm": 38.748111724853516, | |
| "learning_rate": 1.7821261512536684e-05, | |
| "loss": 0.4609, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.3486446439466574, | |
| "grad_norm": 22.89567756652832, | |
| "learning_rate": 1.767599291089224e-05, | |
| "loss": 0.4446, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.37043493419332346, | |
| "grad_norm": 19.919641494750977, | |
| "learning_rate": 1.75307243092478e-05, | |
| "loss": 0.4438, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.39222522443998953, | |
| "grad_norm": 7.110844135284424, | |
| "learning_rate": 1.738545570760336e-05, | |
| "loss": 0.4525, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.41401551468665565, | |
| "grad_norm": 11.265097618103027, | |
| "learning_rate": 1.7240187105958918e-05, | |
| "loss": 0.4518, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.4358058049333217, | |
| "grad_norm": 16.030864715576172, | |
| "learning_rate": 1.709491850431448e-05, | |
| "loss": 0.4461, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.4575960951799878, | |
| "grad_norm": 20.574634552001953, | |
| "learning_rate": 1.6949649902670037e-05, | |
| "loss": 0.4485, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.4793863854266539, | |
| "grad_norm": 12.368102073669434, | |
| "learning_rate": 1.68043813010256e-05, | |
| "loss": 0.4266, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.5011766756733199, | |
| "grad_norm": 59.58558654785156, | |
| "learning_rate": 1.6659112699381156e-05, | |
| "loss": 0.4512, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.522966965919986, | |
| "grad_norm": 22.04444694519043, | |
| "learning_rate": 1.6513844097736717e-05, | |
| "loss": 0.4362, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5447572561666522, | |
| "grad_norm": 27.14982032775879, | |
| "learning_rate": 1.6368575496092275e-05, | |
| "loss": 0.4315, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.5665475464133182, | |
| "grad_norm": 12.58919620513916, | |
| "learning_rate": 1.6223306894447833e-05, | |
| "loss": 0.4328, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5883378366599843, | |
| "grad_norm": 39.1475944519043, | |
| "learning_rate": 1.6078038292803394e-05, | |
| "loss": 0.4366, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.6101281269066504, | |
| "grad_norm": 17.51034927368164, | |
| "learning_rate": 1.5932769691158952e-05, | |
| "loss": 0.4175, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.6319184171533165, | |
| "grad_norm": 7.703125476837158, | |
| "learning_rate": 1.5787501089514513e-05, | |
| "loss": 0.4039, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.6537087073999825, | |
| "grad_norm": 7.714233875274658, | |
| "learning_rate": 1.564223248787007e-05, | |
| "loss": 0.4153, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.6754989976466487, | |
| "grad_norm": 15.32762336730957, | |
| "learning_rate": 1.5496963886225632e-05, | |
| "loss": 0.4222, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.6972892878933148, | |
| "grad_norm": 11.171319007873535, | |
| "learning_rate": 1.535169528458119e-05, | |
| "loss": 0.4087, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.7190795781399808, | |
| "grad_norm": 17.960086822509766, | |
| "learning_rate": 1.5206426682936753e-05, | |
| "loss": 0.4327, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.7408698683866469, | |
| "grad_norm": 15.726662635803223, | |
| "learning_rate": 1.506115808129231e-05, | |
| "loss": 0.4146, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.762660158633313, | |
| "grad_norm": 19.11526107788086, | |
| "learning_rate": 1.4915889479647869e-05, | |
| "loss": 0.4212, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.7844504488799791, | |
| "grad_norm": 13.394328117370605, | |
| "learning_rate": 1.477062087800343e-05, | |
| "loss": 0.4239, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.8062407391266452, | |
| "grad_norm": 23.153011322021484, | |
| "learning_rate": 1.4625352276358988e-05, | |
| "loss": 0.4166, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.8280310293733113, | |
| "grad_norm": 35.252017974853516, | |
| "learning_rate": 1.4480083674714549e-05, | |
| "loss": 0.4133, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.8498213196199773, | |
| "grad_norm": 26.121305465698242, | |
| "learning_rate": 1.4334815073070107e-05, | |
| "loss": 0.4154, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.8716116098666434, | |
| "grad_norm": 15.44404411315918, | |
| "learning_rate": 1.4189546471425668e-05, | |
| "loss": 0.4053, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.8934019001133096, | |
| "grad_norm": 33.22347640991211, | |
| "learning_rate": 1.4044277869781226e-05, | |
| "loss": 0.404, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.9151921903599756, | |
| "grad_norm": 8.3173828125, | |
| "learning_rate": 1.3899009268136785e-05, | |
| "loss": 0.3904, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.9369824806066417, | |
| "grad_norm": 17.548837661743164, | |
| "learning_rate": 1.3753740666492346e-05, | |
| "loss": 0.3899, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.9587727708533078, | |
| "grad_norm": 63.516632080078125, | |
| "learning_rate": 1.3608472064847904e-05, | |
| "loss": 0.3922, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.9805630610999738, | |
| "grad_norm": 15.121821403503418, | |
| "learning_rate": 1.3463203463203465e-05, | |
| "loss": 0.3987, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.8653213328393663, | |
| "eval_f1": 0.8643315347438766, | |
| "eval_loss": 0.38306593894958496, | |
| "eval_runtime": 101.9836, | |
| "eval_samples_per_second": 449.945, | |
| "eval_steps_per_second": 28.122, | |
| "step": 22946 | |
| }, | |
| { | |
| "epoch": 1.0023533513466398, | |
| "grad_norm": 15.860408782958984, | |
| "learning_rate": 1.3317934861559023e-05, | |
| "loss": 0.3836, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.024143641593306, | |
| "grad_norm": 5.384024143218994, | |
| "learning_rate": 1.3172666259914584e-05, | |
| "loss": 0.3025, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.045933931839972, | |
| "grad_norm": 16.427749633789062, | |
| "learning_rate": 1.3027397658270142e-05, | |
| "loss": 0.2983, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.067724222086638, | |
| "grad_norm": 5.875234603881836, | |
| "learning_rate": 1.2882129056625703e-05, | |
| "loss": 0.2798, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.0895145123333043, | |
| "grad_norm": 25.95697021484375, | |
| "learning_rate": 1.2736860454981261e-05, | |
| "loss": 0.2847, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.1113048025799703, | |
| "grad_norm": 32.1990852355957, | |
| "learning_rate": 1.2591591853336819e-05, | |
| "loss": 0.289, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.1330950928266366, | |
| "grad_norm": 54.05229949951172, | |
| "learning_rate": 1.244632325169238e-05, | |
| "loss": 0.3002, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.1548853830733026, | |
| "grad_norm": 19.801795959472656, | |
| "learning_rate": 1.2301054650047938e-05, | |
| "loss": 0.2849, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.1766756733199686, | |
| "grad_norm": 25.528541564941406, | |
| "learning_rate": 1.21557860484035e-05, | |
| "loss": 0.2865, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.1984659635666346, | |
| "grad_norm": 16.505584716796875, | |
| "learning_rate": 1.2010517446759057e-05, | |
| "loss": 0.2843, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.2202562538133008, | |
| "grad_norm": 27.86761474609375, | |
| "learning_rate": 1.1865248845114618e-05, | |
| "loss": 0.3044, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.2420465440599668, | |
| "grad_norm": 14.01541519165039, | |
| "learning_rate": 1.1719980243470178e-05, | |
| "loss": 0.3004, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.263836834306633, | |
| "grad_norm": 15.49959659576416, | |
| "learning_rate": 1.1574711641825737e-05, | |
| "loss": 0.283, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.285627124553299, | |
| "grad_norm": 13.934378623962402, | |
| "learning_rate": 1.1429443040181297e-05, | |
| "loss": 0.2925, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.307417414799965, | |
| "grad_norm": 34.908287048339844, | |
| "learning_rate": 1.1284174438536855e-05, | |
| "loss": 0.2888, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.329207705046631, | |
| "grad_norm": 1.3758283853530884, | |
| "learning_rate": 1.1138905836892416e-05, | |
| "loss": 0.291, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.3509979952932973, | |
| "grad_norm": 42.13234329223633, | |
| "learning_rate": 1.0993637235247974e-05, | |
| "loss": 0.2792, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.3727882855399633, | |
| "grad_norm": 17.52948570251465, | |
| "learning_rate": 1.0848368633603535e-05, | |
| "loss": 0.2941, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.3945785757866296, | |
| "grad_norm": 17.316434860229492, | |
| "learning_rate": 1.0703100031959093e-05, | |
| "loss": 0.2854, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.4163688660332956, | |
| "grad_norm": 5.664457321166992, | |
| "learning_rate": 1.0557831430314654e-05, | |
| "loss": 0.2919, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.4381591562799616, | |
| "grad_norm": 4.736696720123291, | |
| "learning_rate": 1.0412562828670212e-05, | |
| "loss": 0.3005, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.4599494465266276, | |
| "grad_norm": 32.045555114746094, | |
| "learning_rate": 1.0267294227025773e-05, | |
| "loss": 0.2932, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.4817397367732938, | |
| "grad_norm": 12.678611755371094, | |
| "learning_rate": 1.012202562538133e-05, | |
| "loss": 0.3073, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.5035300270199599, | |
| "grad_norm": 73.06298828125, | |
| "learning_rate": 9.97675702373689e-06, | |
| "loss": 0.2799, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.525320317266626, | |
| "grad_norm": 9.1071138381958, | |
| "learning_rate": 9.83148842209245e-06, | |
| "loss": 0.2823, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.547110607513292, | |
| "grad_norm": 36.0024299621582, | |
| "learning_rate": 9.68621982044801e-06, | |
| "loss": 0.2923, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.5689008977599581, | |
| "grad_norm": 17.55816650390625, | |
| "learning_rate": 9.540951218803569e-06, | |
| "loss": 0.2886, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.5906911880066241, | |
| "grad_norm": 30.378305435180664, | |
| "learning_rate": 9.395682617159128e-06, | |
| "loss": 0.2917, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.6124814782532904, | |
| "grad_norm": 10.106338500976562, | |
| "learning_rate": 9.250414015514688e-06, | |
| "loss": 0.2907, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.6342717684999564, | |
| "grad_norm": 20.659482955932617, | |
| "learning_rate": 9.105145413870247e-06, | |
| "loss": 0.2842, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.6560620587466226, | |
| "grad_norm": 18.197683334350586, | |
| "learning_rate": 8.959876812225807e-06, | |
| "loss": 0.2912, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.6778523489932886, | |
| "grad_norm": 31.622310638427734, | |
| "learning_rate": 8.814608210581366e-06, | |
| "loss": 0.302, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.6996426392399546, | |
| "grad_norm": 7.478927135467529, | |
| "learning_rate": 8.669339608936926e-06, | |
| "loss": 0.2962, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.7214329294866206, | |
| "grad_norm": 1.150900959968567, | |
| "learning_rate": 8.524071007292485e-06, | |
| "loss": 0.2905, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.7432232197332869, | |
| "grad_norm": 24.470115661621094, | |
| "learning_rate": 8.378802405648045e-06, | |
| "loss": 0.2814, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.765013509979953, | |
| "grad_norm": 14.964280128479004, | |
| "learning_rate": 8.233533804003603e-06, | |
| "loss": 0.2935, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.786803800226619, | |
| "grad_norm": 22.0378360748291, | |
| "learning_rate": 8.088265202359162e-06, | |
| "loss": 0.2777, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.8085940904732851, | |
| "grad_norm": 1.5936992168426514, | |
| "learning_rate": 7.942996600714722e-06, | |
| "loss": 0.2557, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.8303843807199511, | |
| "grad_norm": 38.885902404785156, | |
| "learning_rate": 7.797727999070281e-06, | |
| "loss": 0.2877, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.8521746709666171, | |
| "grad_norm": 21.789640426635742, | |
| "learning_rate": 7.65245939742584e-06, | |
| "loss": 0.2938, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.8739649612132834, | |
| "grad_norm": 31.65484046936035, | |
| "learning_rate": 7.5071907957814e-06, | |
| "loss": 0.281, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.8957552514599496, | |
| "grad_norm": 41.5484504699707, | |
| "learning_rate": 7.36192219413696e-06, | |
| "loss": 0.2819, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.9175455417066156, | |
| "grad_norm": 44.10054016113281, | |
| "learning_rate": 7.216653592492519e-06, | |
| "loss": 0.2727, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.9393358319532816, | |
| "grad_norm": 19.873380661010742, | |
| "learning_rate": 7.071384990848079e-06, | |
| "loss": 0.2861, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.9611261221999476, | |
| "grad_norm": 39.448081970214844, | |
| "learning_rate": 6.926116389203637e-06, | |
| "loss": 0.2726, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.9829164124466137, | |
| "grad_norm": 4.273163318634033, | |
| "learning_rate": 6.780847787559197e-06, | |
| "loss": 0.2792, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.8681107939067709, | |
| "eval_f1": 0.8675808818446633, | |
| "eval_loss": 0.40907520055770874, | |
| "eval_runtime": 101.153, | |
| "eval_samples_per_second": 453.639, | |
| "eval_steps_per_second": 28.353, | |
| "step": 45892 | |
| }, | |
| { | |
| "epoch": 2.0047067026932797, | |
| "grad_norm": 12.477510452270508, | |
| "learning_rate": 6.6355791859147564e-06, | |
| "loss": 0.2583, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.026496992939946, | |
| "grad_norm": 0.07342194020748138, | |
| "learning_rate": 6.490310584270316e-06, | |
| "loss": 0.1871, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 2.048287283186612, | |
| "grad_norm": 69.57736206054688, | |
| "learning_rate": 6.3450419826258755e-06, | |
| "loss": 0.1979, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.070077573433278, | |
| "grad_norm": 7.542209148406982, | |
| "learning_rate": 6.199773380981435e-06, | |
| "loss": 0.193, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 2.091867863679944, | |
| "grad_norm": 0.9572351574897766, | |
| "learning_rate": 6.0545047793369945e-06, | |
| "loss": 0.1914, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.11365815392661, | |
| "grad_norm": 77.32627868652344, | |
| "learning_rate": 5.909236177692555e-06, | |
| "loss": 0.1896, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 2.135448444173276, | |
| "grad_norm": 2.729917049407959, | |
| "learning_rate": 5.763967576048114e-06, | |
| "loss": 0.1963, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.1572387344199426, | |
| "grad_norm": 2.4020707607269287, | |
| "learning_rate": 5.618698974403672e-06, | |
| "loss": 0.1812, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.1790290246666086, | |
| "grad_norm": 1.194051742553711, | |
| "learning_rate": 5.473430372759232e-06, | |
| "loss": 0.1933, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.2008193149132746, | |
| "grad_norm": 3.58627986907959, | |
| "learning_rate": 5.328161771114791e-06, | |
| "loss": 0.1738, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.2226096051599407, | |
| "grad_norm": 69.77458190917969, | |
| "learning_rate": 5.182893169470351e-06, | |
| "loss": 0.1827, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.2443998954066067, | |
| "grad_norm": 12.781591415405273, | |
| "learning_rate": 5.03762456782591e-06, | |
| "loss": 0.1799, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 2.266190185653273, | |
| "grad_norm": 23.71147346496582, | |
| "learning_rate": 4.8923559661814705e-06, | |
| "loss": 0.2041, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.287980475899939, | |
| "grad_norm": 24.240156173706055, | |
| "learning_rate": 4.747087364537029e-06, | |
| "loss": 0.1913, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 2.309770766146605, | |
| "grad_norm": 27.50016975402832, | |
| "learning_rate": 4.601818762892589e-06, | |
| "loss": 0.1809, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.331561056393271, | |
| "grad_norm": 0.6681903004646301, | |
| "learning_rate": 4.456550161248148e-06, | |
| "loss": 0.1896, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 2.353351346639937, | |
| "grad_norm": 4.321633338928223, | |
| "learning_rate": 4.311281559603708e-06, | |
| "loss": 0.1975, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.375141636886603, | |
| "grad_norm": 57.36803436279297, | |
| "learning_rate": 4.166012957959267e-06, | |
| "loss": 0.1911, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.396931927133269, | |
| "grad_norm": 0.14693371951580048, | |
| "learning_rate": 4.020744356314826e-06, | |
| "loss": 0.1979, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.4187222173799356, | |
| "grad_norm": 66.72421264648438, | |
| "learning_rate": 3.875475754670385e-06, | |
| "loss": 0.1894, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.4405125076266017, | |
| "grad_norm": 40.616798400878906, | |
| "learning_rate": 3.7302071530259454e-06, | |
| "loss": 0.1932, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.4623027978732677, | |
| "grad_norm": 14.315664291381836, | |
| "learning_rate": 3.584938551381505e-06, | |
| "loss": 0.1989, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.4840930881199337, | |
| "grad_norm": 3.2014641761779785, | |
| "learning_rate": 3.439669949737064e-06, | |
| "loss": 0.1854, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.5058833783665997, | |
| "grad_norm": 43.74154281616211, | |
| "learning_rate": 3.2944013480926235e-06, | |
| "loss": 0.191, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.527673668613266, | |
| "grad_norm": 12.705315589904785, | |
| "learning_rate": 3.149132746448183e-06, | |
| "loss": 0.1838, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.549463958859932, | |
| "grad_norm": 25.88142967224121, | |
| "learning_rate": 3.0038641448037425e-06, | |
| "loss": 0.1824, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.571254249106598, | |
| "grad_norm": 2.00431752204895, | |
| "learning_rate": 2.8585955431593016e-06, | |
| "loss": 0.2008, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.593044539353264, | |
| "grad_norm": 14.074618339538574, | |
| "learning_rate": 2.713326941514861e-06, | |
| "loss": 0.1771, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.61483482959993, | |
| "grad_norm": 24.28592300415039, | |
| "learning_rate": 2.5680583398704206e-06, | |
| "loss": 0.1867, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 2.636625119846596, | |
| "grad_norm": 46.3328742980957, | |
| "learning_rate": 2.42278973822598e-06, | |
| "loss": 0.186, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 2.658415410093262, | |
| "grad_norm": 15.886663436889648, | |
| "learning_rate": 2.2775211365815396e-06, | |
| "loss": 0.1874, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 2.6802057003399287, | |
| "grad_norm": 43.348026275634766, | |
| "learning_rate": 2.132252534937099e-06, | |
| "loss": 0.2014, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 2.7019959905865947, | |
| "grad_norm": 4.417407035827637, | |
| "learning_rate": 1.986983933292658e-06, | |
| "loss": 0.1874, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 2.7237862808332607, | |
| "grad_norm": 13.068084716796875, | |
| "learning_rate": 1.841715331648218e-06, | |
| "loss": 0.1954, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 2.7455765710799267, | |
| "grad_norm": 17.85508155822754, | |
| "learning_rate": 1.696446730003777e-06, | |
| "loss": 0.1803, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 2.7673668613265927, | |
| "grad_norm": 80.31468963623047, | |
| "learning_rate": 1.5511781283593367e-06, | |
| "loss": 0.1851, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 2.789157151573259, | |
| "grad_norm": 28.26786994934082, | |
| "learning_rate": 1.4059095267148958e-06, | |
| "loss": 0.1861, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.810947441819925, | |
| "grad_norm": 10.325495719909668, | |
| "learning_rate": 1.2606409250704555e-06, | |
| "loss": 0.187, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 2.832737732066591, | |
| "grad_norm": 8.102193832397461, | |
| "learning_rate": 1.1153723234260148e-06, | |
| "loss": 0.1712, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 2.854528022313257, | |
| "grad_norm": 36.713531494140625, | |
| "learning_rate": 9.701037217815743e-07, | |
| "loss": 0.1864, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 2.876318312559923, | |
| "grad_norm": 6.2764105796813965, | |
| "learning_rate": 8.248351201371336e-07, | |
| "loss": 0.1929, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 2.898108602806589, | |
| "grad_norm": 10.849760055541992, | |
| "learning_rate": 6.79566518492693e-07, | |
| "loss": 0.1925, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 2.9198988930532552, | |
| "grad_norm": 30.166927337646484, | |
| "learning_rate": 5.342979168482524e-07, | |
| "loss": 0.1923, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 2.9416891832999217, | |
| "grad_norm": 0.4179909825325012, | |
| "learning_rate": 3.890293152038119e-07, | |
| "loss": 0.1784, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 2.9634794735465877, | |
| "grad_norm": 1.3166455030441284, | |
| "learning_rate": 2.437607135593713e-07, | |
| "loss": 0.1518, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.9852697637932537, | |
| "grad_norm": 82.50801086425781, | |
| "learning_rate": 9.849211191493072e-08, | |
| "loss": 0.1817, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.8686120251923203, | |
| "eval_f1": 0.8680973729027178, | |
| "eval_loss": 0.5957307815551758, | |
| "eval_runtime": 101.1287, | |
| "eval_samples_per_second": 453.749, | |
| "eval_steps_per_second": 28.36, | |
| "step": 68838 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 68838, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.244734402325146e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |