| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.298368298368298, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014344629729245113, | |
| "grad_norm": 2.1563808263401123, | |
| "learning_rate": 2.1479713603818614e-06, | |
| "loss": 0.9832, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.028689259458490227, | |
| "grad_norm": 1.5376642197375787, | |
| "learning_rate": 4.5346062052505965e-06, | |
| "loss": 0.8531, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04303388918773534, | |
| "grad_norm": 0.7108211564362369, | |
| "learning_rate": 6.921241050119331e-06, | |
| "loss": 0.7191, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.05737851891698045, | |
| "grad_norm": 0.5064006283904037, | |
| "learning_rate": 9.307875894988068e-06, | |
| "loss": 0.6532, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.07172314864622557, | |
| "grad_norm": 0.5016524926299113, | |
| "learning_rate": 1.1694510739856803e-05, | |
| "loss": 0.6245, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08606777837547068, | |
| "grad_norm": 0.5064320977744877, | |
| "learning_rate": 1.4081145584725539e-05, | |
| "loss": 0.6017, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1004124081047158, | |
| "grad_norm": 0.4525315758811991, | |
| "learning_rate": 1.6467780429594274e-05, | |
| "loss": 0.5788, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1147570378339609, | |
| "grad_norm": 0.4891140282424216, | |
| "learning_rate": 1.885441527446301e-05, | |
| "loss": 0.5726, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.129101667563206, | |
| "grad_norm": 0.5816465774117703, | |
| "learning_rate": 2.1241050119331742e-05, | |
| "loss": 0.5669, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.14344629729245115, | |
| "grad_norm": 0.5416459422496235, | |
| "learning_rate": 2.3627684964200477e-05, | |
| "loss": 0.5621, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.15779092702169625, | |
| "grad_norm": 0.5501550374753971, | |
| "learning_rate": 2.6014319809069216e-05, | |
| "loss": 0.5543, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.17213555675094136, | |
| "grad_norm": 0.48897325487812016, | |
| "learning_rate": 2.840095465393795e-05, | |
| "loss": 0.5519, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1864801864801865, | |
| "grad_norm": 0.5193769497689257, | |
| "learning_rate": 3.0787589498806684e-05, | |
| "loss": 0.546, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2008248162094316, | |
| "grad_norm": 0.5100064220076413, | |
| "learning_rate": 3.3174224343675416e-05, | |
| "loss": 0.5454, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2151694459386767, | |
| "grad_norm": 0.4992592686432826, | |
| "learning_rate": 3.5560859188544155e-05, | |
| "loss": 0.5437, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2295140756679218, | |
| "grad_norm": 0.4838403686730783, | |
| "learning_rate": 3.794749403341289e-05, | |
| "loss": 0.5412, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.24385870539716695, | |
| "grad_norm": 0.45590237103117387, | |
| "learning_rate": 4.0334128878281626e-05, | |
| "loss": 0.541, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.258203335126412, | |
| "grad_norm": 0.44189108119445925, | |
| "learning_rate": 4.272076372315036e-05, | |
| "loss": 0.5328, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2725479648556572, | |
| "grad_norm": 0.4596950199469469, | |
| "learning_rate": 4.510739856801909e-05, | |
| "loss": 0.5328, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2868925945849023, | |
| "grad_norm": 0.4232582169854354, | |
| "learning_rate": 4.749403341288783e-05, | |
| "loss": 0.5331, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3012372243141474, | |
| "grad_norm": 0.47183677702470816, | |
| "learning_rate": 4.988066825775656e-05, | |
| "loss": 0.5405, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3155818540433925, | |
| "grad_norm": 0.510543623904468, | |
| "learning_rate": 5.22673031026253e-05, | |
| "loss": 0.5448, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3299264837726376, | |
| "grad_norm": 0.49518213450267645, | |
| "learning_rate": 5.465393794749404e-05, | |
| "loss": 0.5441, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3442711135018827, | |
| "grad_norm": 0.4081838992878798, | |
| "learning_rate": 5.7040572792362765e-05, | |
| "loss": 0.5342, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3586157432311278, | |
| "grad_norm": 0.45201996051478616, | |
| "learning_rate": 5.942720763723151e-05, | |
| "loss": 0.5379, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.372960372960373, | |
| "grad_norm": 0.5163583891265898, | |
| "learning_rate": 6.181384248210024e-05, | |
| "loss": 0.5352, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3873050026896181, | |
| "grad_norm": 0.43220360318869133, | |
| "learning_rate": 6.420047732696898e-05, | |
| "loss": 0.5319, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4016496324188632, | |
| "grad_norm": 0.39341356230222857, | |
| "learning_rate": 6.65871121718377e-05, | |
| "loss": 0.531, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4159942621481083, | |
| "grad_norm": 0.4843450935832954, | |
| "learning_rate": 6.897374701670645e-05, | |
| "loss": 0.5247, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4303388918773534, | |
| "grad_norm": 0.43790304963108484, | |
| "learning_rate": 7.136038186157519e-05, | |
| "loss": 0.535, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4446835216065985, | |
| "grad_norm": 0.3679883238347063, | |
| "learning_rate": 7.374701670644391e-05, | |
| "loss": 0.529, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4590281513358436, | |
| "grad_norm": 0.38496528216858267, | |
| "learning_rate": 7.613365155131266e-05, | |
| "loss": 0.5358, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.47337278106508873, | |
| "grad_norm": 0.3554015988338894, | |
| "learning_rate": 7.852028639618139e-05, | |
| "loss": 0.5366, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4877174107943339, | |
| "grad_norm": 0.3647316947863913, | |
| "learning_rate": 8.090692124105012e-05, | |
| "loss": 0.5305, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.502062040523579, | |
| "grad_norm": 0.40796953260473273, | |
| "learning_rate": 8.329355608591885e-05, | |
| "loss": 0.5361, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.516406670252824, | |
| "grad_norm": 0.345466577791704, | |
| "learning_rate": 8.56801909307876e-05, | |
| "loss": 0.5317, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5307512999820692, | |
| "grad_norm": 0.3897824320306703, | |
| "learning_rate": 8.806682577565633e-05, | |
| "loss": 0.5384, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5450959297113144, | |
| "grad_norm": 0.39750641265021325, | |
| "learning_rate": 9.045346062052506e-05, | |
| "loss": 0.5344, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5594405594405595, | |
| "grad_norm": 0.3294501292354428, | |
| "learning_rate": 9.28400954653938e-05, | |
| "loss": 0.531, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5737851891698046, | |
| "grad_norm": 0.3325704984940842, | |
| "learning_rate": 9.522673031026254e-05, | |
| "loss": 0.5282, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5881298188990497, | |
| "grad_norm": 0.37577078172283135, | |
| "learning_rate": 9.761336515513126e-05, | |
| "loss": 0.5289, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6024744486282948, | |
| "grad_norm": 0.32982285395946315, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5326, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6168190783575399, | |
| "grad_norm": 0.315570817514683, | |
| "learning_rate": 9.999826305940802e-05, | |
| "loss": 0.5276, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.631163708086785, | |
| "grad_norm": 0.32873479872603517, | |
| "learning_rate": 9.99930523583106e-05, | |
| "loss": 0.5329, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6455083378160301, | |
| "grad_norm": 0.3754610016068392, | |
| "learning_rate": 9.998436825873485e-05, | |
| "loss": 0.5339, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6598529675452752, | |
| "grad_norm": 0.2973537072730961, | |
| "learning_rate": 9.997221136403139e-05, | |
| "loss": 0.5249, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6741975972745203, | |
| "grad_norm": 0.33421087994681, | |
| "learning_rate": 9.995658251883237e-05, | |
| "loss": 0.5196, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6885422270037654, | |
| "grad_norm": 0.3120052809857615, | |
| "learning_rate": 9.993748280899279e-05, | |
| "loss": 0.5236, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7028868567330105, | |
| "grad_norm": 0.28477505127058517, | |
| "learning_rate": 9.991491356151515e-05, | |
| "loss": 0.5166, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7172314864622557, | |
| "grad_norm": 0.3282614273702267, | |
| "learning_rate": 9.988887634445711e-05, | |
| "loss": 0.5191, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7315761161915008, | |
| "grad_norm": 0.3018575266984723, | |
| "learning_rate": 9.985937296682264e-05, | |
| "loss": 0.52, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.745920745920746, | |
| "grad_norm": 0.322642800391225, | |
| "learning_rate": 9.982640547843628e-05, | |
| "loss": 0.5193, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7602653756499911, | |
| "grad_norm": 0.29334428049579575, | |
| "learning_rate": 9.978997616980083e-05, | |
| "loss": 0.5173, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7746100053792362, | |
| "grad_norm": 0.30663608653187874, | |
| "learning_rate": 9.975008757193805e-05, | |
| "loss": 0.514, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7889546351084813, | |
| "grad_norm": 0.2902575349921672, | |
| "learning_rate": 9.970674245621296e-05, | |
| "loss": 0.5173, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8032992648377264, | |
| "grad_norm": 0.2750627268123107, | |
| "learning_rate": 9.965994383414116e-05, | |
| "loss": 0.5124, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8176438945669715, | |
| "grad_norm": 0.31846444500772264, | |
| "learning_rate": 9.960969495717975e-05, | |
| "loss": 0.5105, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8319885242962166, | |
| "grad_norm": 0.2850087845127407, | |
| "learning_rate": 9.955599931650127e-05, | |
| "loss": 0.505, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8463331540254617, | |
| "grad_norm": 0.3011934822435399, | |
| "learning_rate": 9.949886064275123e-05, | |
| "loss": 0.4997, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8606777837547068, | |
| "grad_norm": 0.28048546158040877, | |
| "learning_rate": 9.943828290578892e-05, | |
| "loss": 0.5039, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8750224134839519, | |
| "grad_norm": 0.29482418586801606, | |
| "learning_rate": 9.937427031441152e-05, | |
| "loss": 0.5068, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.889367043213197, | |
| "grad_norm": 0.2868511054090736, | |
| "learning_rate": 9.93068273160618e-05, | |
| "loss": 0.5041, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.9037116729424421, | |
| "grad_norm": 0.26871712641833934, | |
| "learning_rate": 9.9235958596519e-05, | |
| "loss": 0.5031, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9180563026716873, | |
| "grad_norm": 0.27176006324351715, | |
| "learning_rate": 9.916166907957336e-05, | |
| "loss": 0.4998, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.9324009324009324, | |
| "grad_norm": 0.2916123334399884, | |
| "learning_rate": 9.908396392668397e-05, | |
| "loss": 0.5045, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9467455621301775, | |
| "grad_norm": 0.28657648009371867, | |
| "learning_rate": 9.90028485366202e-05, | |
| "loss": 0.5005, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.9610901918594227, | |
| "grad_norm": 0.2616587253845151, | |
| "learning_rate": 9.891832854508661e-05, | |
| "loss": 0.5017, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9754348215886678, | |
| "grad_norm": 0.2642546302377749, | |
| "learning_rate": 9.883040982433133e-05, | |
| "loss": 0.492, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9897794513179129, | |
| "grad_norm": 0.255936110537566, | |
| "learning_rate": 9.87390984827382e-05, | |
| "loss": 0.4934, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.002868925945849, | |
| "grad_norm": 0.3099905060480918, | |
| "learning_rate": 9.864440086440223e-05, | |
| "loss": 0.43, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.017213555675094, | |
| "grad_norm": 0.25984643430841164, | |
| "learning_rate": 9.854632354868889e-05, | |
| "loss": 0.3695, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.0315581854043392, | |
| "grad_norm": 0.28715953895875523, | |
| "learning_rate": 9.844487334977705e-05, | |
| "loss": 0.3792, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.0459028151335843, | |
| "grad_norm": 0.2649853690140653, | |
| "learning_rate": 9.834005731618543e-05, | |
| "loss": 0.3737, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.0602474448628294, | |
| "grad_norm": 0.26573177260306496, | |
| "learning_rate": 9.823188273028297e-05, | |
| "loss": 0.3771, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.0745920745920745, | |
| "grad_norm": 0.26574383245031147, | |
| "learning_rate": 9.812035710778283e-05, | |
| "loss": 0.3741, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.0889367043213196, | |
| "grad_norm": 0.28123208245100456, | |
| "learning_rate": 9.800548819722026e-05, | |
| "loss": 0.3731, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.1032813340505647, | |
| "grad_norm": 0.28349060376734364, | |
| "learning_rate": 9.78872839794142e-05, | |
| "loss": 0.3778, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.11762596377981, | |
| "grad_norm": 0.28893208285124705, | |
| "learning_rate": 9.776575266691279e-05, | |
| "loss": 0.3806, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.1319705935090552, | |
| "grad_norm": 0.2860386213893016, | |
| "learning_rate": 9.764090270342286e-05, | |
| "loss": 0.3799, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.1463152232383003, | |
| "grad_norm": 0.2460375162339276, | |
| "learning_rate": 9.751274276322316e-05, | |
| "loss": 0.3898, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.1606598529675454, | |
| "grad_norm": 0.251720247858439, | |
| "learning_rate": 9.738128175056179e-05, | |
| "loss": 0.3821, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.1750044826967905, | |
| "grad_norm": 0.25278081761302046, | |
| "learning_rate": 9.724652879903751e-05, | |
| "loss": 0.3798, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.1893491124260356, | |
| "grad_norm": 0.2599296471651565, | |
| "learning_rate": 9.71084932709652e-05, | |
| "loss": 0.3828, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.2036937421552807, | |
| "grad_norm": 0.2455565423628766, | |
| "learning_rate": 9.696718475672532e-05, | |
| "loss": 0.3743, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.2180383718845258, | |
| "grad_norm": 0.2888176539405626, | |
| "learning_rate": 9.682261307409766e-05, | |
| "loss": 0.381, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.232383001613771, | |
| "grad_norm": 0.25471629336116824, | |
| "learning_rate": 9.667478826757916e-05, | |
| "loss": 0.3832, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.246727631343016, | |
| "grad_norm": 0.2614351769400213, | |
| "learning_rate": 9.652372060768608e-05, | |
| "loss": 0.3848, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.2610722610722611, | |
| "grad_norm": 0.26412023986782596, | |
| "learning_rate": 9.63694205902405e-05, | |
| "loss": 0.3855, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.2754168908015062, | |
| "grad_norm": 0.2429603634039349, | |
| "learning_rate": 9.621189893564092e-05, | |
| "loss": 0.3819, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.2897615205307513, | |
| "grad_norm": 0.26829134565306034, | |
| "learning_rate": 9.605116658811759e-05, | |
| "loss": 0.3906, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.3041061502599964, | |
| "grad_norm": 0.27364921782590684, | |
| "learning_rate": 9.588723471497208e-05, | |
| "loss": 0.3848, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.3184507799892415, | |
| "grad_norm": 0.2515405826777948, | |
| "learning_rate": 9.572011470580136e-05, | |
| "loss": 0.3899, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.3327954097184866, | |
| "grad_norm": 0.2513235427794581, | |
| "learning_rate": 9.554981817170655e-05, | |
| "loss": 0.3912, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.3471400394477318, | |
| "grad_norm": 0.24955670786044107, | |
| "learning_rate": 9.537635694448615e-05, | |
| "loss": 0.3849, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.3614846691769769, | |
| "grad_norm": 0.24538389153554718, | |
| "learning_rate": 9.519974307581404e-05, | |
| "loss": 0.3867, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.375829298906222, | |
| "grad_norm": 0.25856641290644117, | |
| "learning_rate": 9.50199888364021e-05, | |
| "loss": 0.3899, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.390173928635467, | |
| "grad_norm": 0.26645029754281746, | |
| "learning_rate": 9.483710671514777e-05, | |
| "loss": 0.386, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.4045185583647122, | |
| "grad_norm": 0.25589965865278824, | |
| "learning_rate": 9.465110941826622e-05, | |
| "loss": 0.3856, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.4188631880939573, | |
| "grad_norm": 0.27815251555263987, | |
| "learning_rate": 9.446200986840765e-05, | |
| "loss": 0.3881, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.4332078178232024, | |
| "grad_norm": 0.26197920796578195, | |
| "learning_rate": 9.426982120375943e-05, | |
| "loss": 0.3878, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.4475524475524475, | |
| "grad_norm": 0.2606524388643148, | |
| "learning_rate": 9.407455677713328e-05, | |
| "loss": 0.3883, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.4618970772816926, | |
| "grad_norm": 0.2331343629588404, | |
| "learning_rate": 9.387623015503753e-05, | |
| "loss": 0.3848, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.4762417070109377, | |
| "grad_norm": 0.25873984889230295, | |
| "learning_rate": 9.367485511673462e-05, | |
| "loss": 0.3895, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.4905863367401828, | |
| "grad_norm": 0.2531065339256471, | |
| "learning_rate": 9.347044565328367e-05, | |
| "loss": 0.3937, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.504930966469428, | |
| "grad_norm": 0.26767841128977615, | |
| "learning_rate": 9.326301596656846e-05, | |
| "loss": 0.3894, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.519275596198673, | |
| "grad_norm": 0.2472066267101206, | |
| "learning_rate": 9.30525804683107e-05, | |
| "loss": 0.3889, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.5336202259279181, | |
| "grad_norm": 0.24916114849718174, | |
| "learning_rate": 9.283915377906875e-05, | |
| "loss": 0.3874, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.5479648556571632, | |
| "grad_norm": 0.24586564647507672, | |
| "learning_rate": 9.262275072722181e-05, | |
| "loss": 0.3899, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.5623094853864083, | |
| "grad_norm": 0.24194199620674006, | |
| "learning_rate": 9.240338634793969e-05, | |
| "loss": 0.3867, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.5766541151156535, | |
| "grad_norm": 0.24712089444412166, | |
| "learning_rate": 9.218107588213813e-05, | |
| "loss": 0.3902, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.5909987448448986, | |
| "grad_norm": 0.24987071841082312, | |
| "learning_rate": 9.195583477542009e-05, | |
| "loss": 0.3851, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.6053433745741437, | |
| "grad_norm": 0.2453499914136973, | |
| "learning_rate": 9.172767867700236e-05, | |
| "loss": 0.3906, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.6196880043033888, | |
| "grad_norm": 0.2453362805106032, | |
| "learning_rate": 9.149662343862851e-05, | |
| "loss": 0.3905, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.6340326340326339, | |
| "grad_norm": 0.23102523441076062, | |
| "learning_rate": 9.126268511346744e-05, | |
| "loss": 0.3903, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.648377263761879, | |
| "grad_norm": 0.2542501707506408, | |
| "learning_rate": 9.102587995499807e-05, | |
| "loss": 0.3953, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.6627218934911243, | |
| "grad_norm": 0.23285474972918488, | |
| "learning_rate": 9.078622441588009e-05, | |
| "loss": 0.391, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.6770665232203694, | |
| "grad_norm": 0.24565957619352327, | |
| "learning_rate": 9.054373514681085e-05, | |
| "loss": 0.3923, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.6914111529496145, | |
| "grad_norm": 0.2506739557436684, | |
| "learning_rate": 9.029842899536853e-05, | |
| "loss": 0.3909, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.7057557826788596, | |
| "grad_norm": 0.2441587365940751, | |
| "learning_rate": 9.005032300484162e-05, | |
| "loss": 0.3915, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.7201004124081047, | |
| "grad_norm": 0.26421110183322566, | |
| "learning_rate": 8.979943441304473e-05, | |
| "loss": 0.3904, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.7344450421373498, | |
| "grad_norm": 0.24194171269463752, | |
| "learning_rate": 8.954578065112107e-05, | |
| "loss": 0.3892, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.748789671866595, | |
| "grad_norm": 0.23054663152441476, | |
| "learning_rate": 8.928937934233123e-05, | |
| "loss": 0.3907, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.76313430159584, | |
| "grad_norm": 0.2369813966398001, | |
| "learning_rate": 8.903024830082887e-05, | |
| "loss": 0.3849, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.7774789313250852, | |
| "grad_norm": 0.24008220076352446, | |
| "learning_rate": 8.876840553042296e-05, | |
| "loss": 0.3904, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.7918235610543303, | |
| "grad_norm": 0.23428608617416669, | |
| "learning_rate": 8.850386922332696e-05, | |
| "loss": 0.387, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.8061681907835754, | |
| "grad_norm": 0.23331291847215246, | |
| "learning_rate": 8.823665775889486e-05, | |
| "loss": 0.3909, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.8205128205128205, | |
| "grad_norm": 0.23850242149763548, | |
| "learning_rate": 8.796678970234427e-05, | |
| "loss": 0.3833, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.8348574502420656, | |
| "grad_norm": 0.2219682422982644, | |
| "learning_rate": 8.769428380346642e-05, | |
| "loss": 0.3845, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.8492020799713107, | |
| "grad_norm": 0.22216238994388604, | |
| "learning_rate": 8.741915899532362e-05, | |
| "loss": 0.3865, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.8635467097005558, | |
| "grad_norm": 0.22824929990219722, | |
| "learning_rate": 8.714143439293376e-05, | |
| "loss": 0.3852, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.8778913394298011, | |
| "grad_norm": 0.24280880504252028, | |
| "learning_rate": 8.686112929194226e-05, | |
| "loss": 0.3861, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.8922359691590462, | |
| "grad_norm": 0.24001704017165865, | |
| "learning_rate": 8.657826316728142e-05, | |
| "loss": 0.3908, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.9065805988882913, | |
| "grad_norm": 0.22100828767921185, | |
| "learning_rate": 8.62928556718174e-05, | |
| "loss": 0.3871, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.9209252286175365, | |
| "grad_norm": 0.22202012109824457, | |
| "learning_rate": 8.600492663498477e-05, | |
| "loss": 0.3834, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.9352698583467816, | |
| "grad_norm": 0.21529127705519435, | |
| "learning_rate": 8.571449606140883e-05, | |
| "loss": 0.388, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.9496144880760267, | |
| "grad_norm": 0.23391440077905082, | |
| "learning_rate": 8.542158412951563e-05, | |
| "loss": 0.3844, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.9639591178052718, | |
| "grad_norm": 0.2331711540562185, | |
| "learning_rate": 8.512621119013013e-05, | |
| "loss": 0.393, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.9783037475345169, | |
| "grad_norm": 0.23305451446876246, | |
| "learning_rate": 8.482839776506232e-05, | |
| "loss": 0.3837, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.992648377263762, | |
| "grad_norm": 0.24517430064973736, | |
| "learning_rate": 8.452816454568124e-05, | |
| "loss": 0.3852, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.005737851891698, | |
| "grad_norm": 0.27916951560917247, | |
| "learning_rate": 8.422553239147754e-05, | |
| "loss": 0.2799, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.020082481620943, | |
| "grad_norm": 0.23593724272097047, | |
| "learning_rate": 8.392052232861411e-05, | |
| "loss": 0.201, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.034427111350188, | |
| "grad_norm": 0.23512809512185134, | |
| "learning_rate": 8.361315554846534e-05, | |
| "loss": 0.1983, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.0487717410794333, | |
| "grad_norm": 0.2286549447255431, | |
| "learning_rate": 8.330345340614471e-05, | |
| "loss": 0.1942, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.0631163708086784, | |
| "grad_norm": 0.24844506458021867, | |
| "learning_rate": 8.299143741902111e-05, | |
| "loss": 0.1943, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.0774610005379235, | |
| "grad_norm": 0.25585326293058985, | |
| "learning_rate": 8.267712926522389e-05, | |
| "loss": 0.1993, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.0918056302671686, | |
| "grad_norm": 0.2421387690266048, | |
| "learning_rate": 8.236055078213666e-05, | |
| "loss": 0.1965, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.1061502599964137, | |
| "grad_norm": 0.24707973152402415, | |
| "learning_rate": 8.204172396488013e-05, | |
| "loss": 0.1992, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.120494889725659, | |
| "grad_norm": 0.23509099311770917, | |
| "learning_rate": 8.172067096478395e-05, | |
| "loss": 0.2008, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.134839519454904, | |
| "grad_norm": 0.23605150518346998, | |
| "learning_rate": 8.139741408784764e-05, | |
| "loss": 0.2019, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.149184149184149, | |
| "grad_norm": 0.2546177191590111, | |
| "learning_rate": 8.107197579319082e-05, | |
| "loss": 0.2053, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.163528778913394, | |
| "grad_norm": 0.2353676242323245, | |
| "learning_rate": 8.074437869149288e-05, | |
| "loss": 0.204, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.1778734086426392, | |
| "grad_norm": 0.23401952893152606, | |
| "learning_rate": 8.041464554342197e-05, | |
| "loss": 0.2036, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.1922180383718843, | |
| "grad_norm": 0.23141975512545726, | |
| "learning_rate": 8.008279925805366e-05, | |
| "loss": 0.2033, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.2065626681011294, | |
| "grad_norm": 0.23587920943899052, | |
| "learning_rate": 7.974886289127927e-05, | |
| "loss": 0.2068, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.2209072978303745, | |
| "grad_norm": 0.2394814609218661, | |
| "learning_rate": 7.941285964420407e-05, | |
| "loss": 0.2049, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.23525192755962, | |
| "grad_norm": 0.2389110148718096, | |
| "learning_rate": 7.907481286153516e-05, | |
| "loss": 0.2116, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.249596557288865, | |
| "grad_norm": 0.2282395291006806, | |
| "learning_rate": 7.873474602995973e-05, | |
| "loss": 0.2088, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.2639411870181103, | |
| "grad_norm": 0.23275397540700887, | |
| "learning_rate": 7.839268277651311e-05, | |
| "loss": 0.2092, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.2782858167473554, | |
| "grad_norm": 0.22624466416184327, | |
| "learning_rate": 7.80486468669373e-05, | |
| "loss": 0.2088, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.2926304464766005, | |
| "grad_norm": 0.23126585599149074, | |
| "learning_rate": 7.770266220402977e-05, | |
| "loss": 0.2117, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.3069750762058456, | |
| "grad_norm": 0.226948134461606, | |
| "learning_rate": 7.735475282598271e-05, | |
| "loss": 0.2097, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.3213197059350907, | |
| "grad_norm": 0.22673465714008167, | |
| "learning_rate": 7.700494290471296e-05, | |
| "loss": 0.2104, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.335664335664336, | |
| "grad_norm": 0.2556824784968339, | |
| "learning_rate": 7.665325674418264e-05, | |
| "loss": 0.2136, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.350008965393581, | |
| "grad_norm": 0.25025658975825976, | |
| "learning_rate": 7.629971877871039e-05, | |
| "loss": 0.2084, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.364353595122826, | |
| "grad_norm": 0.22536490579422702, | |
| "learning_rate": 7.594435357127399e-05, | |
| "loss": 0.2089, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.378698224852071, | |
| "grad_norm": 0.2258065984765025, | |
| "learning_rate": 7.558718581180355e-05, | |
| "loss": 0.2067, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.3930428545813163, | |
| "grad_norm": 0.2464593742203822, | |
| "learning_rate": 7.522824031546629e-05, | |
| "loss": 0.2137, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.4073874843105614, | |
| "grad_norm": 0.24123071412945177, | |
| "learning_rate": 7.486754202094229e-05, | |
| "loss": 0.2115, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.4217321140398065, | |
| "grad_norm": 0.23105649429700748, | |
| "learning_rate": 7.450511598869194e-05, | |
| "loss": 0.2138, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.4360767437690516, | |
| "grad_norm": 0.22955721039077792, | |
| "learning_rate": 7.414098739921471e-05, | |
| "loss": 0.2125, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.4504213734982967, | |
| "grad_norm": 0.23154193335740872, | |
| "learning_rate": 7.377518155129973e-05, | |
| "loss": 0.2183, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.464766003227542, | |
| "grad_norm": 0.2340236121998045, | |
| "learning_rate": 7.340772386026801e-05, | |
| "loss": 0.2157, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.479110632956787, | |
| "grad_norm": 0.2250255353665983, | |
| "learning_rate": 7.303863985620676e-05, | |
| "loss": 0.2123, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.493455262686032, | |
| "grad_norm": 0.2283114308365594, | |
| "learning_rate": 7.266795518219548e-05, | |
| "loss": 0.2135, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.507799892415277, | |
| "grad_norm": 0.23546636465212323, | |
| "learning_rate": 7.22956955925245e-05, | |
| "loss": 0.214, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.5221445221445222, | |
| "grad_norm": 0.23275268765839288, | |
| "learning_rate": 7.192188695090545e-05, | |
| "loss": 0.2156, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.5364891518737673, | |
| "grad_norm": 0.2457436947556184, | |
| "learning_rate": 7.154655522867452e-05, | |
| "loss": 0.2189, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.5508337816030124, | |
| "grad_norm": 0.2385729628030818, | |
| "learning_rate": 7.116972650298782e-05, | |
| "loss": 0.2148, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.5651784113322575, | |
| "grad_norm": 0.2382827317725779, | |
| "learning_rate": 7.079142695500975e-05, | |
| "loss": 0.2127, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.5795230410615027, | |
| "grad_norm": 0.22496477508883403, | |
| "learning_rate": 7.041168286809397e-05, | |
| "loss": 0.2156, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.5938676707907478, | |
| "grad_norm": 0.2337756123669142, | |
| "learning_rate": 7.00305206259572e-05, | |
| "loss": 0.2163, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.608212300519993, | |
| "grad_norm": 0.23547675501490803, | |
| "learning_rate": 6.964796671084631e-05, | |
| "loss": 0.213, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.622556930249238, | |
| "grad_norm": 0.236949625863052, | |
| "learning_rate": 6.926404770169819e-05, | |
| "loss": 0.2108, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.636901559978483, | |
| "grad_norm": 0.22775808389184637, | |
| "learning_rate": 6.887879027229332e-05, | |
| "loss": 0.2131, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.651246189707728, | |
| "grad_norm": 0.25558095929144115, | |
| "learning_rate": 6.84922211894024e-05, | |
| "loss": 0.2146, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.6655908194369733, | |
| "grad_norm": 0.23865636643565702, | |
| "learning_rate": 6.810436731092671e-05, | |
| "loss": 0.2154, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.6799354491662184, | |
| "grad_norm": 0.23347390914436725, | |
| "learning_rate": 6.771525558403203e-05, | |
| "loss": 0.2145, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.6942800788954635, | |
| "grad_norm": 0.2311770851119529, | |
| "learning_rate": 6.73249130432765e-05, | |
| "loss": 0.2112, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.7086247086247086, | |
| "grad_norm": 0.2326246785839781, | |
| "learning_rate": 6.69333668087323e-05, | |
| "loss": 0.2133, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.7229693383539537, | |
| "grad_norm": 0.23563376415545254, | |
| "learning_rate": 6.654064408410132e-05, | |
| "loss": 0.2141, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.737313968083199, | |
| "grad_norm": 0.2298522950109398, | |
| "learning_rate": 6.614677215482527e-05, | |
| "loss": 0.2142, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.751658597812444, | |
| "grad_norm": 0.2364865163676101, | |
| "learning_rate": 6.57517783861898e-05, | |
| "loss": 0.2127, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.766003227541689, | |
| "grad_norm": 0.22837021217881728, | |
| "learning_rate": 6.535569022142335e-05, | |
| "loss": 0.2145, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.780347857270934, | |
| "grad_norm": 0.22749769763881308, | |
| "learning_rate": 6.495853517979035e-05, | |
| "loss": 0.2106, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.7946924870001792, | |
| "grad_norm": 0.21764981978938533, | |
| "learning_rate": 6.456034085467935e-05, | |
| "loss": 0.2125, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.8090371167294244, | |
| "grad_norm": 0.22774012921821585, | |
| "learning_rate": 6.416113491168581e-05, | |
| "loss": 0.213, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.8233817464586695, | |
| "grad_norm": 0.22793686074861258, | |
| "learning_rate": 6.376094508668999e-05, | |
| "loss": 0.2116, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.8377263761879146, | |
| "grad_norm": 0.24345345462191187, | |
| "learning_rate": 6.335979918392999e-05, | |
| "loss": 0.213, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.8520710059171597, | |
| "grad_norm": 0.230566718186529, | |
| "learning_rate": 6.295772507406982e-05, | |
| "loss": 0.2123, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.866415635646405, | |
| "grad_norm": 0.23922165240449358, | |
| "learning_rate": 6.255475069226326e-05, | |
| "loss": 0.211, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.88076026537565, | |
| "grad_norm": 0.22058336484670613, | |
| "learning_rate": 6.21509040362127e-05, | |
| "loss": 0.2122, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.895104895104895, | |
| "grad_norm": 0.2272702011851071, | |
| "learning_rate": 6.174621316422417e-05, | |
| "loss": 0.2147, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.90944952483414, | |
| "grad_norm": 0.23799805104509125, | |
| "learning_rate": 6.134070619325774e-05, | |
| "loss": 0.212, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.923794154563385, | |
| "grad_norm": 0.24608349983752625, | |
| "learning_rate": 6.0934411296974184e-05, | |
| "loss": 0.2122, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.9381387842926303, | |
| "grad_norm": 0.23079480496683127, | |
| "learning_rate": 6.052735670377736e-05, | |
| "loss": 0.211, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.9524834140218754, | |
| "grad_norm": 0.22680559271715478, | |
| "learning_rate": 6.0119570694853155e-05, | |
| "loss": 0.2102, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.9668280437511205, | |
| "grad_norm": 0.22760761484882805, | |
| "learning_rate": 5.97110816022044e-05, | |
| "loss": 0.2113, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.9811726734803656, | |
| "grad_norm": 0.23303799910976278, | |
| "learning_rate": 5.930191780668258e-05, | |
| "loss": 0.2088, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.9955173032096107, | |
| "grad_norm": 0.22946738031807773, | |
| "learning_rate": 5.88921077360159e-05, | |
| "loss": 0.2097, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.008606777837547, | |
| "grad_norm": 0.2697620900381124, | |
| "learning_rate": 5.848167986283421e-05, | |
| "loss": 0.1134, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.0229514075667923, | |
| "grad_norm": 0.1885938841096422, | |
| "learning_rate": 5.807066270269084e-05, | |
| "loss": 0.0763, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.0372960372960374, | |
| "grad_norm": 0.214693696805492, | |
| "learning_rate": 5.765908481208139e-05, | |
| "loss": 0.0756, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.0516406670252825, | |
| "grad_norm": 0.2339101871402584, | |
| "learning_rate": 5.724697478645963e-05, | |
| "loss": 0.0744, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.0659852967545276, | |
| "grad_norm": 0.1971755620952271, | |
| "learning_rate": 5.6834361258250844e-05, | |
| "loss": 0.072, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.0803299264837727, | |
| "grad_norm": 0.1981153430750115, | |
| "learning_rate": 5.642127289486246e-05, | |
| "loss": 0.0748, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.094674556213018, | |
| "grad_norm": 0.2172902671287561, | |
| "learning_rate": 5.600773839669237e-05, | |
| "loss": 0.0726, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.109019185942263, | |
| "grad_norm": 0.19669334061877888, | |
| "learning_rate": 5.559378649513478e-05, | |
| "loss": 0.0733, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.123363815671508, | |
| "grad_norm": 0.21027113699329436, | |
| "learning_rate": 5.517944595058413e-05, | |
| "loss": 0.0746, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.137708445400753, | |
| "grad_norm": 0.20204087893287273, | |
| "learning_rate": 5.476474555043688e-05, | |
| "loss": 0.0748, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.152053075129998, | |
| "grad_norm": 0.20638588917150788, | |
| "learning_rate": 5.4349714107091335e-05, | |
| "loss": 0.0744, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.1663977048592433, | |
| "grad_norm": 0.20367882761147596, | |
| "learning_rate": 5.393438045594595e-05, | |
| "loss": 0.0755, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.1807423345884884, | |
| "grad_norm": 0.20836979312681028, | |
| "learning_rate": 5.351877345339583e-05, | |
| "loss": 0.076, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.1950869643177335, | |
| "grad_norm": 0.19643695987807314, | |
| "learning_rate": 5.310292197482791e-05, | |
| "loss": 0.0733, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.2094315940469786, | |
| "grad_norm": 0.20621763947145422, | |
| "learning_rate": 5.268685491261472e-05, | |
| "loss": 0.075, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 3.2237762237762237, | |
| "grad_norm": 0.20777873086593704, | |
| "learning_rate": 5.227060117410702e-05, | |
| "loss": 0.0746, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.238120853505469, | |
| "grad_norm": 0.2021910407099938, | |
| "learning_rate": 5.185418967962543e-05, | |
| "loss": 0.0747, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 3.252465483234714, | |
| "grad_norm": 0.2016612434414281, | |
| "learning_rate": 5.143764936045106e-05, | |
| "loss": 0.0743, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 3.266810112963959, | |
| "grad_norm": 0.2180992659409795, | |
| "learning_rate": 5.1021009156815414e-05, | |
| "loss": 0.0744, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 3.281154742693204, | |
| "grad_norm": 0.2056058145565962, | |
| "learning_rate": 5.060429801588983e-05, | |
| "loss": 0.0744, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 3.2954993724224493, | |
| "grad_norm": 0.20164051829762908, | |
| "learning_rate": 5.018754488977409e-05, | |
| "loss": 0.0745, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.3098440021516944, | |
| "grad_norm": 0.2026538165933443, | |
| "learning_rate": 4.9770778733485065e-05, | |
| "loss": 0.074, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 3.3241886318809395, | |
| "grad_norm": 0.20427324673762595, | |
| "learning_rate": 4.935402850294494e-05, | |
| "loss": 0.0739, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 3.3385332616101846, | |
| "grad_norm": 0.20831211218540635, | |
| "learning_rate": 4.893732315296942e-05, | |
| "loss": 0.0748, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 3.3528778913394297, | |
| "grad_norm": 0.20740018500070947, | |
| "learning_rate": 4.852069163525595e-05, | |
| "loss": 0.0737, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 3.367222521068675, | |
| "grad_norm": 0.20060155886370676, | |
| "learning_rate": 4.810416289637234e-05, | |
| "loss": 0.0729, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.38156715079792, | |
| "grad_norm": 0.199826847154071, | |
| "learning_rate": 4.7687765875745574e-05, | |
| "loss": 0.0739, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 3.395911780527165, | |
| "grad_norm": 0.20063705204581495, | |
| "learning_rate": 4.727152950365117e-05, | |
| "loss": 0.0737, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 3.41025641025641, | |
| "grad_norm": 0.20947972363514977, | |
| "learning_rate": 4.685548269920312e-05, | |
| "loss": 0.0736, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 3.4246010399856552, | |
| "grad_norm": 0.2006701925989043, | |
| "learning_rate": 4.643965436834474e-05, | |
| "loss": 0.075, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 3.4389456697149003, | |
| "grad_norm": 0.20335025504735554, | |
| "learning_rate": 4.6024073401840336e-05, | |
| "loss": 0.0745, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.4532902994441455, | |
| "grad_norm": 0.2192162565442083, | |
| "learning_rate": 4.560876867326791e-05, | |
| "loss": 0.0738, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 3.4676349291733906, | |
| "grad_norm": 0.19858055523329815, | |
| "learning_rate": 4.5193769037013066e-05, | |
| "loss": 0.0732, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 3.4819795589026357, | |
| "grad_norm": 0.20485303414115183, | |
| "learning_rate": 4.477910332626438e-05, | |
| "loss": 0.0728, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 3.4963241886318808, | |
| "grad_norm": 0.19011594248287844, | |
| "learning_rate": 4.4364800351010066e-05, | |
| "loss": 0.0726, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 3.5106688183611263, | |
| "grad_norm": 0.20410603253979742, | |
| "learning_rate": 4.395088889603633e-05, | |
| "loss": 0.0736, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.5250134480903714, | |
| "grad_norm": 0.1994983957599032, | |
| "learning_rate": 4.353739771892746e-05, | |
| "loss": 0.073, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 3.5393580778196165, | |
| "grad_norm": 0.20349060401414618, | |
| "learning_rate": 4.312435554806787e-05, | |
| "loss": 0.0736, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 3.5537027075488616, | |
| "grad_norm": 0.20221336765718947, | |
| "learning_rate": 4.271179108064605e-05, | |
| "loss": 0.0713, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 3.5680473372781067, | |
| "grad_norm": 0.1920539935100462, | |
| "learning_rate": 4.229973298066083e-05, | |
| "loss": 0.0714, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 3.582391967007352, | |
| "grad_norm": 0.18514594535819984, | |
| "learning_rate": 4.188820987692981e-05, | |
| "loss": 0.0716, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.596736596736597, | |
| "grad_norm": 0.19390555703637974, | |
| "learning_rate": 4.1477250361100317e-05, | |
| "loss": 0.072, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 3.611081226465842, | |
| "grad_norm": 0.19881724163942532, | |
| "learning_rate": 4.106688298566295e-05, | |
| "loss": 0.0722, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 3.625425856195087, | |
| "grad_norm": 0.19864848134388496, | |
| "learning_rate": 4.065713626196778e-05, | |
| "loss": 0.0697, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 3.6397704859243323, | |
| "grad_norm": 0.20964033399772472, | |
| "learning_rate": 4.0248038658243515e-05, | |
| "loss": 0.0703, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 3.6541151156535774, | |
| "grad_norm": 0.1887224816930325, | |
| "learning_rate": 3.983961859761946e-05, | |
| "loss": 0.071, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 3.6684597453828225, | |
| "grad_norm": 0.1939910437911645, | |
| "learning_rate": 3.9431904456150914e-05, | |
| "loss": 0.0685, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 3.6828043751120676, | |
| "grad_norm": 0.1905566250106664, | |
| "learning_rate": 3.902492456084757e-05, | |
| "loss": 0.0709, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 3.6971490048413127, | |
| "grad_norm": 0.1954219594857734, | |
| "learning_rate": 3.861870718770545e-05, | |
| "loss": 0.0691, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 3.711493634570558, | |
| "grad_norm": 0.20129771340548336, | |
| "learning_rate": 3.821328055974231e-05, | |
| "loss": 0.0688, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 3.725838264299803, | |
| "grad_norm": 0.19424451985885532, | |
| "learning_rate": 3.780867284503685e-05, | |
| "loss": 0.0705, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.740182894029048, | |
| "grad_norm": 0.19307288848206286, | |
| "learning_rate": 3.7404912154771626e-05, | |
| "loss": 0.069, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 3.754527523758293, | |
| "grad_norm": 0.20224114631498458, | |
| "learning_rate": 3.7002026541279905e-05, | |
| "loss": 0.069, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 3.7688721534875382, | |
| "grad_norm": 0.19645086260070405, | |
| "learning_rate": 3.660004399609675e-05, | |
| "loss": 0.0693, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 3.7832167832167833, | |
| "grad_norm": 0.2009057118393354, | |
| "learning_rate": 3.619899244801414e-05, | |
| "loss": 0.0695, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 3.7975614129460284, | |
| "grad_norm": 0.20154345565922616, | |
| "learning_rate": 3.5798899761140626e-05, | |
| "loss": 0.0688, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 3.8119060426752736, | |
| "grad_norm": 0.19819908788727933, | |
| "learning_rate": 3.5399793732965324e-05, | |
| "loss": 0.0703, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 3.8262506724045187, | |
| "grad_norm": 0.19579772630914064, | |
| "learning_rate": 3.500170209242671e-05, | |
| "loss": 0.0673, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 3.8405953021337638, | |
| "grad_norm": 0.1930709905078437, | |
| "learning_rate": 3.460465249798592e-05, | |
| "loss": 0.068, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 3.854939931863009, | |
| "grad_norm": 0.19375769656837338, | |
| "learning_rate": 3.420867253570529e-05, | |
| "loss": 0.0668, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 3.869284561592254, | |
| "grad_norm": 0.19590535607906298, | |
| "learning_rate": 3.381378971733161e-05, | |
| "loss": 0.0658, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.883629191321499, | |
| "grad_norm": 0.19485732453673113, | |
| "learning_rate": 3.342003147838475e-05, | |
| "loss": 0.0671, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 3.897973821050744, | |
| "grad_norm": 0.19120048275674587, | |
| "learning_rate": 3.302742517625144e-05, | |
| "loss": 0.0665, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 3.9123184507799893, | |
| "grad_norm": 0.19464302345990753, | |
| "learning_rate": 3.2635998088284596e-05, | |
| "loss": 0.0662, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 3.9266630805092344, | |
| "grad_norm": 0.20017821890333443, | |
| "learning_rate": 3.224577740990814e-05, | |
| "loss": 0.0655, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 3.9410077102384795, | |
| "grad_norm": 0.18866754533216776, | |
| "learning_rate": 3.185679025272753e-05, | |
| "loss": 0.0663, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 3.9553523399677246, | |
| "grad_norm": 0.19243520237850759, | |
| "learning_rate": 3.146906364264606e-05, | |
| "loss": 0.0657, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 3.9696969696969697, | |
| "grad_norm": 0.1924186499329835, | |
| "learning_rate": 3.108262451798724e-05, | |
| "loss": 0.0651, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 3.984041599426215, | |
| "grad_norm": 0.21148279808210507, | |
| "learning_rate": 3.069749972762316e-05, | |
| "loss": 0.0648, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 3.99838622915546, | |
| "grad_norm": 0.19991640585361364, | |
| "learning_rate": 3.0313716029109064e-05, | |
| "loss": 0.0645, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.011475703783396, | |
| "grad_norm": 0.13064534629220334, | |
| "learning_rate": 2.993130008682436e-05, | |
| "loss": 0.0228, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.025820333512641, | |
| "grad_norm": 0.14874535281957305, | |
| "learning_rate": 2.955027847011993e-05, | |
| "loss": 0.0176, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.040164963241886, | |
| "grad_norm": 0.14336180228683498, | |
| "learning_rate": 2.917067765147229e-05, | |
| "loss": 0.0176, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 4.054509592971131, | |
| "grad_norm": 0.12559441494646076, | |
| "learning_rate": 2.8792524004644283e-05, | |
| "loss": 0.0167, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 4.068854222700376, | |
| "grad_norm": 0.12484448147694403, | |
| "learning_rate": 2.8415843802852672e-05, | |
| "loss": 0.0167, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 4.083198852429621, | |
| "grad_norm": 0.1337296091314726, | |
| "learning_rate": 2.8040663216942752e-05, | |
| "loss": 0.0169, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 4.0975434821588665, | |
| "grad_norm": 0.12242456577697475, | |
| "learning_rate": 2.7667008313570076e-05, | |
| "loss": 0.0161, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 4.111888111888112, | |
| "grad_norm": 0.13243095768870966, | |
| "learning_rate": 2.729490505338943e-05, | |
| "loss": 0.0161, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 4.126232741617357, | |
| "grad_norm": 0.12277718816926177, | |
| "learning_rate": 2.692437928925109e-05, | |
| "loss": 0.0157, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 4.140577371346602, | |
| "grad_norm": 0.137540991678628, | |
| "learning_rate": 2.655545676440464e-05, | |
| "loss": 0.0159, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 4.154922001075847, | |
| "grad_norm": 0.13131712471544715, | |
| "learning_rate": 2.6188163110710435e-05, | |
| "loss": 0.0161, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 4.169266630805092, | |
| "grad_norm": 0.13640835620647865, | |
| "learning_rate": 2.582252384685874e-05, | |
| "loss": 0.0164, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 4.183611260534337, | |
| "grad_norm": 0.12543818653508698, | |
| "learning_rate": 2.5458564376596732e-05, | |
| "loss": 0.0157, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 4.197955890263582, | |
| "grad_norm": 0.11736167137678152, | |
| "learning_rate": 2.509630998696349e-05, | |
| "loss": 0.0154, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 4.212300519992827, | |
| "grad_norm": 0.1245551329001544, | |
| "learning_rate": 2.473578584653321e-05, | |
| "loss": 0.0152, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 4.2266451497220725, | |
| "grad_norm": 0.12763631881000323, | |
| "learning_rate": 2.4377017003666413e-05, | |
| "loss": 0.0155, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 4.240989779451318, | |
| "grad_norm": 0.13313595943206588, | |
| "learning_rate": 2.4020028384769795e-05, | |
| "loss": 0.015, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 4.255334409180563, | |
| "grad_norm": 0.125069284406997, | |
| "learning_rate": 2.366484479256425e-05, | |
| "loss": 0.015, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 4.269679038909808, | |
| "grad_norm": 0.13167131954772826, | |
| "learning_rate": 2.3311490904361738e-05, | |
| "loss": 0.0159, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 4.284023668639053, | |
| "grad_norm": 0.11344149792986571, | |
| "learning_rate": 2.295999127035071e-05, | |
| "loss": 0.0147, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 4.298368298368298, | |
| "grad_norm": 0.1299095136285245, | |
| "learning_rate": 2.26103703118905e-05, | |
| "loss": 0.015, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4188, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 940152769216512.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |