| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 482, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02079002079002079, |
| "grad_norm": 1.1426271200180054, |
| "learning_rate": 1.9672131147540985e-06, |
| "loss": 1.3841, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04158004158004158, |
| "grad_norm": 0.8232327103614807, |
| "learning_rate": 4.426229508196722e-06, |
| "loss": 1.317, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.062370062370062374, |
| "grad_norm": 0.6994602680206299, |
| "learning_rate": 6.885245901639345e-06, |
| "loss": 1.3209, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08316008316008316, |
| "grad_norm": 0.5370446443557739, |
| "learning_rate": 9.344262295081968e-06, |
| "loss": 1.2832, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10395010395010396, |
| "grad_norm": 0.5207232236862183, |
| "learning_rate": 1.180327868852459e-05, |
| "loss": 1.2765, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.12474012474012475, |
| "grad_norm": 0.563996434211731, |
| "learning_rate": 1.4262295081967213e-05, |
| "loss": 1.2306, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.14553014553014554, |
| "grad_norm": 0.4489838778972626, |
| "learning_rate": 1.6721311475409834e-05, |
| "loss": 1.2378, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.16632016632016633, |
| "grad_norm": 0.44133999943733215, |
| "learning_rate": 1.9180327868852462e-05, |
| "loss": 1.1669, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.18711018711018712, |
| "grad_norm": 0.43441346287727356, |
| "learning_rate": 2.1639344262295084e-05, |
| "loss": 1.2258, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.2079002079002079, |
| "grad_norm": 0.45760205388069153, |
| "learning_rate": 2.4098360655737705e-05, |
| "loss": 1.2327, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2286902286902287, |
| "grad_norm": 0.42863017320632935, |
| "learning_rate": 2.6557377049180327e-05, |
| "loss": 1.1611, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.2494802494802495, |
| "grad_norm": 0.42736756801605225, |
| "learning_rate": 2.901639344262295e-05, |
| "loss": 1.1748, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2702702702702703, |
| "grad_norm": 0.4686262607574463, |
| "learning_rate": 2.9999490963395927e-05, |
| "loss": 1.1333, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2910602910602911, |
| "grad_norm": 0.4466378688812256, |
| "learning_rate": 2.9996380309264023e-05, |
| "loss": 1.0486, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.31185031185031187, |
| "grad_norm": 0.4879732131958008, |
| "learning_rate": 2.9990442384854876e-05, |
| "loss": 1.1013, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.33264033264033266, |
| "grad_norm": 0.48543474078178406, |
| "learning_rate": 2.9981678309646448e-05, |
| "loss": 1.1074, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.35343035343035345, |
| "grad_norm": 0.644829511642456, |
| "learning_rate": 2.997008973593141e-05, |
| "loss": 1.0185, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.37422037422037424, |
| "grad_norm": 0.5197404026985168, |
| "learning_rate": 2.9955678848505648e-05, |
| "loss": 1.0811, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.39501039501039503, |
| "grad_norm": 0.5914992690086365, |
| "learning_rate": 2.9938448364256364e-05, |
| "loss": 1.0279, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.4158004158004158, |
| "grad_norm": 0.573104739189148, |
| "learning_rate": 2.9918401531649852e-05, |
| "loss": 1.0081, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4365904365904366, |
| "grad_norm": 0.5873317718505859, |
| "learning_rate": 2.9895542130119077e-05, |
| "loss": 1.056, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.4573804573804574, |
| "grad_norm": 0.7740167379379272, |
| "learning_rate": 2.9869874469351106e-05, |
| "loss": 0.9948, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4781704781704782, |
| "grad_norm": 0.684306800365448, |
| "learning_rate": 2.984140338847465e-05, |
| "loss": 0.9293, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.498960498960499, |
| "grad_norm": 0.688812255859375, |
| "learning_rate": 2.9810134255147707e-05, |
| "loss": 0.9799, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5197505197505198, |
| "grad_norm": 0.7484465837478638, |
| "learning_rate": 2.977607296454561e-05, |
| "loss": 0.9215, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5405405405405406, |
| "grad_norm": 0.7774705290794373, |
| "learning_rate": 2.97392259382496e-05, |
| "loss": 0.917, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5613305613305614, |
| "grad_norm": 0.6652188301086426, |
| "learning_rate": 2.969960012303617e-05, |
| "loss": 0.9703, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.5821205821205822, |
| "grad_norm": 0.7371662259101868, |
| "learning_rate": 2.9657202989567393e-05, |
| "loss": 0.8986, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6029106029106029, |
| "grad_norm": 0.8022716641426086, |
| "learning_rate": 2.9612042530982473e-05, |
| "loss": 0.9191, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6237006237006237, |
| "grad_norm": 0.862189769744873, |
| "learning_rate": 2.956412726139078e-05, |
| "loss": 0.9175, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6444906444906445, |
| "grad_norm": 0.7557716369628906, |
| "learning_rate": 2.951346621426672e-05, |
| "loss": 0.8531, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.6652806652806653, |
| "grad_norm": 0.8276395797729492, |
| "learning_rate": 2.946006894074661e-05, |
| "loss": 0.8899, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6860706860706861, |
| "grad_norm": 0.8520011305809021, |
| "learning_rate": 2.9403945507828044e-05, |
| "loss": 0.8472, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.7068607068607069, |
| "grad_norm": 1.10086989402771, |
| "learning_rate": 2.9345106496471914e-05, |
| "loss": 0.7985, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.7276507276507277, |
| "grad_norm": 0.865860641002655, |
| "learning_rate": 2.928356299960762e-05, |
| "loss": 0.7626, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.7484407484407485, |
| "grad_norm": 0.8839801549911499, |
| "learning_rate": 2.9219326620041698e-05, |
| "loss": 0.8144, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.9144435524940491, |
| "learning_rate": 2.9152409468270334e-05, |
| "loss": 0.7601, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.7900207900207901, |
| "grad_norm": 1.0402355194091797, |
| "learning_rate": 2.9082824160196198e-05, |
| "loss": 0.7169, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8108108108108109, |
| "grad_norm": 1.128750205039978, |
| "learning_rate": 2.9010583814749933e-05, |
| "loss": 0.8245, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.8316008316008316, |
| "grad_norm": 0.8998545408248901, |
| "learning_rate": 2.8935702051416865e-05, |
| "loss": 0.7077, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8523908523908524, |
| "grad_norm": 0.908494234085083, |
| "learning_rate": 2.8858192987669303e-05, |
| "loss": 0.7872, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.8731808731808732, |
| "grad_norm": 0.9621368050575256, |
| "learning_rate": 2.8778071236304973e-05, |
| "loss": 0.7388, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.893970893970894, |
| "grad_norm": 0.9282324910163879, |
| "learning_rate": 2.8695351902692078e-05, |
| "loss": 0.6996, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.9147609147609148, |
| "grad_norm": 1.3679566383361816, |
| "learning_rate": 2.861005058192146e-05, |
| "loss": 0.7047, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9355509355509356, |
| "grad_norm": 0.9948172569274902, |
| "learning_rate": 2.8522183355866466e-05, |
| "loss": 0.6998, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.9563409563409564, |
| "grad_norm": 0.9397404193878174, |
| "learning_rate": 2.8431766790151034e-05, |
| "loss": 0.6721, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.9771309771309772, |
| "grad_norm": 1.093510389328003, |
| "learning_rate": 2.8338817931026564e-05, |
| "loss": 0.6665, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.997920997920998, |
| "grad_norm": 1.060895323753357, |
| "learning_rate": 2.824335430215818e-05, |
| "loss": 0.722, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0166320166320166, |
| "grad_norm": 1.709092617034912, |
| "learning_rate": 2.8145393901321007e-05, |
| "loss": 0.6232, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0374220374220373, |
| "grad_norm": 1.013521671295166, |
| "learning_rate": 2.8044955197007036e-05, |
| "loss": 0.5766, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.0582120582120582, |
| "grad_norm": 1.1383347511291504, |
| "learning_rate": 2.794205712494326e-05, |
| "loss": 0.6173, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.0790020790020791, |
| "grad_norm": 1.0383437871932983, |
| "learning_rate": 2.7836719084521714e-05, |
| "loss": 0.6681, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.0997920997920998, |
| "grad_norm": 1.0968817472457886, |
| "learning_rate": 2.7728960935142103e-05, |
| "loss": 0.556, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.1205821205821205, |
| "grad_norm": 1.1271777153015137, |
| "learning_rate": 2.7618802992467718e-05, |
| "loss": 0.5136, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.1413721413721414, |
| "grad_norm": 1.0849883556365967, |
| "learning_rate": 2.75062660245953e-05, |
| "loss": 0.5544, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.1621621621621623, |
| "grad_norm": 1.2528181076049805, |
| "learning_rate": 2.7391371248139658e-05, |
| "loss": 0.5154, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.182952182952183, |
| "grad_norm": 1.2742812633514404, |
| "learning_rate": 2.7274140324233648e-05, |
| "loss": 0.5287, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.2037422037422036, |
| "grad_norm": 1.2115899324417114, |
| "learning_rate": 2.7154595354444452e-05, |
| "loss": 0.5782, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.2245322245322245, |
| "grad_norm": 1.1529432535171509, |
| "learning_rate": 2.703275887660672e-05, |
| "loss": 0.584, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.2453222453222454, |
| "grad_norm": 1.0873831510543823, |
| "learning_rate": 2.6908653860573545e-05, |
| "loss": 0.5172, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.2661122661122661, |
| "grad_norm": 1.139832854270935, |
| "learning_rate": 2.678230370388592e-05, |
| "loss": 0.5262, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.2869022869022868, |
| "grad_norm": 1.1125494241714478, |
| "learning_rate": 2.6653732227361606e-05, |
| "loss": 0.5027, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.3076923076923077, |
| "grad_norm": 1.0311192274093628, |
| "learning_rate": 2.652296367060421e-05, |
| "loss": 0.4776, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.3284823284823286, |
| "grad_norm": 1.4031699895858765, |
| "learning_rate": 2.639002268743325e-05, |
| "loss": 0.4833, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.3492723492723493, |
| "grad_norm": 1.0720524787902832, |
| "learning_rate": 2.62549343412362e-05, |
| "loss": 0.4716, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.37006237006237, |
| "grad_norm": 1.1976195573806763, |
| "learning_rate": 2.6117724100243274e-05, |
| "loss": 0.475, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.3908523908523909, |
| "grad_norm": 1.2341586351394653, |
| "learning_rate": 2.597841783272588e-05, |
| "loss": 0.4753, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.4116424116424118, |
| "grad_norm": 1.085251808166504, |
| "learning_rate": 2.58370418021197e-05, |
| "loss": 0.4534, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.4324324324324325, |
| "grad_norm": 1.2791422605514526, |
| "learning_rate": 2.5693622662073204e-05, |
| "loss": 0.4708, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.4532224532224531, |
| "grad_norm": 1.1701467037200928, |
| "learning_rate": 2.5548187451422667e-05, |
| "loss": 0.4709, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.474012474012474, |
| "grad_norm": 1.0856988430023193, |
| "learning_rate": 2.5400763589094516e-05, |
| "loss": 0.4828, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.494802494802495, |
| "grad_norm": 1.2242966890335083, |
| "learning_rate": 2.5251378868936033e-05, |
| "loss": 0.4405, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.5155925155925156, |
| "grad_norm": 1.0935328006744385, |
| "learning_rate": 2.5100061454475383e-05, |
| "loss": 0.4639, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.5363825363825363, |
| "grad_norm": 1.1667602062225342, |
| "learning_rate": 2.494683987361193e-05, |
| "loss": 0.4634, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.5571725571725572, |
| "grad_norm": 1.3015145063400269, |
| "learning_rate": 2.4791743013237874e-05, |
| "loss": 0.4389, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.577962577962578, |
| "grad_norm": 1.2456783056259155, |
| "learning_rate": 2.4634800113792205e-05, |
| "loss": 0.4809, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.5987525987525988, |
| "grad_norm": 1.163675308227539, |
| "learning_rate": 2.447604076374798e-05, |
| "loss": 0.4327, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.6195426195426195, |
| "grad_norm": 1.11380934715271, |
| "learning_rate": 2.4315494894034028e-05, |
| "loss": 0.3938, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.6403326403326404, |
| "grad_norm": 1.1947519779205322, |
| "learning_rate": 2.415319277239204e-05, |
| "loss": 0.4244, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.6611226611226613, |
| "grad_norm": 1.0451633930206299, |
| "learning_rate": 2.3989164997670204e-05, |
| "loss": 0.4387, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.681912681912682, |
| "grad_norm": 1.137202262878418, |
| "learning_rate": 2.3823442494054394e-05, |
| "loss": 0.398, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.7027027027027026, |
| "grad_norm": 1.1828194856643677, |
| "learning_rate": 2.3656056505238028e-05, |
| "loss": 0.4202, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.7234927234927235, |
| "grad_norm": 1.3268241882324219, |
| "learning_rate": 2.348703858853169e-05, |
| "loss": 0.4094, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.7442827442827444, |
| "grad_norm": 2.0979161262512207, |
| "learning_rate": 2.331642060891361e-05, |
| "loss": 0.3803, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.7650727650727651, |
| "grad_norm": 1.2897614240646362, |
| "learning_rate": 2.314423473302218e-05, |
| "loss": 0.4165, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.7858627858627858, |
| "grad_norm": 1.0115231275558472, |
| "learning_rate": 2.297051342309156e-05, |
| "loss": 0.3775, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.8066528066528067, |
| "grad_norm": 1.0907710790634155, |
| "learning_rate": 2.2795289430831554e-05, |
| "loss": 0.3953, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.8274428274428276, |
| "grad_norm": 1.0820614099502563, |
| "learning_rate": 2.261859579125294e-05, |
| "loss": 0.3932, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.8482328482328483, |
| "grad_norm": 1.4253244400024414, |
| "learning_rate": 2.244046581643938e-05, |
| "loss": 0.3899, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.869022869022869, |
| "grad_norm": 1.221880316734314, |
| "learning_rate": 2.2260933089267063e-05, |
| "loss": 0.3944, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.8898128898128899, |
| "grad_norm": 1.118735432624817, |
| "learning_rate": 2.2080031457073334e-05, |
| "loss": 0.3399, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.9106029106029108, |
| "grad_norm": 1.2311267852783203, |
| "learning_rate": 2.1897795025275455e-05, |
| "loss": 0.3789, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.9313929313929314, |
| "grad_norm": 1.1016563177108765, |
| "learning_rate": 2.1714258150940687e-05, |
| "loss": 0.335, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.9521829521829521, |
| "grad_norm": 1.1976559162139893, |
| "learning_rate": 2.1529455436308973e-05, |
| "loss": 0.3857, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.972972972972973, |
| "grad_norm": 1.396047592163086, |
| "learning_rate": 2.1343421722269338e-05, |
| "loss": 0.3699, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.993762993762994, |
| "grad_norm": 1.135191798210144, |
| "learning_rate": 2.1156192081791355e-05, |
| "loss": 0.3808, |
| "step": 480 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1205, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.369905010177475e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|