{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 454, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022026431718061675, "grad_norm": 1.2465029954910278, "learning_rate": 2.1052631578947366e-06, "loss": 1.3594, "step": 5 }, { "epoch": 0.04405286343612335, "grad_norm": 0.7338076233863831, "learning_rate": 4.736842105263158e-06, "loss": 1.3607, "step": 10 }, { "epoch": 0.06607929515418502, "grad_norm": 0.5697624087333679, "learning_rate": 7.3684210526315784e-06, "loss": 1.3328, "step": 15 }, { "epoch": 0.0881057268722467, "grad_norm": 0.661296010017395, "learning_rate": 9.999999999999999e-06, "loss": 1.3393, "step": 20 }, { "epoch": 0.11013215859030837, "grad_norm": 0.6124210357666016, "learning_rate": 1.263157894736842e-05, "loss": 1.2621, "step": 25 }, { "epoch": 0.13215859030837004, "grad_norm": 0.5665815472602844, "learning_rate": 1.5263157894736842e-05, "loss": 1.2624, "step": 30 }, { "epoch": 0.15418502202643172, "grad_norm": 0.5574439167976379, "learning_rate": 1.7894736842105264e-05, "loss": 1.2487, "step": 35 }, { "epoch": 0.1762114537444934, "grad_norm": 0.4665200412273407, "learning_rate": 2.0526315789473685e-05, "loss": 1.2335, "step": 40 }, { "epoch": 0.19823788546255505, "grad_norm": 0.4646995961666107, "learning_rate": 2.3157894736842103e-05, "loss": 1.1568, "step": 45 }, { "epoch": 0.22026431718061673, "grad_norm": 0.5178118348121643, "learning_rate": 2.578947368421053e-05, "loss": 1.214, "step": 50 }, { "epoch": 0.2422907488986784, "grad_norm": 0.41648760437965393, "learning_rate": 2.8421052631578946e-05, "loss": 1.151, "step": 55 }, { "epoch": 0.2643171806167401, "grad_norm": 0.5988641977310181, "learning_rate": 2.9999745210076202e-05, "loss": 1.1945, "step": 60 }, { "epoch": 0.28634361233480177, "grad_norm": 0.4767996370792389, "learning_rate": 2.9996878922838097e-05, "loss": 1.0975, "step": 65 }, { "epoch": 0.30837004405286345, "grad_norm": 0.547401487827301, "learning_rate": 2.9990828471561044e-05, "loss": 1.1063, "step": 70 }, { "epoch": 0.3303964757709251, "grad_norm": 0.5365089178085327, "learning_rate": 2.998159514088762e-05, "loss": 1.1153, "step": 75 }, { "epoch": 0.3524229074889868, "grad_norm": 0.5236433148384094, "learning_rate": 2.9969180891255046e-05, "loss": 1.088, "step": 80 }, { "epoch": 0.3744493392070485, "grad_norm": 0.6282631754875183, "learning_rate": 2.995358835847891e-05, "loss": 1.0695, "step": 85 }, { "epoch": 0.3964757709251101, "grad_norm": 0.6258669495582581, "learning_rate": 2.9934820853193538e-05, "loss": 1.0281, "step": 90 }, { "epoch": 0.4185022026431718, "grad_norm": 0.6035173535346985, "learning_rate": 2.991288236014907e-05, "loss": 1.0368, "step": 95 }, { "epoch": 0.44052863436123346, "grad_norm": 0.553928554058075, "learning_rate": 2.9887777537365416e-05, "loss": 0.9775, "step": 100 }, { "epoch": 0.46255506607929514, "grad_norm": 0.6714206337928772, "learning_rate": 2.985951171514326e-05, "loss": 0.9667, "step": 105 }, { "epoch": 0.4845814977973568, "grad_norm": 0.640594482421875, "learning_rate": 2.982809089493231e-05, "loss": 0.9989, "step": 110 }, { "epoch": 0.5066079295154186, "grad_norm": 0.6225477457046509, "learning_rate": 2.9793521748057065e-05, "loss": 1.011, "step": 115 }, { "epoch": 0.5286343612334802, "grad_norm": 0.6979171633720398, "learning_rate": 2.975581161430035e-05, "loss": 0.9919, "step": 120 }, { "epoch": 0.5506607929515418, "grad_norm": 0.7126419544219971, "learning_rate": 2.971496850034492e-05, "loss": 0.9232, "step": 125 }, { "epoch": 0.5726872246696035, "grad_norm": 0.8123069405555725, "learning_rate": 2.9671001078073453e-05, "loss": 0.932, "step": 130 }, { "epoch": 0.5947136563876652, "grad_norm": 0.8383358120918274, "learning_rate": 2.9623918682727355e-05, "loss": 0.9127, "step": 135 }, { "epoch": 0.6167400881057269, "grad_norm": 0.7429929971694946, "learning_rate": 2.957373131092464e-05, "loss": 0.9187, "step": 140 }, { "epoch": 0.6387665198237885, "grad_norm": 0.9482147693634033, "learning_rate": 2.9520449618537465e-05, "loss": 0.8848, "step": 145 }, { "epoch": 0.6607929515418502, "grad_norm": 0.8874338865280151, "learning_rate": 2.946408491842964e-05, "loss": 0.8802, "step": 150 }, { "epoch": 0.6828193832599119, "grad_norm": 0.7921388149261475, "learning_rate": 2.940464917805466e-05, "loss": 0.8105, "step": 155 }, { "epoch": 0.7048458149779736, "grad_norm": 0.823284924030304, "learning_rate": 2.9342155016914772e-05, "loss": 0.8335, "step": 160 }, { "epoch": 0.7268722466960352, "grad_norm": 0.8587915301322937, "learning_rate": 2.927661570388155e-05, "loss": 0.8236, "step": 165 }, { "epoch": 0.748898678414097, "grad_norm": 0.9209868311882019, "learning_rate": 2.920804515437865e-05, "loss": 0.7313, "step": 170 }, { "epoch": 0.7709251101321586, "grad_norm": 0.9900104999542236, "learning_rate": 2.9136457927427254e-05, "loss": 0.8445, "step": 175 }, { "epoch": 0.7929515418502202, "grad_norm": 1.1005882024765015, "learning_rate": 2.9061869222554863e-05, "loss": 0.7258, "step": 180 }, { "epoch": 0.8149779735682819, "grad_norm": 0.7985159754753113, "learning_rate": 2.898429487656813e-05, "loss": 0.7554, "step": 185 }, { "epoch": 0.8370044052863436, "grad_norm": 1.0106185674667358, "learning_rate": 2.8903751360190327e-05, "loss": 0.7773, "step": 190 }, { "epoch": 0.8590308370044053, "grad_norm": 1.0603998899459839, "learning_rate": 2.8820255774564287e-05, "loss": 0.8034, "step": 195 }, { "epoch": 0.8810572687224669, "grad_norm": 0.9879089593887329, "learning_rate": 2.8733825847621436e-05, "loss": 0.71, "step": 200 }, { "epoch": 0.9030837004405287, "grad_norm": 1.0008033514022827, "learning_rate": 2.8644479930317776e-05, "loss": 0.7837, "step": 205 }, { "epoch": 0.9251101321585903, "grad_norm": 1.0469297170639038, "learning_rate": 2.8552236992737572e-05, "loss": 0.7139, "step": 210 }, { "epoch": 0.947136563876652, "grad_norm": 0.9759002327919006, "learning_rate": 2.8457116620065596e-05, "loss": 0.7207, "step": 215 }, { "epoch": 0.9691629955947136, "grad_norm": 0.907706618309021, "learning_rate": 2.8359139008428758e-05, "loss": 0.7134, "step": 220 }, { "epoch": 0.9911894273127754, "grad_norm": 1.022333025932312, "learning_rate": 2.8258324960608043e-05, "loss": 0.669, "step": 225 }, { "epoch": 1.013215859030837, "grad_norm": 1.021018385887146, "learning_rate": 2.815469588162161e-05, "loss": 0.6179, "step": 230 }, { "epoch": 1.0352422907488987, "grad_norm": 1.1175833940505981, "learning_rate": 2.8048273774180043e-05, "loss": 0.5932, "step": 235 }, { "epoch": 1.0572687224669604, "grad_norm": 0.9679650068283081, "learning_rate": 2.7939081234014708e-05, "loss": 0.5915, "step": 240 }, { "epoch": 1.079295154185022, "grad_norm": 1.1451464891433716, "learning_rate": 2.7827141445080196e-05, "loss": 0.5768, "step": 245 }, { "epoch": 1.1013215859030836, "grad_norm": 1.274778127670288, "learning_rate": 2.7712478174631813e-05, "loss": 0.5711, "step": 250 }, { "epoch": 1.1233480176211454, "grad_norm": 1.4071186780929565, "learning_rate": 2.759511576817934e-05, "loss": 0.5991, "step": 255 }, { "epoch": 1.145374449339207, "grad_norm": 1.2345290184020996, "learning_rate": 2.747507914431791e-05, "loss": 0.5184, "step": 260 }, { "epoch": 1.1674008810572687, "grad_norm": 1.345145344734192, "learning_rate": 2.7352393789437258e-05, "loss": 0.5362, "step": 265 }, { "epoch": 1.1894273127753303, "grad_norm": 1.1527823209762573, "learning_rate": 2.7227085752310413e-05, "loss": 0.543, "step": 270 }, { "epoch": 1.2114537444933922, "grad_norm": 1.1420563459396362, "learning_rate": 2.709918163856295e-05, "loss": 0.5326, "step": 275 }, { "epoch": 1.2334801762114538, "grad_norm": 1.0940356254577637, "learning_rate": 2.696870860502408e-05, "loss": 0.5188, "step": 280 }, { "epoch": 1.2555066079295154, "grad_norm": 1.2769014835357666, "learning_rate": 2.6835694353960623e-05, "loss": 0.5029, "step": 285 }, { "epoch": 1.277533039647577, "grad_norm": 1.3038066625595093, "learning_rate": 2.6700167127195233e-05, "loss": 0.5492, "step": 290 }, { "epoch": 1.2995594713656389, "grad_norm": 1.0858526229858398, "learning_rate": 2.6562155700110046e-05, "loss": 0.5278, "step": 295 }, { "epoch": 1.3215859030837005, "grad_norm": 1.1577568054199219, "learning_rate": 2.6421689375537015e-05, "loss": 0.5314, "step": 300 }, { "epoch": 1.3436123348017621, "grad_norm": 1.1254470348358154, "learning_rate": 2.6278797977536325e-05, "loss": 0.466, "step": 305 }, { "epoch": 1.3656387665198237, "grad_norm": 1.2429901361465454, "learning_rate": 2.613351184506405e-05, "loss": 0.5185, "step": 310 }, { "epoch": 1.3876651982378854, "grad_norm": 1.3175548315048218, "learning_rate": 2.598586182553056e-05, "loss": 0.5077, "step": 315 }, { "epoch": 1.4096916299559472, "grad_norm": 1.3169467449188232, "learning_rate": 2.5835879268250934e-05, "loss": 0.5124, "step": 320 }, { "epoch": 1.4317180616740088, "grad_norm": 1.2650501728057861, "learning_rate": 2.568359601778881e-05, "loss": 0.4961, "step": 325 }, { "epoch": 1.4537444933920705, "grad_norm": 1.1624271869659424, "learning_rate": 2.5529044407195127e-05, "loss": 0.4552, "step": 330 }, { "epoch": 1.475770925110132, "grad_norm": 1.1693929433822632, "learning_rate": 2.5372257251143056e-05, "loss": 0.4668, "step": 335 }, { "epoch": 1.497797356828194, "grad_norm": 1.105682134628296, "learning_rate": 2.5213267838960772e-05, "loss": 0.4485, "step": 340 }, { "epoch": 1.5198237885462555, "grad_norm": 1.365013599395752, "learning_rate": 2.5052109927563393e-05, "loss": 0.4641, "step": 345 }, { "epoch": 1.5418502202643172, "grad_norm": 1.2356834411621094, "learning_rate": 2.4888817734285657e-05, "loss": 0.4388, "step": 350 }, { "epoch": 1.5638766519823788, "grad_norm": 1.8782477378845215, "learning_rate": 2.472342592961683e-05, "loss": 0.4556, "step": 355 }, { "epoch": 1.5859030837004404, "grad_norm": 1.1085964441299438, "learning_rate": 2.4555969629839393e-05, "loss": 0.4284, "step": 360 }, { "epoch": 1.607929515418502, "grad_norm": 1.3663532733917236, "learning_rate": 2.4386484389573126e-05, "loss": 0.4322, "step": 365 }, { "epoch": 1.6299559471365639, "grad_norm": 1.307347297668457, "learning_rate": 2.421500619422606e-05, "loss": 0.4378, "step": 370 }, { "epoch": 1.6519823788546255, "grad_norm": 1.0821038484573364, "learning_rate": 2.4041571452353982e-05, "loss": 0.4003, "step": 375 }, { "epoch": 1.6740088105726874, "grad_norm": 1.056179404258728, "learning_rate": 2.386621698793015e-05, "loss": 0.4161, "step": 380 }, { "epoch": 1.696035242290749, "grad_norm": 1.2139962911605835, "learning_rate": 2.3688980032526707e-05, "loss": 0.3905, "step": 385 }, { "epoch": 1.7180616740088106, "grad_norm": 1.0942909717559814, "learning_rate": 2.3509898217409645e-05, "loss": 0.4268, "step": 390 }, { "epoch": 1.7400881057268722, "grad_norm": 1.277398705482483, "learning_rate": 2.3329009565548857e-05, "loss": 0.4007, "step": 395 }, { "epoch": 1.7621145374449338, "grad_norm": 1.1945730447769165, "learning_rate": 2.3146352483545026e-05, "loss": 0.3755, "step": 400 }, { "epoch": 1.7841409691629955, "grad_norm": 1.1625131368637085, "learning_rate": 2.2961965753475074e-05, "loss": 0.4081, "step": 405 }, { "epoch": 1.8061674008810573, "grad_norm": 1.1497328281402588, "learning_rate": 2.277588852465788e-05, "loss": 0.3711, "step": 410 }, { "epoch": 1.828193832599119, "grad_norm": 1.1712104082107544, "learning_rate": 2.2588160305342024e-05, "loss": 0.4283, "step": 415 }, { "epoch": 1.8502202643171806, "grad_norm": 1.3465914726257324, "learning_rate": 2.2398820954317342e-05, "loss": 0.3948, "step": 420 }, { "epoch": 1.8722466960352424, "grad_norm": 1.3581944704055786, "learning_rate": 2.220791067245201e-05, "loss": 0.4051, "step": 425 }, { "epoch": 1.894273127753304, "grad_norm": 1.656079649925232, "learning_rate": 2.201546999415704e-05, "loss": 0.3842, "step": 430 }, { "epoch": 1.9162995594713657, "grad_norm": 1.3536030054092407, "learning_rate": 2.182153977877994e-05, "loss": 0.3618, "step": 435 }, { "epoch": 1.9383259911894273, "grad_norm": 1.2806636095046997, "learning_rate": 2.162616120192939e-05, "loss": 0.362, "step": 440 }, { "epoch": 1.960352422907489, "grad_norm": 1.1593424081802368, "learning_rate": 2.142937574673275e-05, "loss": 0.3802, "step": 445 }, { "epoch": 1.9823788546255505, "grad_norm": 1.5513302087783813, "learning_rate": 2.12312251950283e-05, "loss": 0.36, "step": 450 } ], "logging_steps": 5, "max_steps": 1135, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.563190790820987e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }