| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 454, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.022026431718061675, | |
| "grad_norm": 1.2465029954910278, | |
| "learning_rate": 2.1052631578947366e-06, | |
| "loss": 1.3594, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.04405286343612335, | |
| "grad_norm": 0.7338076233863831, | |
| "learning_rate": 4.736842105263158e-06, | |
| "loss": 1.3607, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06607929515418502, | |
| "grad_norm": 0.5697624087333679, | |
| "learning_rate": 7.3684210526315784e-06, | |
| "loss": 1.3328, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0881057268722467, | |
| "grad_norm": 0.661296010017395, | |
| "learning_rate": 9.999999999999999e-06, | |
| "loss": 1.3393, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11013215859030837, | |
| "grad_norm": 0.6124210357666016, | |
| "learning_rate": 1.263157894736842e-05, | |
| "loss": 1.2621, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.13215859030837004, | |
| "grad_norm": 0.5665815472602844, | |
| "learning_rate": 1.5263157894736842e-05, | |
| "loss": 1.2624, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.15418502202643172, | |
| "grad_norm": 0.5574439167976379, | |
| "learning_rate": 1.7894736842105264e-05, | |
| "loss": 1.2487, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1762114537444934, | |
| "grad_norm": 0.4665200412273407, | |
| "learning_rate": 2.0526315789473685e-05, | |
| "loss": 1.2335, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.19823788546255505, | |
| "grad_norm": 0.4646995961666107, | |
| "learning_rate": 2.3157894736842103e-05, | |
| "loss": 1.1568, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.22026431718061673, | |
| "grad_norm": 0.5178118348121643, | |
| "learning_rate": 2.578947368421053e-05, | |
| "loss": 1.214, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2422907488986784, | |
| "grad_norm": 0.41648760437965393, | |
| "learning_rate": 2.8421052631578946e-05, | |
| "loss": 1.151, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.2643171806167401, | |
| "grad_norm": 0.5988641977310181, | |
| "learning_rate": 2.9999745210076202e-05, | |
| "loss": 1.1945, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.28634361233480177, | |
| "grad_norm": 0.4767996370792389, | |
| "learning_rate": 2.9996878922838097e-05, | |
| "loss": 1.0975, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.30837004405286345, | |
| "grad_norm": 0.547401487827301, | |
| "learning_rate": 2.9990828471561044e-05, | |
| "loss": 1.1063, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3303964757709251, | |
| "grad_norm": 0.5365089178085327, | |
| "learning_rate": 2.998159514088762e-05, | |
| "loss": 1.1153, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3524229074889868, | |
| "grad_norm": 0.5236433148384094, | |
| "learning_rate": 2.9969180891255046e-05, | |
| "loss": 1.088, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3744493392070485, | |
| "grad_norm": 0.6282631754875183, | |
| "learning_rate": 2.995358835847891e-05, | |
| "loss": 1.0695, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3964757709251101, | |
| "grad_norm": 0.6258669495582581, | |
| "learning_rate": 2.9934820853193538e-05, | |
| "loss": 1.0281, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4185022026431718, | |
| "grad_norm": 0.6035173535346985, | |
| "learning_rate": 2.991288236014907e-05, | |
| "loss": 1.0368, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.44052863436123346, | |
| "grad_norm": 0.553928554058075, | |
| "learning_rate": 2.9887777537365416e-05, | |
| "loss": 0.9775, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.46255506607929514, | |
| "grad_norm": 0.6714206337928772, | |
| "learning_rate": 2.985951171514326e-05, | |
| "loss": 0.9667, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4845814977973568, | |
| "grad_norm": 0.640594482421875, | |
| "learning_rate": 2.982809089493231e-05, | |
| "loss": 0.9989, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.5066079295154186, | |
| "grad_norm": 0.6225477457046509, | |
| "learning_rate": 2.9793521748057065e-05, | |
| "loss": 1.011, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5286343612334802, | |
| "grad_norm": 0.6979171633720398, | |
| "learning_rate": 2.975581161430035e-05, | |
| "loss": 0.9919, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5506607929515418, | |
| "grad_norm": 0.7126419544219971, | |
| "learning_rate": 2.971496850034492e-05, | |
| "loss": 0.9232, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5726872246696035, | |
| "grad_norm": 0.8123069405555725, | |
| "learning_rate": 2.9671001078073453e-05, | |
| "loss": 0.932, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5947136563876652, | |
| "grad_norm": 0.8383358120918274, | |
| "learning_rate": 2.9623918682727355e-05, | |
| "loss": 0.9127, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6167400881057269, | |
| "grad_norm": 0.7429929971694946, | |
| "learning_rate": 2.957373131092464e-05, | |
| "loss": 0.9187, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6387665198237885, | |
| "grad_norm": 0.9482147693634033, | |
| "learning_rate": 2.9520449618537465e-05, | |
| "loss": 0.8848, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6607929515418502, | |
| "grad_norm": 0.8874338865280151, | |
| "learning_rate": 2.946408491842964e-05, | |
| "loss": 0.8802, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6828193832599119, | |
| "grad_norm": 0.7921388149261475, | |
| "learning_rate": 2.940464917805466e-05, | |
| "loss": 0.8105, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.7048458149779736, | |
| "grad_norm": 0.823284924030304, | |
| "learning_rate": 2.9342155016914772e-05, | |
| "loss": 0.8335, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7268722466960352, | |
| "grad_norm": 0.8587915301322937, | |
| "learning_rate": 2.927661570388155e-05, | |
| "loss": 0.8236, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.748898678414097, | |
| "grad_norm": 0.9209868311882019, | |
| "learning_rate": 2.920804515437865e-05, | |
| "loss": 0.7313, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7709251101321586, | |
| "grad_norm": 0.9900104999542236, | |
| "learning_rate": 2.9136457927427254e-05, | |
| "loss": 0.8445, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7929515418502202, | |
| "grad_norm": 1.1005882024765015, | |
| "learning_rate": 2.9061869222554863e-05, | |
| "loss": 0.7258, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8149779735682819, | |
| "grad_norm": 0.7985159754753113, | |
| "learning_rate": 2.898429487656813e-05, | |
| "loss": 0.7554, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.8370044052863436, | |
| "grad_norm": 1.0106185674667358, | |
| "learning_rate": 2.8903751360190327e-05, | |
| "loss": 0.7773, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8590308370044053, | |
| "grad_norm": 1.0603998899459839, | |
| "learning_rate": 2.8820255774564287e-05, | |
| "loss": 0.8034, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8810572687224669, | |
| "grad_norm": 0.9879089593887329, | |
| "learning_rate": 2.8733825847621436e-05, | |
| "loss": 0.71, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9030837004405287, | |
| "grad_norm": 1.0008033514022827, | |
| "learning_rate": 2.8644479930317776e-05, | |
| "loss": 0.7837, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.9251101321585903, | |
| "grad_norm": 1.0469297170639038, | |
| "learning_rate": 2.8552236992737572e-05, | |
| "loss": 0.7139, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.947136563876652, | |
| "grad_norm": 0.9759002327919006, | |
| "learning_rate": 2.8457116620065596e-05, | |
| "loss": 0.7207, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9691629955947136, | |
| "grad_norm": 0.907706618309021, | |
| "learning_rate": 2.8359139008428758e-05, | |
| "loss": 0.7134, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9911894273127754, | |
| "grad_norm": 1.022333025932312, | |
| "learning_rate": 2.8258324960608043e-05, | |
| "loss": 0.669, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.013215859030837, | |
| "grad_norm": 1.021018385887146, | |
| "learning_rate": 2.815469588162161e-05, | |
| "loss": 0.6179, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0352422907488987, | |
| "grad_norm": 1.1175833940505981, | |
| "learning_rate": 2.8048273774180043e-05, | |
| "loss": 0.5932, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.0572687224669604, | |
| "grad_norm": 0.9679650068283081, | |
| "learning_rate": 2.7939081234014708e-05, | |
| "loss": 0.5915, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.079295154185022, | |
| "grad_norm": 1.1451464891433716, | |
| "learning_rate": 2.7827141445080196e-05, | |
| "loss": 0.5768, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.1013215859030836, | |
| "grad_norm": 1.274778127670288, | |
| "learning_rate": 2.7712478174631813e-05, | |
| "loss": 0.5711, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.1233480176211454, | |
| "grad_norm": 1.4071186780929565, | |
| "learning_rate": 2.759511576817934e-05, | |
| "loss": 0.5991, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.145374449339207, | |
| "grad_norm": 1.2345290184020996, | |
| "learning_rate": 2.747507914431791e-05, | |
| "loss": 0.5184, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.1674008810572687, | |
| "grad_norm": 1.345145344734192, | |
| "learning_rate": 2.7352393789437258e-05, | |
| "loss": 0.5362, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.1894273127753303, | |
| "grad_norm": 1.1527823209762573, | |
| "learning_rate": 2.7227085752310413e-05, | |
| "loss": 0.543, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.2114537444933922, | |
| "grad_norm": 1.1420563459396362, | |
| "learning_rate": 2.709918163856295e-05, | |
| "loss": 0.5326, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2334801762114538, | |
| "grad_norm": 1.0940356254577637, | |
| "learning_rate": 2.696870860502408e-05, | |
| "loss": 0.5188, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2555066079295154, | |
| "grad_norm": 1.2769014835357666, | |
| "learning_rate": 2.6835694353960623e-05, | |
| "loss": 0.5029, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.277533039647577, | |
| "grad_norm": 1.3038066625595093, | |
| "learning_rate": 2.6700167127195233e-05, | |
| "loss": 0.5492, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2995594713656389, | |
| "grad_norm": 1.0858526229858398, | |
| "learning_rate": 2.6562155700110046e-05, | |
| "loss": 0.5278, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.3215859030837005, | |
| "grad_norm": 1.1577568054199219, | |
| "learning_rate": 2.6421689375537015e-05, | |
| "loss": 0.5314, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.3436123348017621, | |
| "grad_norm": 1.1254470348358154, | |
| "learning_rate": 2.6278797977536325e-05, | |
| "loss": 0.466, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.3656387665198237, | |
| "grad_norm": 1.2429901361465454, | |
| "learning_rate": 2.613351184506405e-05, | |
| "loss": 0.5185, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3876651982378854, | |
| "grad_norm": 1.3175548315048218, | |
| "learning_rate": 2.598586182553056e-05, | |
| "loss": 0.5077, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.4096916299559472, | |
| "grad_norm": 1.3169467449188232, | |
| "learning_rate": 2.5835879268250934e-05, | |
| "loss": 0.5124, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.4317180616740088, | |
| "grad_norm": 1.2650501728057861, | |
| "learning_rate": 2.568359601778881e-05, | |
| "loss": 0.4961, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.4537444933920705, | |
| "grad_norm": 1.1624271869659424, | |
| "learning_rate": 2.5529044407195127e-05, | |
| "loss": 0.4552, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.475770925110132, | |
| "grad_norm": 1.1693929433822632, | |
| "learning_rate": 2.5372257251143056e-05, | |
| "loss": 0.4668, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.497797356828194, | |
| "grad_norm": 1.105682134628296, | |
| "learning_rate": 2.5213267838960772e-05, | |
| "loss": 0.4485, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.5198237885462555, | |
| "grad_norm": 1.365013599395752, | |
| "learning_rate": 2.5052109927563393e-05, | |
| "loss": 0.4641, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.5418502202643172, | |
| "grad_norm": 1.2356834411621094, | |
| "learning_rate": 2.4888817734285657e-05, | |
| "loss": 0.4388, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.5638766519823788, | |
| "grad_norm": 1.8782477378845215, | |
| "learning_rate": 2.472342592961683e-05, | |
| "loss": 0.4556, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.5859030837004404, | |
| "grad_norm": 1.1085964441299438, | |
| "learning_rate": 2.4555969629839393e-05, | |
| "loss": 0.4284, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.607929515418502, | |
| "grad_norm": 1.3663532733917236, | |
| "learning_rate": 2.4386484389573126e-05, | |
| "loss": 0.4322, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.6299559471365639, | |
| "grad_norm": 1.307347297668457, | |
| "learning_rate": 2.421500619422606e-05, | |
| "loss": 0.4378, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.6519823788546255, | |
| "grad_norm": 1.0821038484573364, | |
| "learning_rate": 2.4041571452353982e-05, | |
| "loss": 0.4003, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.6740088105726874, | |
| "grad_norm": 1.056179404258728, | |
| "learning_rate": 2.386621698793015e-05, | |
| "loss": 0.4161, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.696035242290749, | |
| "grad_norm": 1.2139962911605835, | |
| "learning_rate": 2.3688980032526707e-05, | |
| "loss": 0.3905, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.7180616740088106, | |
| "grad_norm": 1.0942909717559814, | |
| "learning_rate": 2.3509898217409645e-05, | |
| "loss": 0.4268, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.7400881057268722, | |
| "grad_norm": 1.277398705482483, | |
| "learning_rate": 2.3329009565548857e-05, | |
| "loss": 0.4007, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.7621145374449338, | |
| "grad_norm": 1.1945730447769165, | |
| "learning_rate": 2.3146352483545026e-05, | |
| "loss": 0.3755, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.7841409691629955, | |
| "grad_norm": 1.1625131368637085, | |
| "learning_rate": 2.2961965753475074e-05, | |
| "loss": 0.4081, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.8061674008810573, | |
| "grad_norm": 1.1497328281402588, | |
| "learning_rate": 2.277588852465788e-05, | |
| "loss": 0.3711, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.828193832599119, | |
| "grad_norm": 1.1712104082107544, | |
| "learning_rate": 2.2588160305342024e-05, | |
| "loss": 0.4283, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.8502202643171806, | |
| "grad_norm": 1.3465914726257324, | |
| "learning_rate": 2.2398820954317342e-05, | |
| "loss": 0.3948, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.8722466960352424, | |
| "grad_norm": 1.3581944704055786, | |
| "learning_rate": 2.220791067245201e-05, | |
| "loss": 0.4051, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.894273127753304, | |
| "grad_norm": 1.656079649925232, | |
| "learning_rate": 2.201546999415704e-05, | |
| "loss": 0.3842, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.9162995594713657, | |
| "grad_norm": 1.3536030054092407, | |
| "learning_rate": 2.182153977877994e-05, | |
| "loss": 0.3618, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.9383259911894273, | |
| "grad_norm": 1.2806636095046997, | |
| "learning_rate": 2.162616120192939e-05, | |
| "loss": 0.362, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.960352422907489, | |
| "grad_norm": 1.1593424081802368, | |
| "learning_rate": 2.142937574673275e-05, | |
| "loss": 0.3802, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.9823788546255505, | |
| "grad_norm": 1.5513302087783813, | |
| "learning_rate": 2.12312251950283e-05, | |
| "loss": 0.36, | |
| "step": 450 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1135, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.563190790820987e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |