| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.964980544747082, | |
| "eval_steps": 500, | |
| "global_step": 192, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01556420233463035, | |
| "grad_norm": 32.7662513325928, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 2.0427, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0311284046692607, | |
| "grad_norm": 14.17953242397471, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 1.8704, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.04669260700389105, | |
| "grad_norm": 9.581323731857186, | |
| "learning_rate": 1e-05, | |
| "loss": 2.0032, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0622568093385214, | |
| "grad_norm": 12.804841760907372, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 2.2126, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.07782101167315175, | |
| "grad_norm": 8.478925783183458, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 1.4769, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0933852140077821, | |
| "grad_norm": 8.969266052384613, | |
| "learning_rate": 2e-05, | |
| "loss": 1.5752, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.10894941634241245, | |
| "grad_norm": 11.83099546071694, | |
| "learning_rate": 1.999871626303739e-05, | |
| "loss": 1.5588, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.1245136186770428, | |
| "grad_norm": 5.954526723326734, | |
| "learning_rate": 1.999486541836746e-05, | |
| "loss": 1.3619, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.14007782101167315, | |
| "grad_norm": 36.01845182362445, | |
| "learning_rate": 1.9988448564539475e-05, | |
| "loss": 1.3373, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.1556420233463035, | |
| "grad_norm": 9.862509446330417, | |
| "learning_rate": 1.9979467532120636e-05, | |
| "loss": 1.8391, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.17120622568093385, | |
| "grad_norm": 6.924375728787968, | |
| "learning_rate": 1.99679248831739e-05, | |
| "loss": 1.7063, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1867704280155642, | |
| "grad_norm": 5.947859490849787, | |
| "learning_rate": 1.9953823910527057e-05, | |
| "loss": 1.3137, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.20233463035019456, | |
| "grad_norm": 9.376302417818316, | |
| "learning_rate": 1.9937168636833405e-05, | |
| "loss": 1.4841, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.2178988326848249, | |
| "grad_norm": 8.09315137157763, | |
| "learning_rate": 1.9917963813424154e-05, | |
| "loss": 1.6061, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.23346303501945526, | |
| "grad_norm": 12.912871635467674, | |
| "learning_rate": 1.9896214918953003e-05, | |
| "loss": 1.5974, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.2490272373540856, | |
| "grad_norm": 4.890429692775311, | |
| "learning_rate": 1.9871928157833235e-05, | |
| "loss": 1.3604, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.26459143968871596, | |
| "grad_norm": 5.96078805337046, | |
| "learning_rate": 1.9845110458467724e-05, | |
| "loss": 1.4205, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.2801556420233463, | |
| "grad_norm": 5.406312981626749, | |
| "learning_rate": 1.981576947127245e-05, | |
| "loss": 1.3691, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.29571984435797666, | |
| "grad_norm": 6.229338294974077, | |
| "learning_rate": 1.978391356649404e-05, | |
| "loss": 1.4662, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.311284046692607, | |
| "grad_norm": 7.078018638767833, | |
| "learning_rate": 1.9749551831821917e-05, | |
| "loss": 1.4734, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.32684824902723736, | |
| "grad_norm": 11.02898750969552, | |
| "learning_rate": 1.971269406979584e-05, | |
| "loss": 1.3636, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.3424124513618677, | |
| "grad_norm": 6.82235626712361, | |
| "learning_rate": 1.9673350795009468e-05, | |
| "loss": 1.5607, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.35797665369649806, | |
| "grad_norm": 8.481695271045934, | |
| "learning_rate": 1.963153323111082e-05, | |
| "loss": 1.2882, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.3735408560311284, | |
| "grad_norm": 17.63369186879554, | |
| "learning_rate": 1.958725330760044e-05, | |
| "loss": 1.5022, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.38910505836575876, | |
| "grad_norm": 17.68569390194203, | |
| "learning_rate": 1.9540523656428223e-05, | |
| "loss": 1.2848, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.4046692607003891, | |
| "grad_norm": 6.641749083734281, | |
| "learning_rate": 1.9491357608389824e-05, | |
| "loss": 1.2358, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.42023346303501946, | |
| "grad_norm": 23.867198010532427, | |
| "learning_rate": 1.9439769189323727e-05, | |
| "loss": 1.2404, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.4357976653696498, | |
| "grad_norm": 6.690231465484989, | |
| "learning_rate": 1.9385773116110015e-05, | |
| "loss": 1.3622, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.45136186770428016, | |
| "grad_norm": 7.267926238407661, | |
| "learning_rate": 1.9329384792472036e-05, | |
| "loss": 1.3815, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.4669260700389105, | |
| "grad_norm": 7.371143249720988, | |
| "learning_rate": 1.9270620304582077e-05, | |
| "loss": 1.4497, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.48249027237354086, | |
| "grad_norm": 5.739368570493641, | |
| "learning_rate": 1.92094964164724e-05, | |
| "loss": 1.439, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.4980544747081712, | |
| "grad_norm": 10.712092924918982, | |
| "learning_rate": 1.9146030565252894e-05, | |
| "loss": 1.2185, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.5136186770428015, | |
| "grad_norm": 5.745736001033042, | |
| "learning_rate": 1.9080240856136675e-05, | |
| "loss": 1.4049, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.5291828793774319, | |
| "grad_norm": 7.632465200177505, | |
| "learning_rate": 1.9012146057275168e-05, | |
| "loss": 1.5059, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.5447470817120622, | |
| "grad_norm": 8.286309897189899, | |
| "learning_rate": 1.8941765594403975e-05, | |
| "loss": 1.4689, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.5603112840466926, | |
| "grad_norm": 8.480450264883203, | |
| "learning_rate": 1.886911954530124e-05, | |
| "loss": 1.5331, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.5758754863813229, | |
| "grad_norm": 9.499572491510447, | |
| "learning_rate": 1.879422863405995e-05, | |
| "loss": 1.635, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.5914396887159533, | |
| "grad_norm": 9.489789008744502, | |
| "learning_rate": 1.8717114225175858e-05, | |
| "loss": 1.4844, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.6070038910505836, | |
| "grad_norm": 7.2947137328088765, | |
| "learning_rate": 1.863779831745276e-05, | |
| "loss": 1.4507, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.622568093385214, | |
| "grad_norm": 5.913323265251484, | |
| "learning_rate": 1.8556303537726753e-05, | |
| "loss": 1.6038, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.6381322957198443, | |
| "grad_norm": 14.132278657345845, | |
| "learning_rate": 1.8472653134411388e-05, | |
| "loss": 1.3738, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.6536964980544747, | |
| "grad_norm": 7.698733116161007, | |
| "learning_rate": 1.8386870970865488e-05, | |
| "loss": 1.1948, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.669260700389105, | |
| "grad_norm": 6.593898707009616, | |
| "learning_rate": 1.8298981518585514e-05, | |
| "loss": 1.2161, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.6848249027237354, | |
| "grad_norm": 8.732218414278748, | |
| "learning_rate": 1.8209009850224465e-05, | |
| "loss": 1.3516, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.7003891050583657, | |
| "grad_norm": 10.91781491470466, | |
| "learning_rate": 1.811698163243929e-05, | |
| "loss": 1.3615, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.7159533073929961, | |
| "grad_norm": 10.205838168734168, | |
| "learning_rate": 1.8022923118568827e-05, | |
| "loss": 1.4948, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.7315175097276264, | |
| "grad_norm": 17.343787523950684, | |
| "learning_rate": 1.7926861141144393e-05, | |
| "loss": 1.4923, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.7470817120622568, | |
| "grad_norm": 6.218713745426841, | |
| "learning_rate": 1.782882310423512e-05, | |
| "loss": 1.3801, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.7626459143968871, | |
| "grad_norm": 9.930901357054527, | |
| "learning_rate": 1.7728836975630283e-05, | |
| "loss": 1.4591, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.7782101167315175, | |
| "grad_norm": 10.565666651397537, | |
| "learning_rate": 1.7626931278860773e-05, | |
| "loss": 1.2283, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.7937743190661478, | |
| "grad_norm": 7.173094908653802, | |
| "learning_rate": 1.752313508506208e-05, | |
| "loss": 1.1787, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.8093385214007782, | |
| "grad_norm": 7.454713889992053, | |
| "learning_rate": 1.7417478004680982e-05, | |
| "loss": 1.387, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.8249027237354085, | |
| "grad_norm": 8.888727310362047, | |
| "learning_rate": 1.730999017902848e-05, | |
| "loss": 1.363, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.8404669260700389, | |
| "grad_norm": 8.910897747960528, | |
| "learning_rate": 1.720070227168118e-05, | |
| "loss": 1.4924, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.8560311284046692, | |
| "grad_norm": 8.560591680367171, | |
| "learning_rate": 1.708964545973382e-05, | |
| "loss": 1.5208, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.8715953307392996, | |
| "grad_norm": 6.31006260674449, | |
| "learning_rate": 1.6976851424905153e-05, | |
| "loss": 1.1552, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.8871595330739299, | |
| "grad_norm": 15.810845104599778, | |
| "learning_rate": 1.6862352344500004e-05, | |
| "loss": 1.2454, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.9027237354085603, | |
| "grad_norm": 6.767459348182446, | |
| "learning_rate": 1.674618088222985e-05, | |
| "loss": 1.2886, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.9182879377431906, | |
| "grad_norm": 10.51614814940254, | |
| "learning_rate": 1.6628370178894734e-05, | |
| "loss": 1.2644, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.933852140077821, | |
| "grad_norm": 7.047043052174269, | |
| "learning_rate": 1.6508953842928966e-05, | |
| "loss": 1.443, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.9494163424124513, | |
| "grad_norm": 8.579327238483026, | |
| "learning_rate": 1.638796594081354e-05, | |
| "loss": 1.3322, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.9649805447470817, | |
| "grad_norm": 7.0287097887612235, | |
| "learning_rate": 1.626544098735777e-05, | |
| "loss": 1.4198, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.980544747081712, | |
| "grad_norm": 16.693418616763456, | |
| "learning_rate": 1.614141393585313e-05, | |
| "loss": 1.4243, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.9961089494163424, | |
| "grad_norm": 4.4493625007162185, | |
| "learning_rate": 1.601592016810193e-05, | |
| "loss": 1.0317, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 4.4493625007162185, | |
| "learning_rate": 1.588899548432377e-05, | |
| "loss": 0.3818, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.0155642023346303, | |
| "grad_norm": 14.571145059681054, | |
| "learning_rate": 1.5760676092942663e-05, | |
| "loss": 1.3258, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 1.0311284046692606, | |
| "grad_norm": 7.149351806597666, | |
| "learning_rate": 1.563099860025766e-05, | |
| "loss": 1.2366, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 1.046692607003891, | |
| "grad_norm": 40.50757660945441, | |
| "learning_rate": 1.55e-05, | |
| "loss": 1.6179, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.0622568093385214, | |
| "grad_norm": 15.44110442369705, | |
| "learning_rate": 1.5367717662779732e-05, | |
| "loss": 1.3405, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 1.0778210116731517, | |
| "grad_norm": 6.675357357971338, | |
| "learning_rate": 1.5234189325424802e-05, | |
| "loss": 1.1276, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.0933852140077822, | |
| "grad_norm": 7.1518156856898605, | |
| "learning_rate": 1.5099453080215705e-05, | |
| "loss": 1.2737, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 1.1089494163424125, | |
| "grad_norm": 6.2755998715712815, | |
| "learning_rate": 1.4963547364018711e-05, | |
| "loss": 1.2964, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.1245136186770428, | |
| "grad_norm": 7.749171240376019, | |
| "learning_rate": 1.4826510947320767e-05, | |
| "loss": 1.2542, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 1.140077821011673, | |
| "grad_norm": 8.188433813273727, | |
| "learning_rate": 1.4688382923169289e-05, | |
| "loss": 1.2587, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 1.1556420233463034, | |
| "grad_norm": 7.386225279732122, | |
| "learning_rate": 1.4549202696019868e-05, | |
| "loss": 1.3309, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.171206225680934, | |
| "grad_norm": 6.253316144967461, | |
| "learning_rate": 1.4409009970495184e-05, | |
| "loss": 1.3574, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.1867704280155642, | |
| "grad_norm": 10.042142885418704, | |
| "learning_rate": 1.4267844740058273e-05, | |
| "loss": 1.1808, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.2023346303501945, | |
| "grad_norm": 8.752169534398908, | |
| "learning_rate": 1.4125747275603384e-05, | |
| "loss": 1.2535, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.217898832684825, | |
| "grad_norm": 5.922268212950014, | |
| "learning_rate": 1.3982758113967723e-05, | |
| "loss": 1.4928, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.2334630350194553, | |
| "grad_norm": 13.340943215095326, | |
| "learning_rate": 1.3838918046367302e-05, | |
| "loss": 1.5576, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.2490272373540856, | |
| "grad_norm": 11.447188182283101, | |
| "learning_rate": 1.3694268106760225e-05, | |
| "loss": 1.3702, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.264591439688716, | |
| "grad_norm": 8.785930153191286, | |
| "learning_rate": 1.3548849560140735e-05, | |
| "loss": 1.5769, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.2801556420233462, | |
| "grad_norm": 11.289308481687042, | |
| "learning_rate": 1.3402703890767365e-05, | |
| "loss": 1.4041, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.2957198443579767, | |
| "grad_norm": 6.0701513028110865, | |
| "learning_rate": 1.3255872790328485e-05, | |
| "loss": 1.2474, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.311284046692607, | |
| "grad_norm": 15.420406437695464, | |
| "learning_rate": 1.310839814604874e-05, | |
| "loss": 1.3971, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.3268482490272373, | |
| "grad_norm": 11.112901019437691, | |
| "learning_rate": 1.2960322028739664e-05, | |
| "loss": 1.292, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.3424124513618678, | |
| "grad_norm": 6.574313635072488, | |
| "learning_rate": 1.2811686680797942e-05, | |
| "loss": 1.5592, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.3579766536964981, | |
| "grad_norm": 19.9661703497788, | |
| "learning_rate": 1.2662534504154707e-05, | |
| "loss": 1.5115, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.3735408560311284, | |
| "grad_norm": 18.38165853721984, | |
| "learning_rate": 1.2512908048179336e-05, | |
| "loss": 1.5681, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.3891050583657587, | |
| "grad_norm": 7.567686089019195, | |
| "learning_rate": 1.236284999754119e-05, | |
| "loss": 1.2417, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.404669260700389, | |
| "grad_norm": 12.905705238689295, | |
| "learning_rate": 1.221240316003275e-05, | |
| "loss": 1.2854, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.4202334630350195, | |
| "grad_norm": 20.818267922715442, | |
| "learning_rate": 1.2061610454357618e-05, | |
| "loss": 1.5286, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.4357976653696498, | |
| "grad_norm": 6.109213052045277, | |
| "learning_rate": 1.1910514897886892e-05, | |
| "loss": 1.3168, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.45136186770428, | |
| "grad_norm": 15.101872320411488, | |
| "learning_rate": 1.1759159594387404e-05, | |
| "loss": 1.5504, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.4669260700389106, | |
| "grad_norm": 6.233434571455187, | |
| "learning_rate": 1.1607587721725288e-05, | |
| "loss": 1.5917, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.482490272373541, | |
| "grad_norm": 9.122907207075865, | |
| "learning_rate": 1.1455842519548417e-05, | |
| "loss": 1.53, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.4980544747081712, | |
| "grad_norm": 7.271922289011854, | |
| "learning_rate": 1.1303967276951215e-05, | |
| "loss": 1.3232, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.5136186770428015, | |
| "grad_norm": 7.525926983479133, | |
| "learning_rate": 1.115200532012538e-05, | |
| "loss": 1.434, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.5291828793774318, | |
| "grad_norm": 13.459453151432884, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 1.2062, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.544747081712062, | |
| "grad_norm": 6.34554520110176, | |
| "learning_rate": 1.0847994679874623e-05, | |
| "loss": 1.2515, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.5603112840466926, | |
| "grad_norm": 8.913386857418164, | |
| "learning_rate": 1.0696032723048787e-05, | |
| "loss": 1.2267, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.575875486381323, | |
| "grad_norm": 9.547469800185302, | |
| "learning_rate": 1.0544157480451586e-05, | |
| "loss": 1.1735, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.5914396887159534, | |
| "grad_norm": 7.932054956466567, | |
| "learning_rate": 1.0392412278274714e-05, | |
| "loss": 1.205, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.6070038910505837, | |
| "grad_norm": 9.208669039900636, | |
| "learning_rate": 1.02408404056126e-05, | |
| "loss": 1.1383, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.622568093385214, | |
| "grad_norm": 7.709935193825099, | |
| "learning_rate": 1.0089485102113113e-05, | |
| "loss": 1.4121, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.6381322957198443, | |
| "grad_norm": 6.819923905554452, | |
| "learning_rate": 9.938389545642388e-06, | |
| "loss": 1.3696, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.6536964980544746, | |
| "grad_norm": 8.206400329676246, | |
| "learning_rate": 9.787596839967254e-06, | |
| "loss": 1.3651, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.669260700389105, | |
| "grad_norm": 10.260363436595911, | |
| "learning_rate": 9.637150002458813e-06, | |
| "loss": 1.2666, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.6848249027237354, | |
| "grad_norm": 26.767196552606528, | |
| "learning_rate": 9.487091951820669e-06, | |
| "loss": 1.479, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.7003891050583657, | |
| "grad_norm": 7.51563263665608, | |
| "learning_rate": 9.337465495845299e-06, | |
| "loss": 1.2219, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.7159533073929962, | |
| "grad_norm": 12.755450160808891, | |
| "learning_rate": 9.188313319202057e-06, | |
| "loss": 1.4279, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.7315175097276265, | |
| "grad_norm": 6.349610495085197, | |
| "learning_rate": 9.039677971260337e-06, | |
| "loss": 1.4551, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.7470817120622568, | |
| "grad_norm": 8.917885244170789, | |
| "learning_rate": 8.891601853951262e-06, | |
| "loss": 1.2766, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.7626459143968871, | |
| "grad_norm": 13.766988130785693, | |
| "learning_rate": 8.744127209671516e-06, | |
| "loss": 1.2214, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.7782101167315174, | |
| "grad_norm": 15.79931988964264, | |
| "learning_rate": 8.597296109232636e-06, | |
| "loss": 1.2607, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.7937743190661477, | |
| "grad_norm": 27.982585836161547, | |
| "learning_rate": 8.451150439859264e-06, | |
| "loss": 1.213, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.8093385214007782, | |
| "grad_norm": 6.366753369802077, | |
| "learning_rate": 8.30573189323978e-06, | |
| "loss": 1.4226, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.8249027237354085, | |
| "grad_norm": 8.762108357477535, | |
| "learning_rate": 8.161081953632701e-06, | |
| "loss": 1.2593, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.840466926070039, | |
| "grad_norm": 9.6117893036037, | |
| "learning_rate": 8.01724188603228e-06, | |
| "loss": 1.625, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.8560311284046693, | |
| "grad_norm": 25.20227315878213, | |
| "learning_rate": 7.87425272439662e-06, | |
| "loss": 1.7573, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.8715953307392996, | |
| "grad_norm": 18.50697294390281, | |
| "learning_rate": 7.732155259941729e-06, | |
| "loss": 1.3655, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.88715953307393, | |
| "grad_norm": 11.419496186766294, | |
| "learning_rate": 7.590990029504816e-06, | |
| "loss": 1.2208, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.9027237354085602, | |
| "grad_norm": 12.960895376933111, | |
| "learning_rate": 7.450797303980135e-06, | |
| "loss": 1.1531, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.9182879377431905, | |
| "grad_norm": 10.895377744831976, | |
| "learning_rate": 7.311617076830715e-06, | |
| "loss": 1.2867, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.933852140077821, | |
| "grad_norm": 9.198976531684588, | |
| "learning_rate": 7.173489052679236e-06, | |
| "loss": 1.3783, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.9494163424124513, | |
| "grad_norm": 7.6941427655967365, | |
| "learning_rate": 7.0364526359812924e-06, | |
| "loss": 1.5269, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.9649805447470818, | |
| "grad_norm": 7.578325575993518, | |
| "learning_rate": 6.900546919784295e-06, | |
| "loss": 1.479, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.9805447470817121, | |
| "grad_norm": 9.809861880346132, | |
| "learning_rate": 6.7658106745752015e-06, | |
| "loss": 1.3796, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.9961089494163424, | |
| "grad_norm": 11.809424540063796, | |
| "learning_rate": 6.632282337220272e-06, | |
| "loss": 1.8018, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 11.809424540063796, | |
| "learning_rate": 6.500000000000003e-06, | |
| "loss": 0.3816, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.0155642023346303, | |
| "grad_norm": 10.863741493343777, | |
| "learning_rate": 6.369001399742344e-06, | |
| "loss": 1.2037, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 2.0311284046692606, | |
| "grad_norm": 5.962018084798566, | |
| "learning_rate": 6.239323907057342e-06, | |
| "loss": 0.9657, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.046692607003891, | |
| "grad_norm": 4.788797901834975, | |
| "learning_rate": 6.1110045156762355e-06, | |
| "loss": 1.1664, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 2.062256809338521, | |
| "grad_norm": 6.2976648307960446, | |
| "learning_rate": 5.984079831898073e-06, | |
| "loss": 1.4275, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 2.077821011673152, | |
| "grad_norm": 11.390981419662713, | |
| "learning_rate": 5.8585860641468674e-06, | |
| "loss": 1.1395, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.093385214007782, | |
| "grad_norm": 6.57931364685089, | |
| "learning_rate": 5.7345590126422315e-06, | |
| "loss": 1.2979, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.1089494163424125, | |
| "grad_norm": 9.124550827147443, | |
| "learning_rate": 5.612034059186464e-06, | |
| "loss": 1.5149, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 2.124513618677043, | |
| "grad_norm": 7.9154220919147145, | |
| "learning_rate": 5.491046157071034e-06, | |
| "loss": 1.2253, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 2.140077821011673, | |
| "grad_norm": 9.043495071655409, | |
| "learning_rate": 5.37162982110527e-06, | |
| "loss": 1.2771, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 2.1556420233463034, | |
| "grad_norm": 7.246287181191101, | |
| "learning_rate": 5.253819117770149e-06, | |
| "loss": 1.28, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.1712062256809337, | |
| "grad_norm": 14.135867016035936, | |
| "learning_rate": 5.137647655500002e-06, | |
| "loss": 1.2389, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 2.1867704280155644, | |
| "grad_norm": 6.191000880702552, | |
| "learning_rate": 5.023148575094847e-06, | |
| "loss": 1.3685, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 2.2023346303501947, | |
| "grad_norm": 6.912497051747966, | |
| "learning_rate": 4.910354540266184e-06, | |
| "loss": 1.1248, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 2.217898832684825, | |
| "grad_norm": 7.634399054473615, | |
| "learning_rate": 4.799297728318821e-06, | |
| "loss": 1.2091, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 2.2334630350194553, | |
| "grad_norm": 8.654404179498275, | |
| "learning_rate": 4.690009820971527e-06, | |
| "loss": 1.2775, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 2.2490272373540856, | |
| "grad_norm": 5.486520320319506, | |
| "learning_rate": 4.582521995319019e-06, | |
| "loss": 1.3234, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 2.264591439688716, | |
| "grad_norm": 14.08610341346389, | |
| "learning_rate": 4.476864914937923e-06, | |
| "loss": 1.1865, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 2.280155642023346, | |
| "grad_norm": 11.693098373701448, | |
| "learning_rate": 4.373068721139227e-06, | |
| "loss": 1.4238, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 2.2957198443579765, | |
| "grad_norm": 9.49971316853895, | |
| "learning_rate": 4.271163024369722e-06, | |
| "loss": 1.1235, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 2.311284046692607, | |
| "grad_norm": 7.887870435405849, | |
| "learning_rate": 4.171176895764882e-06, | |
| "loss": 1.1697, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.3268482490272375, | |
| "grad_norm": 7.29104535243051, | |
| "learning_rate": 4.07313885885561e-06, | |
| "loss": 1.4309, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 2.342412451361868, | |
| "grad_norm": 10.320755505362524, | |
| "learning_rate": 3.977076881431175e-06, | |
| "loss": 1.3613, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 2.357976653696498, | |
| "grad_norm": 5.857317648653641, | |
| "learning_rate": 3.883018367560715e-06, | |
| "loss": 1.3462, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 2.3735408560311284, | |
| "grad_norm": 9.13512553833875, | |
| "learning_rate": 3.7909901497755408e-06, | |
| "loss": 1.3862, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 2.3891050583657587, | |
| "grad_norm": 6.878082659822421, | |
| "learning_rate": 3.7010184814144916e-06, | |
| "loss": 1.3616, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 2.404669260700389, | |
| "grad_norm": 5.910003013217337, | |
| "learning_rate": 3.6131290291345155e-06, | |
| "loss": 1.3136, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 2.4202334630350193, | |
| "grad_norm": 9.041793350178478, | |
| "learning_rate": 3.527346865588614e-06, | |
| "loss": 1.2654, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 2.43579766536965, | |
| "grad_norm": 9.955462288729418, | |
| "learning_rate": 3.4436964622732493e-06, | |
| "loss": 1.3949, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 2.4513618677042803, | |
| "grad_norm": 12.124015441070618, | |
| "learning_rate": 3.3622016825472414e-06, | |
| "loss": 1.3149, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 2.4669260700389106, | |
| "grad_norm": 5.944228819887684, | |
| "learning_rate": 3.2828857748241404e-06, | |
| "loss": 1.3735, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.482490272373541, | |
| "grad_norm": 7.465704659945909, | |
| "learning_rate": 3.205771365940052e-06, | |
| "loss": 1.1572, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 2.498054474708171, | |
| "grad_norm": 8.012935003044838, | |
| "learning_rate": 3.1308804546987615e-06, | |
| "loss": 1.2964, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 2.5136186770428015, | |
| "grad_norm": 6.396153961978255, | |
| "learning_rate": 3.058234405596029e-06, | |
| "loss": 1.2518, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 2.529182879377432, | |
| "grad_norm": 32.66960667166894, | |
| "learning_rate": 2.9878539427248364e-06, | |
| "loss": 1.3154, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 2.544747081712062, | |
| "grad_norm": 7.182443079061496, | |
| "learning_rate": 2.919759143863326e-06, | |
| "loss": 1.2754, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.5603112840466924, | |
| "grad_norm": 9.065894651134005, | |
| "learning_rate": 2.8539694347471093e-06, | |
| "loss": 1.5717, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 2.5758754863813227, | |
| "grad_norm": 8.155108121011244, | |
| "learning_rate": 2.7905035835276e-06, | |
| "loss": 1.1931, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 2.5914396887159534, | |
| "grad_norm": 10.525703817651328, | |
| "learning_rate": 2.7293796954179254e-06, | |
| "loss": 1.2438, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.6070038910505837, | |
| "grad_norm": 10.790702057048689, | |
| "learning_rate": 2.670615207527965e-06, | |
| "loss": 1.2728, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 2.622568093385214, | |
| "grad_norm": 6.486978399127539, | |
| "learning_rate": 2.6142268838899844e-06, | |
| "loss": 1.3483, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.6381322957198443, | |
| "grad_norm": 11.399298263764798, | |
| "learning_rate": 2.5602308106762773e-06, | |
| "loss": 1.4894, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 2.6536964980544746, | |
| "grad_norm": 8.803157004201939, | |
| "learning_rate": 2.5086423916101794e-06, | |
| "loss": 1.5442, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 2.669260700389105, | |
| "grad_norm": 7.898925628777204, | |
| "learning_rate": 2.4594763435717788e-06, | |
| "loss": 1.3132, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 2.6848249027237356, | |
| "grad_norm": 6.01024097821099, | |
| "learning_rate": 2.412746692399561e-06, | |
| "loss": 1.3329, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 2.700389105058366, | |
| "grad_norm": 9.287448544998288, | |
| "learning_rate": 2.3684667688891813e-06, | |
| "loss": 1.2279, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.7159533073929962, | |
| "grad_norm": 6.509435104920216, | |
| "learning_rate": 2.3266492049905327e-06, | |
| "loss": 1.1356, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 2.7315175097276265, | |
| "grad_norm": 7.274526660218056, | |
| "learning_rate": 2.2873059302041627e-06, | |
| "loss": 1.2053, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.747081712062257, | |
| "grad_norm": 7.528171905726772, | |
| "learning_rate": 2.250448168178085e-06, | |
| "loss": 1.2631, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 2.762645914396887, | |
| "grad_norm": 7.685404987229149, | |
| "learning_rate": 2.216086433505963e-06, | |
| "loss": 1.1471, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 2.7782101167315174, | |
| "grad_norm": 7.30724911654224, | |
| "learning_rate": 2.18423052872755e-06, | |
| "loss": 1.1335, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.7937743190661477, | |
| "grad_norm": 8.275377130204019, | |
| "learning_rate": 2.154889541532279e-06, | |
| "loss": 1.4331, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 2.809338521400778, | |
| "grad_norm": 8.26916689050228, | |
| "learning_rate": 2.128071842166766e-06, | |
| "loss": 1.1323, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 2.8249027237354083, | |
| "grad_norm": 8.648891670292945, | |
| "learning_rate": 2.1037850810469977e-06, | |
| "loss": 1.0748, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.840466926070039, | |
| "grad_norm": 8.864628121919203, | |
| "learning_rate": 2.0820361865758506e-06, | |
| "loss": 1.2159, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 2.8560311284046693, | |
| "grad_norm": 7.550747013009374, | |
| "learning_rate": 2.0628313631665977e-06, | |
| "loss": 1.1746, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.8715953307392996, | |
| "grad_norm": 5.4396609218382075, | |
| "learning_rate": 2.0461760894729438e-06, | |
| "loss": 1.1403, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.88715953307393, | |
| "grad_norm": 9.061879291284523, | |
| "learning_rate": 2.032075116826103e-06, | |
| "loss": 1.5448, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 2.90272373540856, | |
| "grad_norm": 6.124913133747852, | |
| "learning_rate": 2.0205324678793635e-06, | |
| "loss": 1.1864, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 2.9182879377431905, | |
| "grad_norm": 7.03064717545691, | |
| "learning_rate": 2.0115514354605255e-06, | |
| "loss": 1.3855, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.9338521400778212, | |
| "grad_norm": 8.91521754504974, | |
| "learning_rate": 2.005134581632538e-06, | |
| "loss": 1.3689, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.9494163424124515, | |
| "grad_norm": 8.561228532947991, | |
| "learning_rate": 2.001283736962612e-06, | |
| "loss": 1.5862, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 2.964980544747082, | |
| "grad_norm": 10.281179103024385, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.1926, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.964980544747082, | |
| "step": 192, | |
| "total_flos": 104900150247424.0, | |
| "train_loss": 1.356536865234375, | |
| "train_runtime": 15762.2393, | |
| "train_samples_per_second": 1.571, | |
| "train_steps_per_second": 0.012 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 192, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 104900150247424.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |