{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.964980544747082, "eval_steps": 500, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01556420233463035, "grad_norm": 32.7662513325928, "learning_rate": 3.3333333333333333e-06, "loss": 2.0427, "step": 1 }, { "epoch": 0.0311284046692607, "grad_norm": 14.17953242397471, "learning_rate": 6.666666666666667e-06, "loss": 1.8704, "step": 2 }, { "epoch": 0.04669260700389105, "grad_norm": 9.581323731857186, "learning_rate": 1e-05, "loss": 2.0032, "step": 3 }, { "epoch": 0.0622568093385214, "grad_norm": 12.804841760907372, "learning_rate": 1.3333333333333333e-05, "loss": 2.2126, "step": 4 }, { "epoch": 0.07782101167315175, "grad_norm": 8.478925783183458, "learning_rate": 1.6666666666666667e-05, "loss": 1.4769, "step": 5 }, { "epoch": 0.0933852140077821, "grad_norm": 8.969266052384613, "learning_rate": 2e-05, "loss": 1.5752, "step": 6 }, { "epoch": 0.10894941634241245, "grad_norm": 11.83099546071694, "learning_rate": 1.999871626303739e-05, "loss": 1.5588, "step": 7 }, { "epoch": 0.1245136186770428, "grad_norm": 5.954526723326734, "learning_rate": 1.999486541836746e-05, "loss": 1.3619, "step": 8 }, { "epoch": 0.14007782101167315, "grad_norm": 36.01845182362445, "learning_rate": 1.9988448564539475e-05, "loss": 1.3373, "step": 9 }, { "epoch": 0.1556420233463035, "grad_norm": 9.862509446330417, "learning_rate": 1.9979467532120636e-05, "loss": 1.8391, "step": 10 }, { "epoch": 0.17120622568093385, "grad_norm": 6.924375728787968, "learning_rate": 1.99679248831739e-05, "loss": 1.7063, "step": 11 }, { "epoch": 0.1867704280155642, "grad_norm": 5.947859490849787, "learning_rate": 1.9953823910527057e-05, "loss": 1.3137, "step": 12 }, { "epoch": 0.20233463035019456, "grad_norm": 9.376302417818316, "learning_rate": 1.9937168636833405e-05, "loss": 1.4841, "step": 13 }, { "epoch": 0.2178988326848249, "grad_norm": 8.09315137157763, "learning_rate": 1.9917963813424154e-05, "loss": 1.6061, "step": 14 }, { "epoch": 0.23346303501945526, "grad_norm": 12.912871635467674, "learning_rate": 1.9896214918953003e-05, "loss": 1.5974, "step": 15 }, { "epoch": 0.2490272373540856, "grad_norm": 4.890429692775311, "learning_rate": 1.9871928157833235e-05, "loss": 1.3604, "step": 16 }, { "epoch": 0.26459143968871596, "grad_norm": 5.96078805337046, "learning_rate": 1.9845110458467724e-05, "loss": 1.4205, "step": 17 }, { "epoch": 0.2801556420233463, "grad_norm": 5.406312981626749, "learning_rate": 1.981576947127245e-05, "loss": 1.3691, "step": 18 }, { "epoch": 0.29571984435797666, "grad_norm": 6.229338294974077, "learning_rate": 1.978391356649404e-05, "loss": 1.4662, "step": 19 }, { "epoch": 0.311284046692607, "grad_norm": 7.078018638767833, "learning_rate": 1.9749551831821917e-05, "loss": 1.4734, "step": 20 }, { "epoch": 0.32684824902723736, "grad_norm": 11.02898750969552, "learning_rate": 1.971269406979584e-05, "loss": 1.3636, "step": 21 }, { "epoch": 0.3424124513618677, "grad_norm": 6.82235626712361, "learning_rate": 1.9673350795009468e-05, "loss": 1.5607, "step": 22 }, { "epoch": 0.35797665369649806, "grad_norm": 8.481695271045934, "learning_rate": 1.963153323111082e-05, "loss": 1.2882, "step": 23 }, { "epoch": 0.3735408560311284, "grad_norm": 17.63369186879554, "learning_rate": 1.958725330760044e-05, "loss": 1.5022, "step": 24 }, { "epoch": 0.38910505836575876, "grad_norm": 17.68569390194203, "learning_rate": 1.9540523656428223e-05, "loss": 1.2848, "step": 25 }, { "epoch": 0.4046692607003891, "grad_norm": 6.641749083734281, "learning_rate": 1.9491357608389824e-05, "loss": 1.2358, "step": 26 }, { "epoch": 0.42023346303501946, "grad_norm": 23.867198010532427, "learning_rate": 1.9439769189323727e-05, "loss": 1.2404, "step": 27 }, { "epoch": 0.4357976653696498, "grad_norm": 6.690231465484989, "learning_rate": 1.9385773116110015e-05, "loss": 1.3622, "step": 28 }, { "epoch": 0.45136186770428016, "grad_norm": 7.267926238407661, "learning_rate": 1.9329384792472036e-05, "loss": 1.3815, "step": 29 }, { "epoch": 0.4669260700389105, "grad_norm": 7.371143249720988, "learning_rate": 1.9270620304582077e-05, "loss": 1.4497, "step": 30 }, { "epoch": 0.48249027237354086, "grad_norm": 5.739368570493641, "learning_rate": 1.92094964164724e-05, "loss": 1.439, "step": 31 }, { "epoch": 0.4980544747081712, "grad_norm": 10.712092924918982, "learning_rate": 1.9146030565252894e-05, "loss": 1.2185, "step": 32 }, { "epoch": 0.5136186770428015, "grad_norm": 5.745736001033042, "learning_rate": 1.9080240856136675e-05, "loss": 1.4049, "step": 33 }, { "epoch": 0.5291828793774319, "grad_norm": 7.632465200177505, "learning_rate": 1.9012146057275168e-05, "loss": 1.5059, "step": 34 }, { "epoch": 0.5447470817120622, "grad_norm": 8.286309897189899, "learning_rate": 1.8941765594403975e-05, "loss": 1.4689, "step": 35 }, { "epoch": 0.5603112840466926, "grad_norm": 8.480450264883203, "learning_rate": 1.886911954530124e-05, "loss": 1.5331, "step": 36 }, { "epoch": 0.5758754863813229, "grad_norm": 9.499572491510447, "learning_rate": 1.879422863405995e-05, "loss": 1.635, "step": 37 }, { "epoch": 0.5914396887159533, "grad_norm": 9.489789008744502, "learning_rate": 1.8717114225175858e-05, "loss": 1.4844, "step": 38 }, { "epoch": 0.6070038910505836, "grad_norm": 7.2947137328088765, "learning_rate": 1.863779831745276e-05, "loss": 1.4507, "step": 39 }, { "epoch": 0.622568093385214, "grad_norm": 5.913323265251484, "learning_rate": 1.8556303537726753e-05, "loss": 1.6038, "step": 40 }, { "epoch": 0.6381322957198443, "grad_norm": 14.132278657345845, "learning_rate": 1.8472653134411388e-05, "loss": 1.3738, "step": 41 }, { "epoch": 0.6536964980544747, "grad_norm": 7.698733116161007, "learning_rate": 1.8386870970865488e-05, "loss": 1.1948, "step": 42 }, { "epoch": 0.669260700389105, "grad_norm": 6.593898707009616, "learning_rate": 1.8298981518585514e-05, "loss": 1.2161, "step": 43 }, { "epoch": 0.6848249027237354, "grad_norm": 8.732218414278748, "learning_rate": 1.8209009850224465e-05, "loss": 1.3516, "step": 44 }, { "epoch": 0.7003891050583657, "grad_norm": 10.91781491470466, "learning_rate": 1.811698163243929e-05, "loss": 1.3615, "step": 45 }, { "epoch": 0.7159533073929961, "grad_norm": 10.205838168734168, "learning_rate": 1.8022923118568827e-05, "loss": 1.4948, "step": 46 }, { "epoch": 0.7315175097276264, "grad_norm": 17.343787523950684, "learning_rate": 1.7926861141144393e-05, "loss": 1.4923, "step": 47 }, { "epoch": 0.7470817120622568, "grad_norm": 6.218713745426841, "learning_rate": 1.782882310423512e-05, "loss": 1.3801, "step": 48 }, { "epoch": 0.7626459143968871, "grad_norm": 9.930901357054527, "learning_rate": 1.7728836975630283e-05, "loss": 1.4591, "step": 49 }, { "epoch": 0.7782101167315175, "grad_norm": 10.565666651397537, "learning_rate": 1.7626931278860773e-05, "loss": 1.2283, "step": 50 }, { "epoch": 0.7937743190661478, "grad_norm": 7.173094908653802, "learning_rate": 1.752313508506208e-05, "loss": 1.1787, "step": 51 }, { "epoch": 0.8093385214007782, "grad_norm": 7.454713889992053, "learning_rate": 1.7417478004680982e-05, "loss": 1.387, "step": 52 }, { "epoch": 0.8249027237354085, "grad_norm": 8.888727310362047, "learning_rate": 1.730999017902848e-05, "loss": 1.363, "step": 53 }, { "epoch": 0.8404669260700389, "grad_norm": 8.910897747960528, "learning_rate": 1.720070227168118e-05, "loss": 1.4924, "step": 54 }, { "epoch": 0.8560311284046692, "grad_norm": 8.560591680367171, "learning_rate": 1.708964545973382e-05, "loss": 1.5208, "step": 55 }, { "epoch": 0.8715953307392996, "grad_norm": 6.31006260674449, "learning_rate": 1.6976851424905153e-05, "loss": 1.1552, "step": 56 }, { "epoch": 0.8871595330739299, "grad_norm": 15.810845104599778, "learning_rate": 1.6862352344500004e-05, "loss": 1.2454, "step": 57 }, { "epoch": 0.9027237354085603, "grad_norm": 6.767459348182446, "learning_rate": 1.674618088222985e-05, "loss": 1.2886, "step": 58 }, { "epoch": 0.9182879377431906, "grad_norm": 10.51614814940254, "learning_rate": 1.6628370178894734e-05, "loss": 1.2644, "step": 59 }, { "epoch": 0.933852140077821, "grad_norm": 7.047043052174269, "learning_rate": 1.6508953842928966e-05, "loss": 1.443, "step": 60 }, { "epoch": 0.9494163424124513, "grad_norm": 8.579327238483026, "learning_rate": 1.638796594081354e-05, "loss": 1.3322, "step": 61 }, { "epoch": 0.9649805447470817, "grad_norm": 7.0287097887612235, "learning_rate": 1.626544098735777e-05, "loss": 1.4198, "step": 62 }, { "epoch": 0.980544747081712, "grad_norm": 16.693418616763456, "learning_rate": 1.614141393585313e-05, "loss": 1.4243, "step": 63 }, { "epoch": 0.9961089494163424, "grad_norm": 4.4493625007162185, "learning_rate": 1.601592016810193e-05, "loss": 1.0317, "step": 64 }, { "epoch": 1.0, "grad_norm": 4.4493625007162185, "learning_rate": 1.588899548432377e-05, "loss": 0.3818, "step": 65 }, { "epoch": 1.0155642023346303, "grad_norm": 14.571145059681054, "learning_rate": 1.5760676092942663e-05, "loss": 1.3258, "step": 66 }, { "epoch": 1.0311284046692606, "grad_norm": 7.149351806597666, "learning_rate": 1.563099860025766e-05, "loss": 1.2366, "step": 67 }, { "epoch": 1.046692607003891, "grad_norm": 40.50757660945441, "learning_rate": 1.55e-05, "loss": 1.6179, "step": 68 }, { "epoch": 1.0622568093385214, "grad_norm": 15.44110442369705, "learning_rate": 1.5367717662779732e-05, "loss": 1.3405, "step": 69 }, { "epoch": 1.0778210116731517, "grad_norm": 6.675357357971338, "learning_rate": 1.5234189325424802e-05, "loss": 1.1276, "step": 70 }, { "epoch": 1.0933852140077822, "grad_norm": 7.1518156856898605, "learning_rate": 1.5099453080215705e-05, "loss": 1.2737, "step": 71 }, { "epoch": 1.1089494163424125, "grad_norm": 6.2755998715712815, "learning_rate": 1.4963547364018711e-05, "loss": 1.2964, "step": 72 }, { "epoch": 1.1245136186770428, "grad_norm": 7.749171240376019, "learning_rate": 1.4826510947320767e-05, "loss": 1.2542, "step": 73 }, { "epoch": 1.140077821011673, "grad_norm": 8.188433813273727, "learning_rate": 1.4688382923169289e-05, "loss": 1.2587, "step": 74 }, { "epoch": 1.1556420233463034, "grad_norm": 7.386225279732122, "learning_rate": 1.4549202696019868e-05, "loss": 1.3309, "step": 75 }, { "epoch": 1.171206225680934, "grad_norm": 6.253316144967461, "learning_rate": 1.4409009970495184e-05, "loss": 1.3574, "step": 76 }, { "epoch": 1.1867704280155642, "grad_norm": 10.042142885418704, "learning_rate": 1.4267844740058273e-05, "loss": 1.1808, "step": 77 }, { "epoch": 1.2023346303501945, "grad_norm": 8.752169534398908, "learning_rate": 1.4125747275603384e-05, "loss": 1.2535, "step": 78 }, { "epoch": 1.217898832684825, "grad_norm": 5.922268212950014, "learning_rate": 1.3982758113967723e-05, "loss": 1.4928, "step": 79 }, { "epoch": 1.2334630350194553, "grad_norm": 13.340943215095326, "learning_rate": 1.3838918046367302e-05, "loss": 1.5576, "step": 80 }, { "epoch": 1.2490272373540856, "grad_norm": 11.447188182283101, "learning_rate": 1.3694268106760225e-05, "loss": 1.3702, "step": 81 }, { "epoch": 1.264591439688716, "grad_norm": 8.785930153191286, "learning_rate": 1.3548849560140735e-05, "loss": 1.5769, "step": 82 }, { "epoch": 1.2801556420233462, "grad_norm": 11.289308481687042, "learning_rate": 1.3402703890767365e-05, "loss": 1.4041, "step": 83 }, { "epoch": 1.2957198443579767, "grad_norm": 6.0701513028110865, "learning_rate": 1.3255872790328485e-05, "loss": 1.2474, "step": 84 }, { "epoch": 1.311284046692607, "grad_norm": 15.420406437695464, "learning_rate": 1.310839814604874e-05, "loss": 1.3971, "step": 85 }, { "epoch": 1.3268482490272373, "grad_norm": 11.112901019437691, "learning_rate": 1.2960322028739664e-05, "loss": 1.292, "step": 86 }, { "epoch": 1.3424124513618678, "grad_norm": 6.574313635072488, "learning_rate": 1.2811686680797942e-05, "loss": 1.5592, "step": 87 }, { "epoch": 1.3579766536964981, "grad_norm": 19.9661703497788, "learning_rate": 1.2662534504154707e-05, "loss": 1.5115, "step": 88 }, { "epoch": 1.3735408560311284, "grad_norm": 18.38165853721984, "learning_rate": 1.2512908048179336e-05, "loss": 1.5681, "step": 89 }, { "epoch": 1.3891050583657587, "grad_norm": 7.567686089019195, "learning_rate": 1.236284999754119e-05, "loss": 1.2417, "step": 90 }, { "epoch": 1.404669260700389, "grad_norm": 12.905705238689295, "learning_rate": 1.221240316003275e-05, "loss": 1.2854, "step": 91 }, { "epoch": 1.4202334630350195, "grad_norm": 20.818267922715442, "learning_rate": 1.2061610454357618e-05, "loss": 1.5286, "step": 92 }, { "epoch": 1.4357976653696498, "grad_norm": 6.109213052045277, "learning_rate": 1.1910514897886892e-05, "loss": 1.3168, "step": 93 }, { "epoch": 1.45136186770428, "grad_norm": 15.101872320411488, "learning_rate": 1.1759159594387404e-05, "loss": 1.5504, "step": 94 }, { "epoch": 1.4669260700389106, "grad_norm": 6.233434571455187, "learning_rate": 1.1607587721725288e-05, "loss": 1.5917, "step": 95 }, { "epoch": 1.482490272373541, "grad_norm": 9.122907207075865, "learning_rate": 1.1455842519548417e-05, "loss": 1.53, "step": 96 }, { "epoch": 1.4980544747081712, "grad_norm": 7.271922289011854, "learning_rate": 1.1303967276951215e-05, "loss": 1.3232, "step": 97 }, { "epoch": 1.5136186770428015, "grad_norm": 7.525926983479133, "learning_rate": 1.115200532012538e-05, "loss": 1.434, "step": 98 }, { "epoch": 1.5291828793774318, "grad_norm": 13.459453151432884, "learning_rate": 1.1000000000000001e-05, "loss": 1.2062, "step": 99 }, { "epoch": 1.544747081712062, "grad_norm": 6.34554520110176, "learning_rate": 1.0847994679874623e-05, "loss": 1.2515, "step": 100 }, { "epoch": 1.5603112840466926, "grad_norm": 8.913386857418164, "learning_rate": 1.0696032723048787e-05, "loss": 1.2267, "step": 101 }, { "epoch": 1.575875486381323, "grad_norm": 9.547469800185302, "learning_rate": 1.0544157480451586e-05, "loss": 1.1735, "step": 102 }, { "epoch": 1.5914396887159534, "grad_norm": 7.932054956466567, "learning_rate": 1.0392412278274714e-05, "loss": 1.205, "step": 103 }, { "epoch": 1.6070038910505837, "grad_norm": 9.208669039900636, "learning_rate": 1.02408404056126e-05, "loss": 1.1383, "step": 104 }, { "epoch": 1.622568093385214, "grad_norm": 7.709935193825099, "learning_rate": 1.0089485102113113e-05, "loss": 1.4121, "step": 105 }, { "epoch": 1.6381322957198443, "grad_norm": 6.819923905554452, "learning_rate": 9.938389545642388e-06, "loss": 1.3696, "step": 106 }, { "epoch": 1.6536964980544746, "grad_norm": 8.206400329676246, "learning_rate": 9.787596839967254e-06, "loss": 1.3651, "step": 107 }, { "epoch": 1.669260700389105, "grad_norm": 10.260363436595911, "learning_rate": 9.637150002458813e-06, "loss": 1.2666, "step": 108 }, { "epoch": 1.6848249027237354, "grad_norm": 26.767196552606528, "learning_rate": 9.487091951820669e-06, "loss": 1.479, "step": 109 }, { "epoch": 1.7003891050583657, "grad_norm": 7.51563263665608, "learning_rate": 9.337465495845299e-06, "loss": 1.2219, "step": 110 }, { "epoch": 1.7159533073929962, "grad_norm": 12.755450160808891, "learning_rate": 9.188313319202057e-06, "loss": 1.4279, "step": 111 }, { "epoch": 1.7315175097276265, "grad_norm": 6.349610495085197, "learning_rate": 9.039677971260337e-06, "loss": 1.4551, "step": 112 }, { "epoch": 1.7470817120622568, "grad_norm": 8.917885244170789, "learning_rate": 8.891601853951262e-06, "loss": 1.2766, "step": 113 }, { "epoch": 1.7626459143968871, "grad_norm": 13.766988130785693, "learning_rate": 8.744127209671516e-06, "loss": 1.2214, "step": 114 }, { "epoch": 1.7782101167315174, "grad_norm": 15.79931988964264, "learning_rate": 8.597296109232636e-06, "loss": 1.2607, "step": 115 }, { "epoch": 1.7937743190661477, "grad_norm": 27.982585836161547, "learning_rate": 8.451150439859264e-06, "loss": 1.213, "step": 116 }, { "epoch": 1.8093385214007782, "grad_norm": 6.366753369802077, "learning_rate": 8.30573189323978e-06, "loss": 1.4226, "step": 117 }, { "epoch": 1.8249027237354085, "grad_norm": 8.762108357477535, "learning_rate": 8.161081953632701e-06, "loss": 1.2593, "step": 118 }, { "epoch": 1.840466926070039, "grad_norm": 9.6117893036037, "learning_rate": 8.01724188603228e-06, "loss": 1.625, "step": 119 }, { "epoch": 1.8560311284046693, "grad_norm": 25.20227315878213, "learning_rate": 7.87425272439662e-06, "loss": 1.7573, "step": 120 }, { "epoch": 1.8715953307392996, "grad_norm": 18.50697294390281, "learning_rate": 7.732155259941729e-06, "loss": 1.3655, "step": 121 }, { "epoch": 1.88715953307393, "grad_norm": 11.419496186766294, "learning_rate": 7.590990029504816e-06, "loss": 1.2208, "step": 122 }, { "epoch": 1.9027237354085602, "grad_norm": 12.960895376933111, "learning_rate": 7.450797303980135e-06, "loss": 1.1531, "step": 123 }, { "epoch": 1.9182879377431905, "grad_norm": 10.895377744831976, "learning_rate": 7.311617076830715e-06, "loss": 1.2867, "step": 124 }, { "epoch": 1.933852140077821, "grad_norm": 9.198976531684588, "learning_rate": 7.173489052679236e-06, "loss": 1.3783, "step": 125 }, { "epoch": 1.9494163424124513, "grad_norm": 7.6941427655967365, "learning_rate": 7.0364526359812924e-06, "loss": 1.5269, "step": 126 }, { "epoch": 1.9649805447470818, "grad_norm": 7.578325575993518, "learning_rate": 6.900546919784295e-06, "loss": 1.479, "step": 127 }, { "epoch": 1.9805447470817121, "grad_norm": 9.809861880346132, "learning_rate": 6.7658106745752015e-06, "loss": 1.3796, "step": 128 }, { "epoch": 1.9961089494163424, "grad_norm": 11.809424540063796, "learning_rate": 6.632282337220272e-06, "loss": 1.8018, "step": 129 }, { "epoch": 2.0, "grad_norm": 11.809424540063796, "learning_rate": 6.500000000000003e-06, "loss": 0.3816, "step": 130 }, { "epoch": 2.0155642023346303, "grad_norm": 10.863741493343777, "learning_rate": 6.369001399742344e-06, "loss": 1.2037, "step": 131 }, { "epoch": 2.0311284046692606, "grad_norm": 5.962018084798566, "learning_rate": 6.239323907057342e-06, "loss": 0.9657, "step": 132 }, { "epoch": 2.046692607003891, "grad_norm": 4.788797901834975, "learning_rate": 6.1110045156762355e-06, "loss": 1.1664, "step": 133 }, { "epoch": 2.062256809338521, "grad_norm": 6.2976648307960446, "learning_rate": 5.984079831898073e-06, "loss": 1.4275, "step": 134 }, { "epoch": 2.077821011673152, "grad_norm": 11.390981419662713, "learning_rate": 5.8585860641468674e-06, "loss": 1.1395, "step": 135 }, { "epoch": 2.093385214007782, "grad_norm": 6.57931364685089, "learning_rate": 5.7345590126422315e-06, "loss": 1.2979, "step": 136 }, { "epoch": 2.1089494163424125, "grad_norm": 9.124550827147443, "learning_rate": 5.612034059186464e-06, "loss": 1.5149, "step": 137 }, { "epoch": 2.124513618677043, "grad_norm": 7.9154220919147145, "learning_rate": 5.491046157071034e-06, "loss": 1.2253, "step": 138 }, { "epoch": 2.140077821011673, "grad_norm": 9.043495071655409, "learning_rate": 5.37162982110527e-06, "loss": 1.2771, "step": 139 }, { "epoch": 2.1556420233463034, "grad_norm": 7.246287181191101, "learning_rate": 5.253819117770149e-06, "loss": 1.28, "step": 140 }, { "epoch": 2.1712062256809337, "grad_norm": 14.135867016035936, "learning_rate": 5.137647655500002e-06, "loss": 1.2389, "step": 141 }, { "epoch": 2.1867704280155644, "grad_norm": 6.191000880702552, "learning_rate": 5.023148575094847e-06, "loss": 1.3685, "step": 142 }, { "epoch": 2.2023346303501947, "grad_norm": 6.912497051747966, "learning_rate": 4.910354540266184e-06, "loss": 1.1248, "step": 143 }, { "epoch": 2.217898832684825, "grad_norm": 7.634399054473615, "learning_rate": 4.799297728318821e-06, "loss": 1.2091, "step": 144 }, { "epoch": 2.2334630350194553, "grad_norm": 8.654404179498275, "learning_rate": 4.690009820971527e-06, "loss": 1.2775, "step": 145 }, { "epoch": 2.2490272373540856, "grad_norm": 5.486520320319506, "learning_rate": 4.582521995319019e-06, "loss": 1.3234, "step": 146 }, { "epoch": 2.264591439688716, "grad_norm": 14.08610341346389, "learning_rate": 4.476864914937923e-06, "loss": 1.1865, "step": 147 }, { "epoch": 2.280155642023346, "grad_norm": 11.693098373701448, "learning_rate": 4.373068721139227e-06, "loss": 1.4238, "step": 148 }, { "epoch": 2.2957198443579765, "grad_norm": 9.49971316853895, "learning_rate": 4.271163024369722e-06, "loss": 1.1235, "step": 149 }, { "epoch": 2.311284046692607, "grad_norm": 7.887870435405849, "learning_rate": 4.171176895764882e-06, "loss": 1.1697, "step": 150 }, { "epoch": 2.3268482490272375, "grad_norm": 7.29104535243051, "learning_rate": 4.07313885885561e-06, "loss": 1.4309, "step": 151 }, { "epoch": 2.342412451361868, "grad_norm": 10.320755505362524, "learning_rate": 3.977076881431175e-06, "loss": 1.3613, "step": 152 }, { "epoch": 2.357976653696498, "grad_norm": 5.857317648653641, "learning_rate": 3.883018367560715e-06, "loss": 1.3462, "step": 153 }, { "epoch": 2.3735408560311284, "grad_norm": 9.13512553833875, "learning_rate": 3.7909901497755408e-06, "loss": 1.3862, "step": 154 }, { "epoch": 2.3891050583657587, "grad_norm": 6.878082659822421, "learning_rate": 3.7010184814144916e-06, "loss": 1.3616, "step": 155 }, { "epoch": 2.404669260700389, "grad_norm": 5.910003013217337, "learning_rate": 3.6131290291345155e-06, "loss": 1.3136, "step": 156 }, { "epoch": 2.4202334630350193, "grad_norm": 9.041793350178478, "learning_rate": 3.527346865588614e-06, "loss": 1.2654, "step": 157 }, { "epoch": 2.43579766536965, "grad_norm": 9.955462288729418, "learning_rate": 3.4436964622732493e-06, "loss": 1.3949, "step": 158 }, { "epoch": 2.4513618677042803, "grad_norm": 12.124015441070618, "learning_rate": 3.3622016825472414e-06, "loss": 1.3149, "step": 159 }, { "epoch": 2.4669260700389106, "grad_norm": 5.944228819887684, "learning_rate": 3.2828857748241404e-06, "loss": 1.3735, "step": 160 }, { "epoch": 2.482490272373541, "grad_norm": 7.465704659945909, "learning_rate": 3.205771365940052e-06, "loss": 1.1572, "step": 161 }, { "epoch": 2.498054474708171, "grad_norm": 8.012935003044838, "learning_rate": 3.1308804546987615e-06, "loss": 1.2964, "step": 162 }, { "epoch": 2.5136186770428015, "grad_norm": 6.396153961978255, "learning_rate": 3.058234405596029e-06, "loss": 1.2518, "step": 163 }, { "epoch": 2.529182879377432, "grad_norm": 32.66960667166894, "learning_rate": 2.9878539427248364e-06, "loss": 1.3154, "step": 164 }, { "epoch": 2.544747081712062, "grad_norm": 7.182443079061496, "learning_rate": 2.919759143863326e-06, "loss": 1.2754, "step": 165 }, { "epoch": 2.5603112840466924, "grad_norm": 9.065894651134005, "learning_rate": 2.8539694347471093e-06, "loss": 1.5717, "step": 166 }, { "epoch": 2.5758754863813227, "grad_norm": 8.155108121011244, "learning_rate": 2.7905035835276e-06, "loss": 1.1931, "step": 167 }, { "epoch": 2.5914396887159534, "grad_norm": 10.525703817651328, "learning_rate": 2.7293796954179254e-06, "loss": 1.2438, "step": 168 }, { "epoch": 2.6070038910505837, "grad_norm": 10.790702057048689, "learning_rate": 2.670615207527965e-06, "loss": 1.2728, "step": 169 }, { "epoch": 2.622568093385214, "grad_norm": 6.486978399127539, "learning_rate": 2.6142268838899844e-06, "loss": 1.3483, "step": 170 }, { "epoch": 2.6381322957198443, "grad_norm": 11.399298263764798, "learning_rate": 2.5602308106762773e-06, "loss": 1.4894, "step": 171 }, { "epoch": 2.6536964980544746, "grad_norm": 8.803157004201939, "learning_rate": 2.5086423916101794e-06, "loss": 1.5442, "step": 172 }, { "epoch": 2.669260700389105, "grad_norm": 7.898925628777204, "learning_rate": 2.4594763435717788e-06, "loss": 1.3132, "step": 173 }, { "epoch": 2.6848249027237356, "grad_norm": 6.01024097821099, "learning_rate": 2.412746692399561e-06, "loss": 1.3329, "step": 174 }, { "epoch": 2.700389105058366, "grad_norm": 9.287448544998288, "learning_rate": 2.3684667688891813e-06, "loss": 1.2279, "step": 175 }, { "epoch": 2.7159533073929962, "grad_norm": 6.509435104920216, "learning_rate": 2.3266492049905327e-06, "loss": 1.1356, "step": 176 }, { "epoch": 2.7315175097276265, "grad_norm": 7.274526660218056, "learning_rate": 2.2873059302041627e-06, "loss": 1.2053, "step": 177 }, { "epoch": 2.747081712062257, "grad_norm": 7.528171905726772, "learning_rate": 2.250448168178085e-06, "loss": 1.2631, "step": 178 }, { "epoch": 2.762645914396887, "grad_norm": 7.685404987229149, "learning_rate": 2.216086433505963e-06, "loss": 1.1471, "step": 179 }, { "epoch": 2.7782101167315174, "grad_norm": 7.30724911654224, "learning_rate": 2.18423052872755e-06, "loss": 1.1335, "step": 180 }, { "epoch": 2.7937743190661477, "grad_norm": 8.275377130204019, "learning_rate": 2.154889541532279e-06, "loss": 1.4331, "step": 181 }, { "epoch": 2.809338521400778, "grad_norm": 8.26916689050228, "learning_rate": 2.128071842166766e-06, "loss": 1.1323, "step": 182 }, { "epoch": 2.8249027237354083, "grad_norm": 8.648891670292945, "learning_rate": 2.1037850810469977e-06, "loss": 1.0748, "step": 183 }, { "epoch": 2.840466926070039, "grad_norm": 8.864628121919203, "learning_rate": 2.0820361865758506e-06, "loss": 1.2159, "step": 184 }, { "epoch": 2.8560311284046693, "grad_norm": 7.550747013009374, "learning_rate": 2.0628313631665977e-06, "loss": 1.1746, "step": 185 }, { "epoch": 2.8715953307392996, "grad_norm": 5.4396609218382075, "learning_rate": 2.0461760894729438e-06, "loss": 1.1403, "step": 186 }, { "epoch": 2.88715953307393, "grad_norm": 9.061879291284523, "learning_rate": 2.032075116826103e-06, "loss": 1.5448, "step": 187 }, { "epoch": 2.90272373540856, "grad_norm": 6.124913133747852, "learning_rate": 2.0205324678793635e-06, "loss": 1.1864, "step": 188 }, { "epoch": 2.9182879377431905, "grad_norm": 7.03064717545691, "learning_rate": 2.0115514354605255e-06, "loss": 1.3855, "step": 189 }, { "epoch": 2.9338521400778212, "grad_norm": 8.91521754504974, "learning_rate": 2.005134581632538e-06, "loss": 1.3689, "step": 190 }, { "epoch": 2.9494163424124515, "grad_norm": 8.561228532947991, "learning_rate": 2.001283736962612e-06, "loss": 1.5862, "step": 191 }, { "epoch": 2.964980544747082, "grad_norm": 10.281179103024385, "learning_rate": 2.0000000000000003e-06, "loss": 1.1926, "step": 192 }, { "epoch": 2.964980544747082, "step": 192, "total_flos": 104900150247424.0, "train_loss": 1.356536865234375, "train_runtime": 15762.2393, "train_samples_per_second": 1.571, "train_steps_per_second": 0.012 } ], "logging_steps": 1.0, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 104900150247424.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }