{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.652075471698113, "eval_steps": 500, "global_step": 270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.036226415094339624, "grad_norm": 4.074844837188721, "learning_rate": 3.7037037037037036e-07, "loss": 0.9495, "step": 1 }, { "epoch": 0.07245283018867925, "grad_norm": 4.034702301025391, "learning_rate": 7.407407407407407e-07, "loss": 0.9536, "step": 2 }, { "epoch": 0.10867924528301887, "grad_norm": 4.080567359924316, "learning_rate": 1.111111111111111e-06, "loss": 0.9267, "step": 3 }, { "epoch": 0.1449056603773585, "grad_norm": 3.9866671562194824, "learning_rate": 1.4814814814814815e-06, "loss": 0.95, "step": 4 }, { "epoch": 0.1811320754716981, "grad_norm": 4.042410373687744, "learning_rate": 1.8518518518518519e-06, "loss": 0.995, "step": 5 }, { "epoch": 0.21735849056603773, "grad_norm": 4.1409783363342285, "learning_rate": 2.222222222222222e-06, "loss": 0.9579, "step": 6 }, { "epoch": 0.25358490566037734, "grad_norm": 3.998582601547241, "learning_rate": 2.5925925925925925e-06, "loss": 0.9625, "step": 7 }, { "epoch": 0.289811320754717, "grad_norm": 4.141905307769775, "learning_rate": 2.962962962962963e-06, "loss": 0.9506, "step": 8 }, { "epoch": 0.3260377358490566, "grad_norm": 4.01569128036499, "learning_rate": 3.3333333333333333e-06, "loss": 0.9591, "step": 9 }, { "epoch": 0.3622641509433962, "grad_norm": 3.662905216217041, "learning_rate": 3.7037037037037037e-06, "loss": 0.9519, "step": 10 }, { "epoch": 0.39849056603773586, "grad_norm": 2.892972707748413, "learning_rate": 4.074074074074074e-06, "loss": 0.9484, "step": 11 }, { "epoch": 0.43471698113207546, "grad_norm": 2.5980327129364014, "learning_rate": 4.444444444444444e-06, "loss": 0.9322, "step": 12 }, { "epoch": 0.47094339622641507, "grad_norm": 2.2705729007720947, "learning_rate": 4.814814814814815e-06, "loss": 0.9016, "step": 13 }, { "epoch": 0.5071698113207547, "grad_norm": 2.027101755142212, "learning_rate": 5.185185185185185e-06, "loss": 0.9084, "step": 14 }, { "epoch": 0.5433962264150943, "grad_norm": 1.2326773405075073, "learning_rate": 5.555555555555557e-06, "loss": 0.9035, "step": 15 }, { "epoch": 0.579622641509434, "grad_norm": 1.5498149394989014, "learning_rate": 5.925925925925926e-06, "loss": 0.8692, "step": 16 }, { "epoch": 0.6158490566037735, "grad_norm": 1.6801735162734985, "learning_rate": 6.296296296296297e-06, "loss": 0.8736, "step": 17 }, { "epoch": 0.6520754716981132, "grad_norm": 1.998213529586792, "learning_rate": 6.666666666666667e-06, "loss": 0.8728, "step": 18 }, { "epoch": 0.6883018867924529, "grad_norm": 1.8951051235198975, "learning_rate": 7.0370370370370375e-06, "loss": 0.8715, "step": 19 }, { "epoch": 0.7245283018867924, "grad_norm": 1.7730218172073364, "learning_rate": 7.4074074074074075e-06, "loss": 0.8358, "step": 20 }, { "epoch": 0.7607547169811321, "grad_norm": 1.758504867553711, "learning_rate": 7.77777777777778e-06, "loss": 0.8289, "step": 21 }, { "epoch": 0.7969811320754717, "grad_norm": 1.978657841682434, "learning_rate": 8.148148148148148e-06, "loss": 0.8432, "step": 22 }, { "epoch": 0.8332075471698113, "grad_norm": 1.5869137048721313, "learning_rate": 8.518518518518519e-06, "loss": 0.8255, "step": 23 }, { "epoch": 0.8694339622641509, "grad_norm": 1.3577728271484375, "learning_rate": 8.888888888888888e-06, "loss": 0.8272, "step": 24 }, { "epoch": 0.9056603773584906, "grad_norm": 0.984247088432312, "learning_rate": 9.25925925925926e-06, "loss": 0.8071, "step": 25 }, { "epoch": 0.9418867924528301, "grad_norm": 0.8904944062232971, "learning_rate": 9.62962962962963e-06, "loss": 0.8054, "step": 26 }, { "epoch": 0.9781132075471698, "grad_norm": 0.8401544094085693, "learning_rate": 1e-05, "loss": 0.8313, "step": 27 }, { "epoch": 1.0, "grad_norm": 0.8401544094085693, "learning_rate": 9.999582149277188e-06, "loss": 0.8047, "step": 28 }, { "epoch": 1.0362264150943397, "grad_norm": 1.6306538581848145, "learning_rate": 9.998328666948437e-06, "loss": 0.8103, "step": 29 }, { "epoch": 1.0724528301886793, "grad_norm": 1.0208637714385986, "learning_rate": 9.996239762521152e-06, "loss": 0.7616, "step": 30 }, { "epoch": 1.1086792452830188, "grad_norm": 1.045002818107605, "learning_rate": 9.993315785135417e-06, "loss": 0.7736, "step": 31 }, { "epoch": 1.1449056603773584, "grad_norm": 0.8551422357559204, "learning_rate": 9.989557223505661e-06, "loss": 0.762, "step": 32 }, { "epoch": 1.181132075471698, "grad_norm": 0.7436487078666687, "learning_rate": 9.98496470583896e-06, "loss": 0.7793, "step": 33 }, { "epoch": 1.2173584905660377, "grad_norm": 0.6822688579559326, "learning_rate": 9.979538999730047e-06, "loss": 0.7728, "step": 34 }, { "epoch": 1.2535849056603774, "grad_norm": 0.6098222732543945, "learning_rate": 9.973281012033009e-06, "loss": 0.7362, "step": 35 }, { "epoch": 1.289811320754717, "grad_norm": 0.6669561266899109, "learning_rate": 9.966191788709716e-06, "loss": 0.7052, "step": 36 }, { "epoch": 1.3260377358490567, "grad_norm": 0.6505219340324402, "learning_rate": 9.958272514655006e-06, "loss": 0.7609, "step": 37 }, { "epoch": 1.3622641509433961, "grad_norm": 0.737601637840271, "learning_rate": 9.949524513498636e-06, "loss": 0.7477, "step": 38 }, { "epoch": 1.3984905660377358, "grad_norm": 0.5156015753746033, "learning_rate": 9.939949247384046e-06, "loss": 0.7554, "step": 39 }, { "epoch": 1.4347169811320755, "grad_norm": 0.5583709478378296, "learning_rate": 9.929548316723983e-06, "loss": 0.7526, "step": 40 }, { "epoch": 1.4709433962264151, "grad_norm": 0.5988060832023621, "learning_rate": 9.918323459933006e-06, "loss": 0.7516, "step": 41 }, { "epoch": 1.5071698113207548, "grad_norm": 0.5599033832550049, "learning_rate": 9.906276553136924e-06, "loss": 0.7294, "step": 42 }, { "epoch": 1.5433962264150942, "grad_norm": 0.6027616858482361, "learning_rate": 9.893409609859221e-06, "loss": 0.72, "step": 43 }, { "epoch": 1.579622641509434, "grad_norm": 0.6933006048202515, "learning_rate": 9.879724780684518e-06, "loss": 0.7176, "step": 44 }, { "epoch": 1.6158490566037735, "grad_norm": 0.5711022615432739, "learning_rate": 9.86522435289912e-06, "loss": 0.716, "step": 45 }, { "epoch": 1.6520754716981132, "grad_norm": 0.48980799317359924, "learning_rate": 9.849910750108718e-06, "loss": 0.7333, "step": 46 }, { "epoch": 1.6883018867924529, "grad_norm": 0.4940769076347351, "learning_rate": 9.833786531833311e-06, "loss": 0.7368, "step": 47 }, { "epoch": 1.7245283018867923, "grad_norm": 0.4914894998073578, "learning_rate": 9.816854393079402e-06, "loss": 0.7323, "step": 48 }, { "epoch": 1.7607547169811322, "grad_norm": 0.5431811213493347, "learning_rate": 9.79911716388956e-06, "loss": 0.7226, "step": 49 }, { "epoch": 1.7969811320754716, "grad_norm": 0.4794186055660248, "learning_rate": 9.7805778088694e-06, "loss": 0.7277, "step": 50 }, { "epoch": 1.8332075471698113, "grad_norm": 0.5294991731643677, "learning_rate": 9.761239426692077e-06, "loss": 0.7193, "step": 51 }, { "epoch": 1.869433962264151, "grad_norm": 0.48333099484443665, "learning_rate": 9.741105249580383e-06, "loss": 0.6936, "step": 52 }, { "epoch": 1.9056603773584906, "grad_norm": 0.42380550503730774, "learning_rate": 9.7201786427665e-06, "loss": 0.7183, "step": 53 }, { "epoch": 1.9418867924528302, "grad_norm": 0.49902644753456116, "learning_rate": 9.698463103929542e-06, "loss": 0.7288, "step": 54 }, { "epoch": 1.9781132075471697, "grad_norm": 0.44469350576400757, "learning_rate": 9.67596226261095e-06, "loss": 0.7469, "step": 55 }, { "epoch": 2.0, "grad_norm": 0.6417510509490967, "learning_rate": 9.652679879607843e-06, "loss": 0.6903, "step": 56 }, { "epoch": 2.0362264150943394, "grad_norm": 0.5208938121795654, "learning_rate": 9.628619846344453e-06, "loss": 0.6569, "step": 57 }, { "epoch": 2.0724528301886793, "grad_norm": 0.41403067111968994, "learning_rate": 9.603786184221693e-06, "loss": 0.6837, "step": 58 }, { "epoch": 2.1086792452830188, "grad_norm": 0.4388677775859833, "learning_rate": 9.578183043945031e-06, "loss": 0.6577, "step": 59 }, { "epoch": 2.1449056603773586, "grad_norm": 0.38680893182754517, "learning_rate": 9.551814704830734e-06, "loss": 0.6647, "step": 60 }, { "epoch": 2.181132075471698, "grad_norm": 0.41955962777137756, "learning_rate": 9.524685574090627e-06, "loss": 0.6414, "step": 61 }, { "epoch": 2.2173584905660375, "grad_norm": 0.5132808089256287, "learning_rate": 9.496800186095466e-06, "loss": 0.6397, "step": 62 }, { "epoch": 2.2535849056603774, "grad_norm": 0.41187506914138794, "learning_rate": 9.468163201617063e-06, "loss": 0.6654, "step": 63 }, { "epoch": 2.289811320754717, "grad_norm": 0.40094566345214844, "learning_rate": 9.438779407049282e-06, "loss": 0.6483, "step": 64 }, { "epoch": 2.3260377358490567, "grad_norm": 0.42385968565940857, "learning_rate": 9.40865371360804e-06, "loss": 0.6382, "step": 65 }, { "epoch": 2.362264150943396, "grad_norm": 0.42505592107772827, "learning_rate": 9.377791156510456e-06, "loss": 0.621, "step": 66 }, { "epoch": 2.398490566037736, "grad_norm": 0.4004240036010742, "learning_rate": 9.346196894133239e-06, "loss": 0.6137, "step": 67 }, { "epoch": 2.4347169811320755, "grad_norm": 0.44593772292137146, "learning_rate": 9.313876207150544e-06, "loss": 0.6299, "step": 68 }, { "epoch": 2.470943396226415, "grad_norm": 0.40830758213996887, "learning_rate": 9.280834497651334e-06, "loss": 0.6504, "step": 69 }, { "epoch": 2.507169811320755, "grad_norm": 0.4714842438697815, "learning_rate": 9.247077288236488e-06, "loss": 0.626, "step": 70 }, { "epoch": 2.543396226415094, "grad_norm": 0.4523302912712097, "learning_rate": 9.212610221095748e-06, "loss": 0.6386, "step": 71 }, { "epoch": 2.579622641509434, "grad_norm": 0.4274156093597412, "learning_rate": 9.177439057064684e-06, "loss": 0.6368, "step": 72 }, { "epoch": 2.6158490566037735, "grad_norm": 0.3964768350124359, "learning_rate": 9.141569674661816e-06, "loss": 0.6161, "step": 73 }, { "epoch": 2.6520754716981134, "grad_norm": 0.4652266502380371, "learning_rate": 9.105008069106093e-06, "loss": 0.6208, "step": 74 }, { "epoch": 2.688301886792453, "grad_norm": 0.4457603394985199, "learning_rate": 9.067760351314838e-06, "loss": 0.6407, "step": 75 }, { "epoch": 2.7245283018867923, "grad_norm": 0.47072410583496094, "learning_rate": 9.029832746882372e-06, "loss": 0.6345, "step": 76 }, { "epoch": 2.760754716981132, "grad_norm": 0.41980573534965515, "learning_rate": 8.991231595039464e-06, "loss": 0.6242, "step": 77 }, { "epoch": 2.7969811320754716, "grad_norm": 0.8126304745674133, "learning_rate": 8.951963347593797e-06, "loss": 0.6368, "step": 78 }, { "epoch": 2.8332075471698115, "grad_norm": 0.4141775667667389, "learning_rate": 8.9120345678516e-06, "loss": 0.6299, "step": 79 }, { "epoch": 2.869433962264151, "grad_norm": 0.4654167592525482, "learning_rate": 8.871451929520662e-06, "loss": 0.6064, "step": 80 }, { "epoch": 2.9056603773584904, "grad_norm": 0.4674675464630127, "learning_rate": 8.83022221559489e-06, "loss": 0.6252, "step": 81 }, { "epoch": 2.9418867924528302, "grad_norm": 0.46518832445144653, "learning_rate": 8.78835231722059e-06, "loss": 0.6304, "step": 82 }, { "epoch": 2.9781132075471697, "grad_norm": 0.40669646859169006, "learning_rate": 8.74584923254468e-06, "loss": 0.6323, "step": 83 }, { "epoch": 3.0, "grad_norm": 0.40669646859169006, "learning_rate": 8.702720065545024e-06, "loss": 0.6371, "step": 84 }, { "epoch": 3.0362264150943394, "grad_norm": 0.5793619155883789, "learning_rate": 8.658972024843063e-06, "loss": 0.5837, "step": 85 }, { "epoch": 3.0724528301886793, "grad_norm": 0.42956608533859253, "learning_rate": 8.614612422498965e-06, "loss": 0.5657, "step": 86 }, { "epoch": 3.1086792452830188, "grad_norm": 0.3798193037509918, "learning_rate": 8.569648672789496e-06, "loss": 0.5788, "step": 87 }, { "epoch": 3.1449056603773586, "grad_norm": 0.43145015835762024, "learning_rate": 8.524088290968781e-06, "loss": 0.57, "step": 88 }, { "epoch": 3.181132075471698, "grad_norm": 0.40505528450012207, "learning_rate": 8.477938892012209e-06, "loss": 0.5556, "step": 89 }, { "epoch": 3.2173584905660375, "grad_norm": 0.36875954270362854, "learning_rate": 8.43120818934367e-06, "loss": 0.5566, "step": 90 }, { "epoch": 3.2535849056603774, "grad_norm": 0.45734313130378723, "learning_rate": 8.38390399354631e-06, "loss": 0.5388, "step": 91 }, { "epoch": 3.289811320754717, "grad_norm": 0.40550005435943604, "learning_rate": 8.336034211057098e-06, "loss": 0.5569, "step": 92 }, { "epoch": 3.3260377358490567, "grad_norm": 0.3886796832084656, "learning_rate": 8.28760684284532e-06, "loss": 0.546, "step": 93 }, { "epoch": 3.362264150943396, "grad_norm": 0.39407268166542053, "learning_rate": 8.238629983075296e-06, "loss": 0.5513, "step": 94 }, { "epoch": 3.398490566037736, "grad_norm": 0.44064226746559143, "learning_rate": 8.18911181775353e-06, "loss": 0.5524, "step": 95 }, { "epoch": 3.4347169811320755, "grad_norm": 0.45655596256256104, "learning_rate": 8.139060623360494e-06, "loss": 0.5402, "step": 96 }, { "epoch": 3.470943396226415, "grad_norm": 0.44399574398994446, "learning_rate": 8.088484765467286e-06, "loss": 0.5403, "step": 97 }, { "epoch": 3.507169811320755, "grad_norm": 0.46749594807624817, "learning_rate": 8.037392697337418e-06, "loss": 0.5609, "step": 98 }, { "epoch": 3.543396226415094, "grad_norm": 0.39268767833709717, "learning_rate": 7.985792958513932e-06, "loss": 0.5616, "step": 99 }, { "epoch": 3.579622641509434, "grad_norm": 0.4343853294849396, "learning_rate": 7.93369417339209e-06, "loss": 0.5493, "step": 100 }, { "epoch": 3.6158490566037735, "grad_norm": 0.4399295151233673, "learning_rate": 7.881105049777902e-06, "loss": 0.5118, "step": 101 }, { "epoch": 3.6520754716981134, "grad_norm": 0.5278156995773315, "learning_rate": 7.828034377432694e-06, "loss": 0.5592, "step": 102 }, { "epoch": 3.688301886792453, "grad_norm": 0.46402445435523987, "learning_rate": 7.774491026603985e-06, "loss": 0.5389, "step": 103 }, { "epoch": 3.7245283018867923, "grad_norm": 0.4599299430847168, "learning_rate": 7.720483946542913e-06, "loss": 0.5429, "step": 104 }, { "epoch": 3.760754716981132, "grad_norm": 0.4408857822418213, "learning_rate": 7.666022164008458e-06, "loss": 0.557, "step": 105 }, { "epoch": 3.7969811320754716, "grad_norm": 0.41980406641960144, "learning_rate": 7.6111147817586925e-06, "loss": 0.5434, "step": 106 }, { "epoch": 3.8332075471698115, "grad_norm": 0.5317021012306213, "learning_rate": 7.5557709770293664e-06, "loss": 0.552, "step": 107 }, { "epoch": 3.869433962264151, "grad_norm": 0.5827487111091614, "learning_rate": 7.500000000000001e-06, "loss": 0.5203, "step": 108 }, { "epoch": 3.9056603773584904, "grad_norm": 0.4034237265586853, "learning_rate": 7.443811172247822e-06, "loss": 0.5418, "step": 109 }, { "epoch": 3.9418867924528302, "grad_norm": 0.4142571985721588, "learning_rate": 7.387213885189746e-06, "loss": 0.5517, "step": 110 }, { "epoch": 3.9781132075471697, "grad_norm": 0.4551447033882141, "learning_rate": 7.330217598512696e-06, "loss": 0.5426, "step": 111 }, { "epoch": 4.0, "grad_norm": 0.5489457845687866, "learning_rate": 7.2728318385925035e-06, "loss": 0.5209, "step": 112 }, { "epoch": 4.036226415094339, "grad_norm": 0.5183960795402527, "learning_rate": 7.215066196901676e-06, "loss": 0.4952, "step": 113 }, { "epoch": 4.072452830188679, "grad_norm": 0.4089732766151428, "learning_rate": 7.156930328406268e-06, "loss": 0.4918, "step": 114 }, { "epoch": 4.108679245283019, "grad_norm": 0.47553887963294983, "learning_rate": 7.098433949952146e-06, "loss": 0.4745, "step": 115 }, { "epoch": 4.144905660377359, "grad_norm": 0.43845227360725403, "learning_rate": 7.039586838640918e-06, "loss": 0.463, "step": 116 }, { "epoch": 4.181132075471698, "grad_norm": 0.4647291898727417, "learning_rate": 6.980398830195785e-06, "loss": 0.5002, "step": 117 }, { "epoch": 4.2173584905660375, "grad_norm": 0.42274776101112366, "learning_rate": 6.920879817317588e-06, "loss": 0.4634, "step": 118 }, { "epoch": 4.253584905660377, "grad_norm": 0.46993109583854675, "learning_rate": 6.861039748031351e-06, "loss": 0.4527, "step": 119 }, { "epoch": 4.289811320754717, "grad_norm": 0.49515578150749207, "learning_rate": 6.800888624023552e-06, "loss": 0.473, "step": 120 }, { "epoch": 4.326037735849057, "grad_norm": 0.4713442027568817, "learning_rate": 6.740436498970453e-06, "loss": 0.4611, "step": 121 }, { "epoch": 4.362264150943396, "grad_norm": 0.4236859977245331, "learning_rate": 6.679693476857712e-06, "loss": 0.4632, "step": 122 }, { "epoch": 4.398490566037736, "grad_norm": 0.4841013252735138, "learning_rate": 6.618669710291607e-06, "loss": 0.4825, "step": 123 }, { "epoch": 4.434716981132075, "grad_norm": 0.5489901304244995, "learning_rate": 6.557375398802124e-06, "loss": 0.4679, "step": 124 }, { "epoch": 4.470943396226415, "grad_norm": 0.49812740087509155, "learning_rate": 6.495820787138209e-06, "loss": 0.4615, "step": 125 }, { "epoch": 4.507169811320755, "grad_norm": 0.3921999931335449, "learning_rate": 6.434016163555452e-06, "loss": 0.4664, "step": 126 }, { "epoch": 4.543396226415094, "grad_norm": 0.478518545627594, "learning_rate": 6.371971858096509e-06, "loss": 0.4754, "step": 127 }, { "epoch": 4.579622641509434, "grad_norm": 0.5080537796020508, "learning_rate": 6.30969824086453e-06, "loss": 0.442, "step": 128 }, { "epoch": 4.615849056603773, "grad_norm": 0.44495469331741333, "learning_rate": 6.247205720289907e-06, "loss": 0.4527, "step": 129 }, { "epoch": 4.652075471698113, "grad_norm": 0.4623711407184601, "learning_rate": 6.184504741390596e-06, "loss": 0.434, "step": 130 }, { "epoch": 4.688301886792453, "grad_norm": 0.38758838176727295, "learning_rate": 6.121605784026339e-06, "loss": 0.45, "step": 131 }, { "epoch": 4.724528301886792, "grad_norm": 0.4601069688796997, "learning_rate": 6.058519361147055e-06, "loss": 0.4655, "step": 132 }, { "epoch": 4.760754716981132, "grad_norm": 0.46330124139785767, "learning_rate": 5.995256017035703e-06, "loss": 0.4531, "step": 133 }, { "epoch": 4.796981132075472, "grad_norm": 0.5022570490837097, "learning_rate": 5.931826325545912e-06, "loss": 0.465, "step": 134 }, { "epoch": 4.8332075471698115, "grad_norm": 0.4626692533493042, "learning_rate": 5.8682408883346535e-06, "loss": 0.4511, "step": 135 }, { "epoch": 4.869433962264151, "grad_norm": 0.3865698575973511, "learning_rate": 5.804510333090287e-06, "loss": 0.46, "step": 136 }, { "epoch": 4.90566037735849, "grad_norm": 0.4084339737892151, "learning_rate": 5.740645311756246e-06, "loss": 0.4587, "step": 137 }, { "epoch": 4.94188679245283, "grad_norm": 0.42995211482048035, "learning_rate": 5.6766564987506564e-06, "loss": 0.4516, "step": 138 }, { "epoch": 4.97811320754717, "grad_norm": 0.41004374623298645, "learning_rate": 5.612554589182228e-06, "loss": 0.4644, "step": 139 }, { "epoch": 5.0, "grad_norm": 0.5633000135421753, "learning_rate": 5.548350297062659e-06, "loss": 0.4656, "step": 140 }, { "epoch": 5.036226415094339, "grad_norm": 0.4353588819503784, "learning_rate": 5.484054353515896e-06, "loss": 0.3881, "step": 141 }, { "epoch": 5.072452830188679, "grad_norm": 0.4253406822681427, "learning_rate": 5.419677504984534e-06, "loss": 0.3969, "step": 142 }, { "epoch": 5.108679245283019, "grad_norm": 0.46061569452285767, "learning_rate": 5.3552305114336515e-06, "loss": 0.3986, "step": 143 }, { "epoch": 5.144905660377359, "grad_norm": 0.4197911024093628, "learning_rate": 5.290724144552379e-06, "loss": 0.4135, "step": 144 }, { "epoch": 5.181132075471698, "grad_norm": 0.40193256735801697, "learning_rate": 5.2261691859535325e-06, "loss": 0.3908, "step": 145 }, { "epoch": 5.2173584905660375, "grad_norm": 0.4688466787338257, "learning_rate": 5.161576425371554e-06, "loss": 0.3923, "step": 146 }, { "epoch": 5.253584905660377, "grad_norm": 0.5070242881774902, "learning_rate": 5.096956658859122e-06, "loss": 0.3837, "step": 147 }, { "epoch": 5.289811320754717, "grad_norm": 0.46500205993652344, "learning_rate": 5.032320686982697e-06, "loss": 0.3959, "step": 148 }, { "epoch": 5.326037735849057, "grad_norm": 1.2324038743972778, "learning_rate": 4.967679313017304e-06, "loss": 0.3785, "step": 149 }, { "epoch": 5.362264150943396, "grad_norm": 0.4382549524307251, "learning_rate": 4.903043341140879e-06, "loss": 0.3595, "step": 150 }, { "epoch": 5.398490566037736, "grad_norm": 0.4779147803783417, "learning_rate": 4.838423574628447e-06, "loss": 0.369, "step": 151 }, { "epoch": 5.434716981132075, "grad_norm": 0.506566047668457, "learning_rate": 4.773830814046469e-06, "loss": 0.3752, "step": 152 }, { "epoch": 5.470943396226415, "grad_norm": 0.4850460886955261, "learning_rate": 4.7092758554476215e-06, "loss": 0.3805, "step": 153 }, { "epoch": 5.507169811320755, "grad_norm": 0.4675082266330719, "learning_rate": 4.644769488566351e-06, "loss": 0.3696, "step": 154 }, { "epoch": 5.543396226415094, "grad_norm": 0.4534352123737335, "learning_rate": 4.580322495015466e-06, "loss": 0.3937, "step": 155 }, { "epoch": 5.579622641509434, "grad_norm": 0.4178565442562103, "learning_rate": 4.515945646484105e-06, "loss": 0.3673, "step": 156 }, { "epoch": 5.615849056603773, "grad_norm": 0.43837058544158936, "learning_rate": 4.451649702937343e-06, "loss": 0.3932, "step": 157 }, { "epoch": 5.652075471698113, "grad_norm": 0.5009051561355591, "learning_rate": 4.387445410817774e-06, "loss": 0.3767, "step": 158 }, { "epoch": 5.688301886792453, "grad_norm": 0.4401375651359558, "learning_rate": 4.323343501249346e-06, "loss": 0.3836, "step": 159 }, { "epoch": 5.724528301886792, "grad_norm": 0.49288874864578247, "learning_rate": 4.259354688243758e-06, "loss": 0.3735, "step": 160 }, { "epoch": 5.760754716981132, "grad_norm": 0.41812247037887573, "learning_rate": 4.195489666909714e-06, "loss": 0.3632, "step": 161 }, { "epoch": 5.796981132075472, "grad_norm": 0.4192414879798889, "learning_rate": 4.131759111665349e-06, "loss": 0.3742, "step": 162 }, { "epoch": 5.8332075471698115, "grad_norm": 0.46856996417045593, "learning_rate": 4.06817367445409e-06, "loss": 0.3842, "step": 163 }, { "epoch": 5.869433962264151, "grad_norm": 0.4367072582244873, "learning_rate": 4.004743982964298e-06, "loss": 0.3778, "step": 164 }, { "epoch": 5.90566037735849, "grad_norm": 0.4130428433418274, "learning_rate": 3.941480638852948e-06, "loss": 0.392, "step": 165 }, { "epoch": 5.94188679245283, "grad_norm": 0.44017598032951355, "learning_rate": 3.878394215973663e-06, "loss": 0.3997, "step": 166 }, { "epoch": 5.97811320754717, "grad_norm": 0.44622254371643066, "learning_rate": 3.815495258609404e-06, "loss": 0.3827, "step": 167 }, { "epoch": 6.0, "grad_norm": 0.44622254371643066, "learning_rate": 3.752794279710094e-06, "loss": 0.4119, "step": 168 }, { "epoch": 6.036226415094339, "grad_norm": 0.6544888615608215, "learning_rate": 3.690301759135471e-06, "loss": 0.3342, "step": 169 }, { "epoch": 6.072452830188679, "grad_norm": 0.48705703020095825, "learning_rate": 3.6280281419034934e-06, "loss": 0.3163, "step": 170 }, { "epoch": 6.108679245283019, "grad_norm": 0.42752087116241455, "learning_rate": 3.5659838364445505e-06, "loss": 0.3442, "step": 171 }, { "epoch": 6.144905660377359, "grad_norm": 0.3857191205024719, "learning_rate": 3.504179212861793e-06, "loss": 0.3319, "step": 172 }, { "epoch": 6.181132075471698, "grad_norm": 0.46637335419654846, "learning_rate": 3.442624601197877e-06, "loss": 0.3167, "step": 173 }, { "epoch": 6.2173584905660375, "grad_norm": 0.44155874848365784, "learning_rate": 3.3813302897083955e-06, "loss": 0.3032, "step": 174 }, { "epoch": 6.253584905660377, "grad_norm": 0.4230786859989166, "learning_rate": 3.3203065231422904e-06, "loss": 0.3082, "step": 175 }, { "epoch": 6.289811320754717, "grad_norm": 0.45516237616539, "learning_rate": 3.259563501029548e-06, "loss": 0.3323, "step": 176 }, { "epoch": 6.326037735849057, "grad_norm": 0.4669758081436157, "learning_rate": 3.1991113759764493e-06, "loss": 0.3143, "step": 177 }, { "epoch": 6.362264150943396, "grad_norm": 0.5168375968933105, "learning_rate": 3.1389602519686515e-06, "loss": 0.3154, "step": 178 }, { "epoch": 6.398490566037736, "grad_norm": 0.4606865644454956, "learning_rate": 3.0791201826824117e-06, "loss": 0.3067, "step": 179 }, { "epoch": 6.434716981132075, "grad_norm": 0.4242306053638458, "learning_rate": 3.019601169804216e-06, "loss": 0.3326, "step": 180 }, { "epoch": 6.470943396226415, "grad_norm": 0.4492356777191162, "learning_rate": 2.9604131613590825e-06, "loss": 0.3422, "step": 181 }, { "epoch": 6.507169811320755, "grad_norm": 0.4301685690879822, "learning_rate": 2.901566050047855e-06, "loss": 0.3071, "step": 182 }, { "epoch": 6.543396226415094, "grad_norm": 0.46162980794906616, "learning_rate": 2.843069671593734e-06, "loss": 0.3084, "step": 183 }, { "epoch": 6.579622641509434, "grad_norm": 0.43854185938835144, "learning_rate": 2.784933803098326e-06, "loss": 0.3256, "step": 184 }, { "epoch": 6.615849056603773, "grad_norm": 0.46818047761917114, "learning_rate": 2.7271681614074973e-06, "loss": 0.3109, "step": 185 }, { "epoch": 6.652075471698113, "grad_norm": 0.3902846872806549, "learning_rate": 2.6697824014873076e-06, "loss": 0.303, "step": 186 }, { "epoch": 6.688301886792453, "grad_norm": 0.4288512170314789, "learning_rate": 2.6127861148102552e-06, "loss": 0.2988, "step": 187 }, { "epoch": 6.724528301886792, "grad_norm": 0.3952915668487549, "learning_rate": 2.5561888277521797e-06, "loss": 0.3003, "step": 188 }, { "epoch": 6.760754716981132, "grad_norm": 0.4569963812828064, "learning_rate": 2.5000000000000015e-06, "loss": 0.3145, "step": 189 }, { "epoch": 6.796981132075472, "grad_norm": 0.448163241147995, "learning_rate": 2.4442290229706344e-06, "loss": 0.3302, "step": 190 }, { "epoch": 6.8332075471698115, "grad_norm": 0.44628778100013733, "learning_rate": 2.3888852182413087e-06, "loss": 0.3256, "step": 191 }, { "epoch": 6.869433962264151, "grad_norm": 0.4496423900127411, "learning_rate": 2.333977835991545e-06, "loss": 0.3399, "step": 192 }, { "epoch": 6.90566037735849, "grad_norm": 0.4480762481689453, "learning_rate": 2.2795160534570866e-06, "loss": 0.3202, "step": 193 }, { "epoch": 6.94188679245283, "grad_norm": 0.4092267155647278, "learning_rate": 2.2255089733960162e-06, "loss": 0.3302, "step": 194 }, { "epoch": 6.97811320754717, "grad_norm": 0.43138426542282104, "learning_rate": 2.171965622567308e-06, "loss": 0.3106, "step": 195 }, { "epoch": 7.0, "grad_norm": 0.5887420773506165, "learning_rate": 2.1188949502220987e-06, "loss": 0.3219, "step": 196 }, { "epoch": 7.036226415094339, "grad_norm": 0.556282103061676, "learning_rate": 2.066305826607911e-06, "loss": 0.3141, "step": 197 }, { "epoch": 7.072452830188679, "grad_norm": 0.4548152983188629, "learning_rate": 2.0142070414860704e-06, "loss": 0.2697, "step": 198 }, { "epoch": 7.108679245283019, "grad_norm": 0.45477020740509033, "learning_rate": 1.962607302662582e-06, "loss": 0.27, "step": 199 }, { "epoch": 7.144905660377359, "grad_norm": 0.3835766911506653, "learning_rate": 1.9115152345327154e-06, "loss": 0.2718, "step": 200 }, { "epoch": 7.181132075471698, "grad_norm": 0.4106207489967346, "learning_rate": 1.8609393766395083e-06, "loss": 0.2902, "step": 201 }, { "epoch": 7.2173584905660375, "grad_norm": 0.40980860590934753, "learning_rate": 1.8108881822464697e-06, "loss": 0.294, "step": 202 }, { "epoch": 7.253584905660377, "grad_norm": 0.4507136940956116, "learning_rate": 1.7613700169247055e-06, "loss": 0.2941, "step": 203 }, { "epoch": 7.289811320754717, "grad_norm": 0.3944286108016968, "learning_rate": 1.7123931571546826e-06, "loss": 0.2596, "step": 204 }, { "epoch": 7.326037735849057, "grad_norm": 0.42168113589286804, "learning_rate": 1.6639657889429017e-06, "loss": 0.2757, "step": 205 }, { "epoch": 7.362264150943396, "grad_norm": 0.45149892568588257, "learning_rate": 1.6160960064536907e-06, "loss": 0.2618, "step": 206 }, { "epoch": 7.398490566037736, "grad_norm": 0.4606323540210724, "learning_rate": 1.5687918106563326e-06, "loss": 0.2756, "step": 207 }, { "epoch": 7.434716981132075, "grad_norm": 0.5498378872871399, "learning_rate": 1.52206110798779e-06, "loss": 0.2818, "step": 208 }, { "epoch": 7.470943396226415, "grad_norm": 0.4124191403388977, "learning_rate": 1.4759117090312197e-06, "loss": 0.2731, "step": 209 }, { "epoch": 7.507169811320755, "grad_norm": 0.44834864139556885, "learning_rate": 1.4303513272105057e-06, "loss": 0.2687, "step": 210 }, { "epoch": 7.543396226415094, "grad_norm": 0.39862528443336487, "learning_rate": 1.3853875775010355e-06, "loss": 0.2688, "step": 211 }, { "epoch": 7.579622641509434, "grad_norm": 0.48807084560394287, "learning_rate": 1.3410279751569399e-06, "loss": 0.3119, "step": 212 }, { "epoch": 7.615849056603773, "grad_norm": 0.44604766368865967, "learning_rate": 1.297279934454978e-06, "loss": 0.2621, "step": 213 }, { "epoch": 7.652075471698113, "grad_norm": 0.3903985619544983, "learning_rate": 1.25415076745532e-06, "loss": 0.2745, "step": 214 }, { "epoch": 7.688301886792453, "grad_norm": 0.4241688549518585, "learning_rate": 1.2116476827794104e-06, "loss": 0.264, "step": 215 }, { "epoch": 7.724528301886792, "grad_norm": 0.4157409369945526, "learning_rate": 1.1697777844051105e-06, "loss": 0.2695, "step": 216 }, { "epoch": 7.760754716981132, "grad_norm": 0.42209798097610474, "learning_rate": 1.1285480704793378e-06, "loss": 0.293, "step": 217 }, { "epoch": 7.796981132075472, "grad_norm": 0.39040008187294006, "learning_rate": 1.0879654321484012e-06, "loss": 0.2443, "step": 218 }, { "epoch": 7.8332075471698115, "grad_norm": 0.4144212603569031, "learning_rate": 1.0480366524062041e-06, "loss": 0.2747, "step": 219 }, { "epoch": 7.869433962264151, "grad_norm": 0.38053098320961, "learning_rate": 1.008768404960535e-06, "loss": 0.2634, "step": 220 }, { "epoch": 7.90566037735849, "grad_norm": 0.4866553246974945, "learning_rate": 9.701672531176287e-07, "loss": 0.2883, "step": 221 }, { "epoch": 7.94188679245283, "grad_norm": 0.401796817779541, "learning_rate": 9.322396486851626e-07, "loss": 0.2885, "step": 222 }, { "epoch": 7.97811320754717, "grad_norm": 0.4356318414211273, "learning_rate": 8.949919308939081e-07, "loss": 0.2985, "step": 223 }, { "epoch": 8.0, "grad_norm": 0.4356318414211273, "learning_rate": 8.584303253381848e-07, "loss": 0.2454, "step": 224 }, { "epoch": 8.03622641509434, "grad_norm": 0.5875634551048279, "learning_rate": 8.225609429353187e-07, "loss": 0.2752, "step": 225 }, { "epoch": 8.072452830188679, "grad_norm": 0.4065380394458771, "learning_rate": 7.873897789042523e-07, "loss": 0.2725, "step": 226 }, { "epoch": 8.10867924528302, "grad_norm": 0.47121724486351013, "learning_rate": 7.529227117635135e-07, "loss": 0.2802, "step": 227 }, { "epoch": 8.144905660377358, "grad_norm": 0.41164955496788025, "learning_rate": 7.191655023486682e-07, "loss": 0.2475, "step": 228 }, { "epoch": 8.181132075471698, "grad_norm": 0.40498465299606323, "learning_rate": 6.86123792849458e-07, "loss": 0.2585, "step": 229 }, { "epoch": 8.217358490566038, "grad_norm": 0.39466381072998047, "learning_rate": 6.53803105866761e-07, "loss": 0.2552, "step": 230 }, { "epoch": 8.253584905660377, "grad_norm": 0.3694005310535431, "learning_rate": 6.222088434895462e-07, "loss": 0.266, "step": 231 }, { "epoch": 8.289811320754717, "grad_norm": 0.3654123544692993, "learning_rate": 5.9134628639196e-07, "loss": 0.261, "step": 232 }, { "epoch": 8.326037735849056, "grad_norm": 0.3835010230541229, "learning_rate": 5.612205929507209e-07, "loss": 0.262, "step": 233 }, { "epoch": 8.362264150943396, "grad_norm": 0.4232535660266876, "learning_rate": 5.318367983829393e-07, "loss": 0.2443, "step": 234 }, { "epoch": 8.398490566037736, "grad_norm": 0.3865967392921448, "learning_rate": 5.031998139045352e-07, "loss": 0.2479, "step": 235 }, { "epoch": 8.434716981132075, "grad_norm": 0.3988674581050873, "learning_rate": 4.753144259093734e-07, "loss": 0.2569, "step": 236 }, { "epoch": 8.470943396226415, "grad_norm": 0.40713047981262207, "learning_rate": 4.481852951692672e-07, "loss": 0.2446, "step": 237 }, { "epoch": 8.507169811320754, "grad_norm": 0.3784768283367157, "learning_rate": 4.2181695605497066e-07, "loss": 0.2561, "step": 238 }, { "epoch": 8.543396226415094, "grad_norm": 0.39897289872169495, "learning_rate": 3.9621381577830855e-07, "loss": 0.2504, "step": 239 }, { "epoch": 8.579622641509435, "grad_norm": 0.40435102581977844, "learning_rate": 3.7138015365554834e-07, "loss": 0.2572, "step": 240 }, { "epoch": 8.615849056603773, "grad_norm": 0.3976576328277588, "learning_rate": 3.473201203921578e-07, "loss": 0.2622, "step": 241 }, { "epoch": 8.652075471698113, "grad_norm": 0.40374553203582764, "learning_rate": 3.2403773738905185e-07, "loss": 0.2302, "step": 242 }, { "epoch": 8.688301886792452, "grad_norm": 0.39839479327201843, "learning_rate": 3.015368960704584e-07, "loss": 0.2658, "step": 243 }, { "epoch": 8.724528301886792, "grad_norm": 0.3632482588291168, "learning_rate": 2.798213572335001e-07, "loss": 0.2435, "step": 244 }, { "epoch": 8.760754716981133, "grad_norm": 0.37311646342277527, "learning_rate": 2.5889475041961767e-07, "loss": 0.2427, "step": 245 }, { "epoch": 8.796981132075471, "grad_norm": 0.43890783190727234, "learning_rate": 2.3876057330792344e-07, "loss": 0.2576, "step": 246 }, { "epoch": 8.833207547169811, "grad_norm": 0.3749382197856903, "learning_rate": 2.1942219113060215e-07, "loss": 0.2588, "step": 247 }, { "epoch": 8.86943396226415, "grad_norm": 0.41522374749183655, "learning_rate": 2.0088283611044034e-07, "loss": 0.2526, "step": 248 }, { "epoch": 8.90566037735849, "grad_norm": 0.3923836648464203, "learning_rate": 1.8314560692059836e-07, "loss": 0.2504, "step": 249 }, { "epoch": 8.94188679245283, "grad_norm": 0.40502244234085083, "learning_rate": 1.6621346816668993e-07, "loss": 0.2409, "step": 250 }, { "epoch": 8.97811320754717, "grad_norm": 0.391658753156662, "learning_rate": 1.500892498912826e-07, "loss": 0.2607, "step": 251 }, { "epoch": 9.0, "grad_norm": 0.5146093964576721, "learning_rate": 1.3477564710088097e-07, "loss": 0.2573, "step": 252 }, { "epoch": 9.03622641509434, "grad_norm": 0.5422486662864685, "learning_rate": 1.2027521931548214e-07, "loss": 0.2379, "step": 253 }, { "epoch": 9.072452830188679, "grad_norm": 0.40020889043807983, "learning_rate": 1.0659039014077943e-07, "loss": 0.2412, "step": 254 }, { "epoch": 9.10867924528302, "grad_norm": 0.3687814772129059, "learning_rate": 9.372344686307655e-08, "loss": 0.2409, "step": 255 }, { "epoch": 9.144905660377358, "grad_norm": 0.380043089389801, "learning_rate": 8.167654006699444e-08, "loss": 0.2699, "step": 256 }, { "epoch": 9.181132075471698, "grad_norm": 0.39074811339378357, "learning_rate": 7.04516832760177e-08, "loss": 0.2489, "step": 257 }, { "epoch": 9.217358490566038, "grad_norm": 0.38397496938705444, "learning_rate": 6.005075261595495e-08, "loss": 0.2585, "step": 258 }, { "epoch": 9.253584905660377, "grad_norm": 0.4563198685646057, "learning_rate": 5.047548650136513e-08, "loss": 0.2351, "step": 259 }, { "epoch": 9.289811320754717, "grad_norm": 0.3548761308193207, "learning_rate": 4.172748534499449e-08, "loss": 0.2508, "step": 260 }, { "epoch": 9.326037735849056, "grad_norm": 0.41531455516815186, "learning_rate": 3.3808211290284886e-08, "loss": 0.2534, "step": 261 }, { "epoch": 9.362264150943396, "grad_norm": 0.34948405623435974, "learning_rate": 2.6718987966992683e-08, "loss": 0.2497, "step": 262 }, { "epoch": 9.398490566037736, "grad_norm": 0.3671889305114746, "learning_rate": 2.0461000269953457e-08, "loss": 0.2317, "step": 263 }, { "epoch": 9.434716981132075, "grad_norm": 0.43335914611816406, "learning_rate": 1.5035294161039882e-08, "loss": 0.2353, "step": 264 }, { "epoch": 9.470943396226415, "grad_norm": 0.41739046573638916, "learning_rate": 1.044277649433989e-08, "loss": 0.2589, "step": 265 }, { "epoch": 9.507169811320754, "grad_norm": 0.36336734890937805, "learning_rate": 6.6842148645840374e-09, "loss": 0.2228, "step": 266 }, { "epoch": 9.543396226415094, "grad_norm": 0.3958076238632202, "learning_rate": 3.760237478849793e-09, "loss": 0.2405, "step": 267 }, { "epoch": 9.579622641509435, "grad_norm": 0.40326353907585144, "learning_rate": 1.6713330515627512e-09, "loss": 0.2645, "step": 268 }, { "epoch": 9.615849056603773, "grad_norm": 0.38379284739494324, "learning_rate": 4.178507228136397e-10, "loss": 0.2699, "step": 269 }, { "epoch": 9.652075471698113, "grad_norm": 0.4049612283706665, "learning_rate": 0.0, "loss": 0.2758, "step": 270 }, { "epoch": 9.652075471698113, "step": 270, "total_flos": 99687939342336.0, "train_loss": 0.48553379895510496, "train_runtime": 12266.0559, "train_samples_per_second": 1.08, "train_steps_per_second": 0.022 } ], "logging_steps": 1, "max_steps": 270, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 99687939342336.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }