| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.652075471698113, | |
| "eval_steps": 500, | |
| "global_step": 270, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.036226415094339624, | |
| "grad_norm": 4.074844837188721, | |
| "learning_rate": 3.7037037037037036e-07, | |
| "loss": 0.9495, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.07245283018867925, | |
| "grad_norm": 4.034702301025391, | |
| "learning_rate": 7.407407407407407e-07, | |
| "loss": 0.9536, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.10867924528301887, | |
| "grad_norm": 4.080567359924316, | |
| "learning_rate": 1.111111111111111e-06, | |
| "loss": 0.9267, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.1449056603773585, | |
| "grad_norm": 3.9866671562194824, | |
| "learning_rate": 1.4814814814814815e-06, | |
| "loss": 0.95, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.1811320754716981, | |
| "grad_norm": 4.042410373687744, | |
| "learning_rate": 1.8518518518518519e-06, | |
| "loss": 0.995, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.21735849056603773, | |
| "grad_norm": 4.1409783363342285, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 0.9579, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.25358490566037734, | |
| "grad_norm": 3.998582601547241, | |
| "learning_rate": 2.5925925925925925e-06, | |
| "loss": 0.9625, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.289811320754717, | |
| "grad_norm": 4.141905307769775, | |
| "learning_rate": 2.962962962962963e-06, | |
| "loss": 0.9506, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.3260377358490566, | |
| "grad_norm": 4.01569128036499, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.9591, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.3622641509433962, | |
| "grad_norm": 3.662905216217041, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 0.9519, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.39849056603773586, | |
| "grad_norm": 2.892972707748413, | |
| "learning_rate": 4.074074074074074e-06, | |
| "loss": 0.9484, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.43471698113207546, | |
| "grad_norm": 2.5980327129364014, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.9322, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.47094339622641507, | |
| "grad_norm": 2.2705729007720947, | |
| "learning_rate": 4.814814814814815e-06, | |
| "loss": 0.9016, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.5071698113207547, | |
| "grad_norm": 2.027101755142212, | |
| "learning_rate": 5.185185185185185e-06, | |
| "loss": 0.9084, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.5433962264150943, | |
| "grad_norm": 1.2326773405075073, | |
| "learning_rate": 5.555555555555557e-06, | |
| "loss": 0.9035, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.579622641509434, | |
| "grad_norm": 1.5498149394989014, | |
| "learning_rate": 5.925925925925926e-06, | |
| "loss": 0.8692, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.6158490566037735, | |
| "grad_norm": 1.6801735162734985, | |
| "learning_rate": 6.296296296296297e-06, | |
| "loss": 0.8736, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.6520754716981132, | |
| "grad_norm": 1.998213529586792, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.8728, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.6883018867924529, | |
| "grad_norm": 1.8951051235198975, | |
| "learning_rate": 7.0370370370370375e-06, | |
| "loss": 0.8715, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.7245283018867924, | |
| "grad_norm": 1.7730218172073364, | |
| "learning_rate": 7.4074074074074075e-06, | |
| "loss": 0.8358, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.7607547169811321, | |
| "grad_norm": 1.758504867553711, | |
| "learning_rate": 7.77777777777778e-06, | |
| "loss": 0.8289, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.7969811320754717, | |
| "grad_norm": 1.978657841682434, | |
| "learning_rate": 8.148148148148148e-06, | |
| "loss": 0.8432, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.8332075471698113, | |
| "grad_norm": 1.5869137048721313, | |
| "learning_rate": 8.518518518518519e-06, | |
| "loss": 0.8255, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.8694339622641509, | |
| "grad_norm": 1.3577728271484375, | |
| "learning_rate": 8.888888888888888e-06, | |
| "loss": 0.8272, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.9056603773584906, | |
| "grad_norm": 0.984247088432312, | |
| "learning_rate": 9.25925925925926e-06, | |
| "loss": 0.8071, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.9418867924528301, | |
| "grad_norm": 0.8904944062232971, | |
| "learning_rate": 9.62962962962963e-06, | |
| "loss": 0.8054, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.9781132075471698, | |
| "grad_norm": 0.8401544094085693, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8313, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.8401544094085693, | |
| "learning_rate": 9.999582149277188e-06, | |
| "loss": 0.8047, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.0362264150943397, | |
| "grad_norm": 1.6306538581848145, | |
| "learning_rate": 9.998328666948437e-06, | |
| "loss": 0.8103, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.0724528301886793, | |
| "grad_norm": 1.0208637714385986, | |
| "learning_rate": 9.996239762521152e-06, | |
| "loss": 0.7616, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.1086792452830188, | |
| "grad_norm": 1.045002818107605, | |
| "learning_rate": 9.993315785135417e-06, | |
| "loss": 0.7736, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.1449056603773584, | |
| "grad_norm": 0.8551422357559204, | |
| "learning_rate": 9.989557223505661e-06, | |
| "loss": 0.762, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.181132075471698, | |
| "grad_norm": 0.7436487078666687, | |
| "learning_rate": 9.98496470583896e-06, | |
| "loss": 0.7793, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.2173584905660377, | |
| "grad_norm": 0.6822688579559326, | |
| "learning_rate": 9.979538999730047e-06, | |
| "loss": 0.7728, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.2535849056603774, | |
| "grad_norm": 0.6098222732543945, | |
| "learning_rate": 9.973281012033009e-06, | |
| "loss": 0.7362, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.289811320754717, | |
| "grad_norm": 0.6669561266899109, | |
| "learning_rate": 9.966191788709716e-06, | |
| "loss": 0.7052, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.3260377358490567, | |
| "grad_norm": 0.6505219340324402, | |
| "learning_rate": 9.958272514655006e-06, | |
| "loss": 0.7609, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.3622641509433961, | |
| "grad_norm": 0.737601637840271, | |
| "learning_rate": 9.949524513498636e-06, | |
| "loss": 0.7477, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 1.3984905660377358, | |
| "grad_norm": 0.5156015753746033, | |
| "learning_rate": 9.939949247384046e-06, | |
| "loss": 0.7554, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 1.4347169811320755, | |
| "grad_norm": 0.5583709478378296, | |
| "learning_rate": 9.929548316723983e-06, | |
| "loss": 0.7526, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.4709433962264151, | |
| "grad_norm": 0.5988060832023621, | |
| "learning_rate": 9.918323459933006e-06, | |
| "loss": 0.7516, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 1.5071698113207548, | |
| "grad_norm": 0.5599033832550049, | |
| "learning_rate": 9.906276553136924e-06, | |
| "loss": 0.7294, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.5433962264150942, | |
| "grad_norm": 0.6027616858482361, | |
| "learning_rate": 9.893409609859221e-06, | |
| "loss": 0.72, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 1.579622641509434, | |
| "grad_norm": 0.6933006048202515, | |
| "learning_rate": 9.879724780684518e-06, | |
| "loss": 0.7176, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 1.6158490566037735, | |
| "grad_norm": 0.5711022615432739, | |
| "learning_rate": 9.86522435289912e-06, | |
| "loss": 0.716, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.6520754716981132, | |
| "grad_norm": 0.48980799317359924, | |
| "learning_rate": 9.849910750108718e-06, | |
| "loss": 0.7333, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 1.6883018867924529, | |
| "grad_norm": 0.4940769076347351, | |
| "learning_rate": 9.833786531833311e-06, | |
| "loss": 0.7368, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 1.7245283018867923, | |
| "grad_norm": 0.4914894998073578, | |
| "learning_rate": 9.816854393079402e-06, | |
| "loss": 0.7323, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.7607547169811322, | |
| "grad_norm": 0.5431811213493347, | |
| "learning_rate": 9.79911716388956e-06, | |
| "loss": 0.7226, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.7969811320754716, | |
| "grad_norm": 0.4794186055660248, | |
| "learning_rate": 9.7805778088694e-06, | |
| "loss": 0.7277, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.8332075471698113, | |
| "grad_norm": 0.5294991731643677, | |
| "learning_rate": 9.761239426692077e-06, | |
| "loss": 0.7193, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 1.869433962264151, | |
| "grad_norm": 0.48333099484443665, | |
| "learning_rate": 9.741105249580383e-06, | |
| "loss": 0.6936, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.9056603773584906, | |
| "grad_norm": 0.42380550503730774, | |
| "learning_rate": 9.7201786427665e-06, | |
| "loss": 0.7183, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.9418867924528302, | |
| "grad_norm": 0.49902644753456116, | |
| "learning_rate": 9.698463103929542e-06, | |
| "loss": 0.7288, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.9781132075471697, | |
| "grad_norm": 0.44469350576400757, | |
| "learning_rate": 9.67596226261095e-06, | |
| "loss": 0.7469, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.6417510509490967, | |
| "learning_rate": 9.652679879607843e-06, | |
| "loss": 0.6903, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 2.0362264150943394, | |
| "grad_norm": 0.5208938121795654, | |
| "learning_rate": 9.628619846344453e-06, | |
| "loss": 0.6569, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 2.0724528301886793, | |
| "grad_norm": 0.41403067111968994, | |
| "learning_rate": 9.603786184221693e-06, | |
| "loss": 0.6837, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 2.1086792452830188, | |
| "grad_norm": 0.4388677775859833, | |
| "learning_rate": 9.578183043945031e-06, | |
| "loss": 0.6577, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 2.1449056603773586, | |
| "grad_norm": 0.38680893182754517, | |
| "learning_rate": 9.551814704830734e-06, | |
| "loss": 0.6647, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.181132075471698, | |
| "grad_norm": 0.41955962777137756, | |
| "learning_rate": 9.524685574090627e-06, | |
| "loss": 0.6414, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 2.2173584905660375, | |
| "grad_norm": 0.5132808089256287, | |
| "learning_rate": 9.496800186095466e-06, | |
| "loss": 0.6397, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 2.2535849056603774, | |
| "grad_norm": 0.41187506914138794, | |
| "learning_rate": 9.468163201617063e-06, | |
| "loss": 0.6654, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 2.289811320754717, | |
| "grad_norm": 0.40094566345214844, | |
| "learning_rate": 9.438779407049282e-06, | |
| "loss": 0.6483, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 2.3260377358490567, | |
| "grad_norm": 0.42385968565940857, | |
| "learning_rate": 9.40865371360804e-06, | |
| "loss": 0.6382, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 2.362264150943396, | |
| "grad_norm": 0.42505592107772827, | |
| "learning_rate": 9.377791156510456e-06, | |
| "loss": 0.621, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 2.398490566037736, | |
| "grad_norm": 0.4004240036010742, | |
| "learning_rate": 9.346196894133239e-06, | |
| "loss": 0.6137, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 2.4347169811320755, | |
| "grad_norm": 0.44593772292137146, | |
| "learning_rate": 9.313876207150544e-06, | |
| "loss": 0.6299, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 2.470943396226415, | |
| "grad_norm": 0.40830758213996887, | |
| "learning_rate": 9.280834497651334e-06, | |
| "loss": 0.6504, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 2.507169811320755, | |
| "grad_norm": 0.4714842438697815, | |
| "learning_rate": 9.247077288236488e-06, | |
| "loss": 0.626, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.543396226415094, | |
| "grad_norm": 0.4523302912712097, | |
| "learning_rate": 9.212610221095748e-06, | |
| "loss": 0.6386, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 2.579622641509434, | |
| "grad_norm": 0.4274156093597412, | |
| "learning_rate": 9.177439057064684e-06, | |
| "loss": 0.6368, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 2.6158490566037735, | |
| "grad_norm": 0.3964768350124359, | |
| "learning_rate": 9.141569674661816e-06, | |
| "loss": 0.6161, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 2.6520754716981134, | |
| "grad_norm": 0.4652266502380371, | |
| "learning_rate": 9.105008069106093e-06, | |
| "loss": 0.6208, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 2.688301886792453, | |
| "grad_norm": 0.4457603394985199, | |
| "learning_rate": 9.067760351314838e-06, | |
| "loss": 0.6407, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 2.7245283018867923, | |
| "grad_norm": 0.47072410583496094, | |
| "learning_rate": 9.029832746882372e-06, | |
| "loss": 0.6345, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 2.760754716981132, | |
| "grad_norm": 0.41980573534965515, | |
| "learning_rate": 8.991231595039464e-06, | |
| "loss": 0.6242, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 2.7969811320754716, | |
| "grad_norm": 0.8126304745674133, | |
| "learning_rate": 8.951963347593797e-06, | |
| "loss": 0.6368, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 2.8332075471698115, | |
| "grad_norm": 0.4141775667667389, | |
| "learning_rate": 8.9120345678516e-06, | |
| "loss": 0.6299, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 2.869433962264151, | |
| "grad_norm": 0.4654167592525482, | |
| "learning_rate": 8.871451929520662e-06, | |
| "loss": 0.6064, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 2.9056603773584904, | |
| "grad_norm": 0.4674675464630127, | |
| "learning_rate": 8.83022221559489e-06, | |
| "loss": 0.6252, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 2.9418867924528302, | |
| "grad_norm": 0.46518832445144653, | |
| "learning_rate": 8.78835231722059e-06, | |
| "loss": 0.6304, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 2.9781132075471697, | |
| "grad_norm": 0.40669646859169006, | |
| "learning_rate": 8.74584923254468e-06, | |
| "loss": 0.6323, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.40669646859169006, | |
| "learning_rate": 8.702720065545024e-06, | |
| "loss": 0.6371, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 3.0362264150943394, | |
| "grad_norm": 0.5793619155883789, | |
| "learning_rate": 8.658972024843063e-06, | |
| "loss": 0.5837, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 3.0724528301886793, | |
| "grad_norm": 0.42956608533859253, | |
| "learning_rate": 8.614612422498965e-06, | |
| "loss": 0.5657, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 3.1086792452830188, | |
| "grad_norm": 0.3798193037509918, | |
| "learning_rate": 8.569648672789496e-06, | |
| "loss": 0.5788, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 3.1449056603773586, | |
| "grad_norm": 0.43145015835762024, | |
| "learning_rate": 8.524088290968781e-06, | |
| "loss": 0.57, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 3.181132075471698, | |
| "grad_norm": 0.40505528450012207, | |
| "learning_rate": 8.477938892012209e-06, | |
| "loss": 0.5556, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 3.2173584905660375, | |
| "grad_norm": 0.36875954270362854, | |
| "learning_rate": 8.43120818934367e-06, | |
| "loss": 0.5566, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 3.2535849056603774, | |
| "grad_norm": 0.45734313130378723, | |
| "learning_rate": 8.38390399354631e-06, | |
| "loss": 0.5388, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 3.289811320754717, | |
| "grad_norm": 0.40550005435943604, | |
| "learning_rate": 8.336034211057098e-06, | |
| "loss": 0.5569, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 3.3260377358490567, | |
| "grad_norm": 0.3886796832084656, | |
| "learning_rate": 8.28760684284532e-06, | |
| "loss": 0.546, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 3.362264150943396, | |
| "grad_norm": 0.39407268166542053, | |
| "learning_rate": 8.238629983075296e-06, | |
| "loss": 0.5513, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 3.398490566037736, | |
| "grad_norm": 0.44064226746559143, | |
| "learning_rate": 8.18911181775353e-06, | |
| "loss": 0.5524, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 3.4347169811320755, | |
| "grad_norm": 0.45655596256256104, | |
| "learning_rate": 8.139060623360494e-06, | |
| "loss": 0.5402, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 3.470943396226415, | |
| "grad_norm": 0.44399574398994446, | |
| "learning_rate": 8.088484765467286e-06, | |
| "loss": 0.5403, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 3.507169811320755, | |
| "grad_norm": 0.46749594807624817, | |
| "learning_rate": 8.037392697337418e-06, | |
| "loss": 0.5609, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 3.543396226415094, | |
| "grad_norm": 0.39268767833709717, | |
| "learning_rate": 7.985792958513932e-06, | |
| "loss": 0.5616, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 3.579622641509434, | |
| "grad_norm": 0.4343853294849396, | |
| "learning_rate": 7.93369417339209e-06, | |
| "loss": 0.5493, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 3.6158490566037735, | |
| "grad_norm": 0.4399295151233673, | |
| "learning_rate": 7.881105049777902e-06, | |
| "loss": 0.5118, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 3.6520754716981134, | |
| "grad_norm": 0.5278156995773315, | |
| "learning_rate": 7.828034377432694e-06, | |
| "loss": 0.5592, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 3.688301886792453, | |
| "grad_norm": 0.46402445435523987, | |
| "learning_rate": 7.774491026603985e-06, | |
| "loss": 0.5389, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 3.7245283018867923, | |
| "grad_norm": 0.4599299430847168, | |
| "learning_rate": 7.720483946542913e-06, | |
| "loss": 0.5429, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 3.760754716981132, | |
| "grad_norm": 0.4408857822418213, | |
| "learning_rate": 7.666022164008458e-06, | |
| "loss": 0.557, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 3.7969811320754716, | |
| "grad_norm": 0.41980406641960144, | |
| "learning_rate": 7.6111147817586925e-06, | |
| "loss": 0.5434, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 3.8332075471698115, | |
| "grad_norm": 0.5317021012306213, | |
| "learning_rate": 7.5557709770293664e-06, | |
| "loss": 0.552, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 3.869433962264151, | |
| "grad_norm": 0.5827487111091614, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.5203, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 3.9056603773584904, | |
| "grad_norm": 0.4034237265586853, | |
| "learning_rate": 7.443811172247822e-06, | |
| "loss": 0.5418, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 3.9418867924528302, | |
| "grad_norm": 0.4142571985721588, | |
| "learning_rate": 7.387213885189746e-06, | |
| "loss": 0.5517, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 3.9781132075471697, | |
| "grad_norm": 0.4551447033882141, | |
| "learning_rate": 7.330217598512696e-06, | |
| "loss": 0.5426, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.5489457845687866, | |
| "learning_rate": 7.2728318385925035e-06, | |
| "loss": 0.5209, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 4.036226415094339, | |
| "grad_norm": 0.5183960795402527, | |
| "learning_rate": 7.215066196901676e-06, | |
| "loss": 0.4952, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 4.072452830188679, | |
| "grad_norm": 0.4089732766151428, | |
| "learning_rate": 7.156930328406268e-06, | |
| "loss": 0.4918, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 4.108679245283019, | |
| "grad_norm": 0.47553887963294983, | |
| "learning_rate": 7.098433949952146e-06, | |
| "loss": 0.4745, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 4.144905660377359, | |
| "grad_norm": 0.43845227360725403, | |
| "learning_rate": 7.039586838640918e-06, | |
| "loss": 0.463, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 4.181132075471698, | |
| "grad_norm": 0.4647291898727417, | |
| "learning_rate": 6.980398830195785e-06, | |
| "loss": 0.5002, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 4.2173584905660375, | |
| "grad_norm": 0.42274776101112366, | |
| "learning_rate": 6.920879817317588e-06, | |
| "loss": 0.4634, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 4.253584905660377, | |
| "grad_norm": 0.46993109583854675, | |
| "learning_rate": 6.861039748031351e-06, | |
| "loss": 0.4527, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 4.289811320754717, | |
| "grad_norm": 0.49515578150749207, | |
| "learning_rate": 6.800888624023552e-06, | |
| "loss": 0.473, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 4.326037735849057, | |
| "grad_norm": 0.4713442027568817, | |
| "learning_rate": 6.740436498970453e-06, | |
| "loss": 0.4611, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 4.362264150943396, | |
| "grad_norm": 0.4236859977245331, | |
| "learning_rate": 6.679693476857712e-06, | |
| "loss": 0.4632, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 4.398490566037736, | |
| "grad_norm": 0.4841013252735138, | |
| "learning_rate": 6.618669710291607e-06, | |
| "loss": 0.4825, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 4.434716981132075, | |
| "grad_norm": 0.5489901304244995, | |
| "learning_rate": 6.557375398802124e-06, | |
| "loss": 0.4679, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 4.470943396226415, | |
| "grad_norm": 0.49812740087509155, | |
| "learning_rate": 6.495820787138209e-06, | |
| "loss": 0.4615, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 4.507169811320755, | |
| "grad_norm": 0.3921999931335449, | |
| "learning_rate": 6.434016163555452e-06, | |
| "loss": 0.4664, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 4.543396226415094, | |
| "grad_norm": 0.478518545627594, | |
| "learning_rate": 6.371971858096509e-06, | |
| "loss": 0.4754, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 4.579622641509434, | |
| "grad_norm": 0.5080537796020508, | |
| "learning_rate": 6.30969824086453e-06, | |
| "loss": 0.442, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 4.615849056603773, | |
| "grad_norm": 0.44495469331741333, | |
| "learning_rate": 6.247205720289907e-06, | |
| "loss": 0.4527, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 4.652075471698113, | |
| "grad_norm": 0.4623711407184601, | |
| "learning_rate": 6.184504741390596e-06, | |
| "loss": 0.434, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 4.688301886792453, | |
| "grad_norm": 0.38758838176727295, | |
| "learning_rate": 6.121605784026339e-06, | |
| "loss": 0.45, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 4.724528301886792, | |
| "grad_norm": 0.4601069688796997, | |
| "learning_rate": 6.058519361147055e-06, | |
| "loss": 0.4655, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 4.760754716981132, | |
| "grad_norm": 0.46330124139785767, | |
| "learning_rate": 5.995256017035703e-06, | |
| "loss": 0.4531, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 4.796981132075472, | |
| "grad_norm": 0.5022570490837097, | |
| "learning_rate": 5.931826325545912e-06, | |
| "loss": 0.465, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 4.8332075471698115, | |
| "grad_norm": 0.4626692533493042, | |
| "learning_rate": 5.8682408883346535e-06, | |
| "loss": 0.4511, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 4.869433962264151, | |
| "grad_norm": 0.3865698575973511, | |
| "learning_rate": 5.804510333090287e-06, | |
| "loss": 0.46, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 4.90566037735849, | |
| "grad_norm": 0.4084339737892151, | |
| "learning_rate": 5.740645311756246e-06, | |
| "loss": 0.4587, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 4.94188679245283, | |
| "grad_norm": 0.42995211482048035, | |
| "learning_rate": 5.6766564987506564e-06, | |
| "loss": 0.4516, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 4.97811320754717, | |
| "grad_norm": 0.41004374623298645, | |
| "learning_rate": 5.612554589182228e-06, | |
| "loss": 0.4644, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.5633000135421753, | |
| "learning_rate": 5.548350297062659e-06, | |
| "loss": 0.4656, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 5.036226415094339, | |
| "grad_norm": 0.4353588819503784, | |
| "learning_rate": 5.484054353515896e-06, | |
| "loss": 0.3881, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 5.072452830188679, | |
| "grad_norm": 0.4253406822681427, | |
| "learning_rate": 5.419677504984534e-06, | |
| "loss": 0.3969, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 5.108679245283019, | |
| "grad_norm": 0.46061569452285767, | |
| "learning_rate": 5.3552305114336515e-06, | |
| "loss": 0.3986, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 5.144905660377359, | |
| "grad_norm": 0.4197911024093628, | |
| "learning_rate": 5.290724144552379e-06, | |
| "loss": 0.4135, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 5.181132075471698, | |
| "grad_norm": 0.40193256735801697, | |
| "learning_rate": 5.2261691859535325e-06, | |
| "loss": 0.3908, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 5.2173584905660375, | |
| "grad_norm": 0.4688466787338257, | |
| "learning_rate": 5.161576425371554e-06, | |
| "loss": 0.3923, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 5.253584905660377, | |
| "grad_norm": 0.5070242881774902, | |
| "learning_rate": 5.096956658859122e-06, | |
| "loss": 0.3837, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 5.289811320754717, | |
| "grad_norm": 0.46500205993652344, | |
| "learning_rate": 5.032320686982697e-06, | |
| "loss": 0.3959, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 5.326037735849057, | |
| "grad_norm": 1.2324038743972778, | |
| "learning_rate": 4.967679313017304e-06, | |
| "loss": 0.3785, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 5.362264150943396, | |
| "grad_norm": 0.4382549524307251, | |
| "learning_rate": 4.903043341140879e-06, | |
| "loss": 0.3595, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 5.398490566037736, | |
| "grad_norm": 0.4779147803783417, | |
| "learning_rate": 4.838423574628447e-06, | |
| "loss": 0.369, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 5.434716981132075, | |
| "grad_norm": 0.506566047668457, | |
| "learning_rate": 4.773830814046469e-06, | |
| "loss": 0.3752, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 5.470943396226415, | |
| "grad_norm": 0.4850460886955261, | |
| "learning_rate": 4.7092758554476215e-06, | |
| "loss": 0.3805, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 5.507169811320755, | |
| "grad_norm": 0.4675082266330719, | |
| "learning_rate": 4.644769488566351e-06, | |
| "loss": 0.3696, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 5.543396226415094, | |
| "grad_norm": 0.4534352123737335, | |
| "learning_rate": 4.580322495015466e-06, | |
| "loss": 0.3937, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 5.579622641509434, | |
| "grad_norm": 0.4178565442562103, | |
| "learning_rate": 4.515945646484105e-06, | |
| "loss": 0.3673, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 5.615849056603773, | |
| "grad_norm": 0.43837058544158936, | |
| "learning_rate": 4.451649702937343e-06, | |
| "loss": 0.3932, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 5.652075471698113, | |
| "grad_norm": 0.5009051561355591, | |
| "learning_rate": 4.387445410817774e-06, | |
| "loss": 0.3767, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 5.688301886792453, | |
| "grad_norm": 0.4401375651359558, | |
| "learning_rate": 4.323343501249346e-06, | |
| "loss": 0.3836, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 5.724528301886792, | |
| "grad_norm": 0.49288874864578247, | |
| "learning_rate": 4.259354688243758e-06, | |
| "loss": 0.3735, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 5.760754716981132, | |
| "grad_norm": 0.41812247037887573, | |
| "learning_rate": 4.195489666909714e-06, | |
| "loss": 0.3632, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 5.796981132075472, | |
| "grad_norm": 0.4192414879798889, | |
| "learning_rate": 4.131759111665349e-06, | |
| "loss": 0.3742, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 5.8332075471698115, | |
| "grad_norm": 0.46856996417045593, | |
| "learning_rate": 4.06817367445409e-06, | |
| "loss": 0.3842, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 5.869433962264151, | |
| "grad_norm": 0.4367072582244873, | |
| "learning_rate": 4.004743982964298e-06, | |
| "loss": 0.3778, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 5.90566037735849, | |
| "grad_norm": 0.4130428433418274, | |
| "learning_rate": 3.941480638852948e-06, | |
| "loss": 0.392, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 5.94188679245283, | |
| "grad_norm": 0.44017598032951355, | |
| "learning_rate": 3.878394215973663e-06, | |
| "loss": 0.3997, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 5.97811320754717, | |
| "grad_norm": 0.44622254371643066, | |
| "learning_rate": 3.815495258609404e-06, | |
| "loss": 0.3827, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.44622254371643066, | |
| "learning_rate": 3.752794279710094e-06, | |
| "loss": 0.4119, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 6.036226415094339, | |
| "grad_norm": 0.6544888615608215, | |
| "learning_rate": 3.690301759135471e-06, | |
| "loss": 0.3342, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 6.072452830188679, | |
| "grad_norm": 0.48705703020095825, | |
| "learning_rate": 3.6280281419034934e-06, | |
| "loss": 0.3163, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 6.108679245283019, | |
| "grad_norm": 0.42752087116241455, | |
| "learning_rate": 3.5659838364445505e-06, | |
| "loss": 0.3442, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 6.144905660377359, | |
| "grad_norm": 0.3857191205024719, | |
| "learning_rate": 3.504179212861793e-06, | |
| "loss": 0.3319, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 6.181132075471698, | |
| "grad_norm": 0.46637335419654846, | |
| "learning_rate": 3.442624601197877e-06, | |
| "loss": 0.3167, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 6.2173584905660375, | |
| "grad_norm": 0.44155874848365784, | |
| "learning_rate": 3.3813302897083955e-06, | |
| "loss": 0.3032, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 6.253584905660377, | |
| "grad_norm": 0.4230786859989166, | |
| "learning_rate": 3.3203065231422904e-06, | |
| "loss": 0.3082, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 6.289811320754717, | |
| "grad_norm": 0.45516237616539, | |
| "learning_rate": 3.259563501029548e-06, | |
| "loss": 0.3323, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 6.326037735849057, | |
| "grad_norm": 0.4669758081436157, | |
| "learning_rate": 3.1991113759764493e-06, | |
| "loss": 0.3143, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 6.362264150943396, | |
| "grad_norm": 0.5168375968933105, | |
| "learning_rate": 3.1389602519686515e-06, | |
| "loss": 0.3154, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 6.398490566037736, | |
| "grad_norm": 0.4606865644454956, | |
| "learning_rate": 3.0791201826824117e-06, | |
| "loss": 0.3067, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 6.434716981132075, | |
| "grad_norm": 0.4242306053638458, | |
| "learning_rate": 3.019601169804216e-06, | |
| "loss": 0.3326, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 6.470943396226415, | |
| "grad_norm": 0.4492356777191162, | |
| "learning_rate": 2.9604131613590825e-06, | |
| "loss": 0.3422, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 6.507169811320755, | |
| "grad_norm": 0.4301685690879822, | |
| "learning_rate": 2.901566050047855e-06, | |
| "loss": 0.3071, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 6.543396226415094, | |
| "grad_norm": 0.46162980794906616, | |
| "learning_rate": 2.843069671593734e-06, | |
| "loss": 0.3084, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 6.579622641509434, | |
| "grad_norm": 0.43854185938835144, | |
| "learning_rate": 2.784933803098326e-06, | |
| "loss": 0.3256, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 6.615849056603773, | |
| "grad_norm": 0.46818047761917114, | |
| "learning_rate": 2.7271681614074973e-06, | |
| "loss": 0.3109, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 6.652075471698113, | |
| "grad_norm": 0.3902846872806549, | |
| "learning_rate": 2.6697824014873076e-06, | |
| "loss": 0.303, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 6.688301886792453, | |
| "grad_norm": 0.4288512170314789, | |
| "learning_rate": 2.6127861148102552e-06, | |
| "loss": 0.2988, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 6.724528301886792, | |
| "grad_norm": 0.3952915668487549, | |
| "learning_rate": 2.5561888277521797e-06, | |
| "loss": 0.3003, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 6.760754716981132, | |
| "grad_norm": 0.4569963812828064, | |
| "learning_rate": 2.5000000000000015e-06, | |
| "loss": 0.3145, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 6.796981132075472, | |
| "grad_norm": 0.448163241147995, | |
| "learning_rate": 2.4442290229706344e-06, | |
| "loss": 0.3302, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 6.8332075471698115, | |
| "grad_norm": 0.44628778100013733, | |
| "learning_rate": 2.3888852182413087e-06, | |
| "loss": 0.3256, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 6.869433962264151, | |
| "grad_norm": 0.4496423900127411, | |
| "learning_rate": 2.333977835991545e-06, | |
| "loss": 0.3399, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 6.90566037735849, | |
| "grad_norm": 0.4480762481689453, | |
| "learning_rate": 2.2795160534570866e-06, | |
| "loss": 0.3202, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 6.94188679245283, | |
| "grad_norm": 0.4092267155647278, | |
| "learning_rate": 2.2255089733960162e-06, | |
| "loss": 0.3302, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 6.97811320754717, | |
| "grad_norm": 0.43138426542282104, | |
| "learning_rate": 2.171965622567308e-06, | |
| "loss": 0.3106, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.5887420773506165, | |
| "learning_rate": 2.1188949502220987e-06, | |
| "loss": 0.3219, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 7.036226415094339, | |
| "grad_norm": 0.556282103061676, | |
| "learning_rate": 2.066305826607911e-06, | |
| "loss": 0.3141, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 7.072452830188679, | |
| "grad_norm": 0.4548152983188629, | |
| "learning_rate": 2.0142070414860704e-06, | |
| "loss": 0.2697, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 7.108679245283019, | |
| "grad_norm": 0.45477020740509033, | |
| "learning_rate": 1.962607302662582e-06, | |
| "loss": 0.27, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 7.144905660377359, | |
| "grad_norm": 0.3835766911506653, | |
| "learning_rate": 1.9115152345327154e-06, | |
| "loss": 0.2718, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 7.181132075471698, | |
| "grad_norm": 0.4106207489967346, | |
| "learning_rate": 1.8609393766395083e-06, | |
| "loss": 0.2902, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 7.2173584905660375, | |
| "grad_norm": 0.40980860590934753, | |
| "learning_rate": 1.8108881822464697e-06, | |
| "loss": 0.294, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 7.253584905660377, | |
| "grad_norm": 0.4507136940956116, | |
| "learning_rate": 1.7613700169247055e-06, | |
| "loss": 0.2941, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 7.289811320754717, | |
| "grad_norm": 0.3944286108016968, | |
| "learning_rate": 1.7123931571546826e-06, | |
| "loss": 0.2596, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 7.326037735849057, | |
| "grad_norm": 0.42168113589286804, | |
| "learning_rate": 1.6639657889429017e-06, | |
| "loss": 0.2757, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 7.362264150943396, | |
| "grad_norm": 0.45149892568588257, | |
| "learning_rate": 1.6160960064536907e-06, | |
| "loss": 0.2618, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 7.398490566037736, | |
| "grad_norm": 0.4606323540210724, | |
| "learning_rate": 1.5687918106563326e-06, | |
| "loss": 0.2756, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 7.434716981132075, | |
| "grad_norm": 0.5498378872871399, | |
| "learning_rate": 1.52206110798779e-06, | |
| "loss": 0.2818, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 7.470943396226415, | |
| "grad_norm": 0.4124191403388977, | |
| "learning_rate": 1.4759117090312197e-06, | |
| "loss": 0.2731, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 7.507169811320755, | |
| "grad_norm": 0.44834864139556885, | |
| "learning_rate": 1.4303513272105057e-06, | |
| "loss": 0.2687, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 7.543396226415094, | |
| "grad_norm": 0.39862528443336487, | |
| "learning_rate": 1.3853875775010355e-06, | |
| "loss": 0.2688, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 7.579622641509434, | |
| "grad_norm": 0.48807084560394287, | |
| "learning_rate": 1.3410279751569399e-06, | |
| "loss": 0.3119, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 7.615849056603773, | |
| "grad_norm": 0.44604766368865967, | |
| "learning_rate": 1.297279934454978e-06, | |
| "loss": 0.2621, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 7.652075471698113, | |
| "grad_norm": 0.3903985619544983, | |
| "learning_rate": 1.25415076745532e-06, | |
| "loss": 0.2745, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 7.688301886792453, | |
| "grad_norm": 0.4241688549518585, | |
| "learning_rate": 1.2116476827794104e-06, | |
| "loss": 0.264, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 7.724528301886792, | |
| "grad_norm": 0.4157409369945526, | |
| "learning_rate": 1.1697777844051105e-06, | |
| "loss": 0.2695, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 7.760754716981132, | |
| "grad_norm": 0.42209798097610474, | |
| "learning_rate": 1.1285480704793378e-06, | |
| "loss": 0.293, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 7.796981132075472, | |
| "grad_norm": 0.39040008187294006, | |
| "learning_rate": 1.0879654321484012e-06, | |
| "loss": 0.2443, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 7.8332075471698115, | |
| "grad_norm": 0.4144212603569031, | |
| "learning_rate": 1.0480366524062041e-06, | |
| "loss": 0.2747, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 7.869433962264151, | |
| "grad_norm": 0.38053098320961, | |
| "learning_rate": 1.008768404960535e-06, | |
| "loss": 0.2634, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 7.90566037735849, | |
| "grad_norm": 0.4866553246974945, | |
| "learning_rate": 9.701672531176287e-07, | |
| "loss": 0.2883, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 7.94188679245283, | |
| "grad_norm": 0.401796817779541, | |
| "learning_rate": 9.322396486851626e-07, | |
| "loss": 0.2885, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 7.97811320754717, | |
| "grad_norm": 0.4356318414211273, | |
| "learning_rate": 8.949919308939081e-07, | |
| "loss": 0.2985, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.4356318414211273, | |
| "learning_rate": 8.584303253381848e-07, | |
| "loss": 0.2454, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 8.03622641509434, | |
| "grad_norm": 0.5875634551048279, | |
| "learning_rate": 8.225609429353187e-07, | |
| "loss": 0.2752, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 8.072452830188679, | |
| "grad_norm": 0.4065380394458771, | |
| "learning_rate": 7.873897789042523e-07, | |
| "loss": 0.2725, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 8.10867924528302, | |
| "grad_norm": 0.47121724486351013, | |
| "learning_rate": 7.529227117635135e-07, | |
| "loss": 0.2802, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 8.144905660377358, | |
| "grad_norm": 0.41164955496788025, | |
| "learning_rate": 7.191655023486682e-07, | |
| "loss": 0.2475, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 8.181132075471698, | |
| "grad_norm": 0.40498465299606323, | |
| "learning_rate": 6.86123792849458e-07, | |
| "loss": 0.2585, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 8.217358490566038, | |
| "grad_norm": 0.39466381072998047, | |
| "learning_rate": 6.53803105866761e-07, | |
| "loss": 0.2552, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 8.253584905660377, | |
| "grad_norm": 0.3694005310535431, | |
| "learning_rate": 6.222088434895462e-07, | |
| "loss": 0.266, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 8.289811320754717, | |
| "grad_norm": 0.3654123544692993, | |
| "learning_rate": 5.9134628639196e-07, | |
| "loss": 0.261, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 8.326037735849056, | |
| "grad_norm": 0.3835010230541229, | |
| "learning_rate": 5.612205929507209e-07, | |
| "loss": 0.262, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 8.362264150943396, | |
| "grad_norm": 0.4232535660266876, | |
| "learning_rate": 5.318367983829393e-07, | |
| "loss": 0.2443, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 8.398490566037736, | |
| "grad_norm": 0.3865967392921448, | |
| "learning_rate": 5.031998139045352e-07, | |
| "loss": 0.2479, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 8.434716981132075, | |
| "grad_norm": 0.3988674581050873, | |
| "learning_rate": 4.753144259093734e-07, | |
| "loss": 0.2569, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 8.470943396226415, | |
| "grad_norm": 0.40713047981262207, | |
| "learning_rate": 4.481852951692672e-07, | |
| "loss": 0.2446, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 8.507169811320754, | |
| "grad_norm": 0.3784768283367157, | |
| "learning_rate": 4.2181695605497066e-07, | |
| "loss": 0.2561, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 8.543396226415094, | |
| "grad_norm": 0.39897289872169495, | |
| "learning_rate": 3.9621381577830855e-07, | |
| "loss": 0.2504, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 8.579622641509435, | |
| "grad_norm": 0.40435102581977844, | |
| "learning_rate": 3.7138015365554834e-07, | |
| "loss": 0.2572, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 8.615849056603773, | |
| "grad_norm": 0.3976576328277588, | |
| "learning_rate": 3.473201203921578e-07, | |
| "loss": 0.2622, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 8.652075471698113, | |
| "grad_norm": 0.40374553203582764, | |
| "learning_rate": 3.2403773738905185e-07, | |
| "loss": 0.2302, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 8.688301886792452, | |
| "grad_norm": 0.39839479327201843, | |
| "learning_rate": 3.015368960704584e-07, | |
| "loss": 0.2658, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 8.724528301886792, | |
| "grad_norm": 0.3632482588291168, | |
| "learning_rate": 2.798213572335001e-07, | |
| "loss": 0.2435, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 8.760754716981133, | |
| "grad_norm": 0.37311646342277527, | |
| "learning_rate": 2.5889475041961767e-07, | |
| "loss": 0.2427, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 8.796981132075471, | |
| "grad_norm": 0.43890783190727234, | |
| "learning_rate": 2.3876057330792344e-07, | |
| "loss": 0.2576, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 8.833207547169811, | |
| "grad_norm": 0.3749382197856903, | |
| "learning_rate": 2.1942219113060215e-07, | |
| "loss": 0.2588, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 8.86943396226415, | |
| "grad_norm": 0.41522374749183655, | |
| "learning_rate": 2.0088283611044034e-07, | |
| "loss": 0.2526, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 8.90566037735849, | |
| "grad_norm": 0.3923836648464203, | |
| "learning_rate": 1.8314560692059836e-07, | |
| "loss": 0.2504, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 8.94188679245283, | |
| "grad_norm": 0.40502244234085083, | |
| "learning_rate": 1.6621346816668993e-07, | |
| "loss": 0.2409, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 8.97811320754717, | |
| "grad_norm": 0.391658753156662, | |
| "learning_rate": 1.500892498912826e-07, | |
| "loss": 0.2607, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.5146093964576721, | |
| "learning_rate": 1.3477564710088097e-07, | |
| "loss": 0.2573, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 9.03622641509434, | |
| "grad_norm": 0.5422486662864685, | |
| "learning_rate": 1.2027521931548214e-07, | |
| "loss": 0.2379, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 9.072452830188679, | |
| "grad_norm": 0.40020889043807983, | |
| "learning_rate": 1.0659039014077943e-07, | |
| "loss": 0.2412, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 9.10867924528302, | |
| "grad_norm": 0.3687814772129059, | |
| "learning_rate": 9.372344686307655e-08, | |
| "loss": 0.2409, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 9.144905660377358, | |
| "grad_norm": 0.380043089389801, | |
| "learning_rate": 8.167654006699444e-08, | |
| "loss": 0.2699, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 9.181132075471698, | |
| "grad_norm": 0.39074811339378357, | |
| "learning_rate": 7.04516832760177e-08, | |
| "loss": 0.2489, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 9.217358490566038, | |
| "grad_norm": 0.38397496938705444, | |
| "learning_rate": 6.005075261595495e-08, | |
| "loss": 0.2585, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 9.253584905660377, | |
| "grad_norm": 0.4563198685646057, | |
| "learning_rate": 5.047548650136513e-08, | |
| "loss": 0.2351, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 9.289811320754717, | |
| "grad_norm": 0.3548761308193207, | |
| "learning_rate": 4.172748534499449e-08, | |
| "loss": 0.2508, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 9.326037735849056, | |
| "grad_norm": 0.41531455516815186, | |
| "learning_rate": 3.3808211290284886e-08, | |
| "loss": 0.2534, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 9.362264150943396, | |
| "grad_norm": 0.34948405623435974, | |
| "learning_rate": 2.6718987966992683e-08, | |
| "loss": 0.2497, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 9.398490566037736, | |
| "grad_norm": 0.3671889305114746, | |
| "learning_rate": 2.0461000269953457e-08, | |
| "loss": 0.2317, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 9.434716981132075, | |
| "grad_norm": 0.43335914611816406, | |
| "learning_rate": 1.5035294161039882e-08, | |
| "loss": 0.2353, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 9.470943396226415, | |
| "grad_norm": 0.41739046573638916, | |
| "learning_rate": 1.044277649433989e-08, | |
| "loss": 0.2589, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 9.507169811320754, | |
| "grad_norm": 0.36336734890937805, | |
| "learning_rate": 6.6842148645840374e-09, | |
| "loss": 0.2228, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 9.543396226415094, | |
| "grad_norm": 0.3958076238632202, | |
| "learning_rate": 3.760237478849793e-09, | |
| "loss": 0.2405, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 9.579622641509435, | |
| "grad_norm": 0.40326353907585144, | |
| "learning_rate": 1.6713330515627512e-09, | |
| "loss": 0.2645, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 9.615849056603773, | |
| "grad_norm": 0.38379284739494324, | |
| "learning_rate": 4.178507228136397e-10, | |
| "loss": 0.2699, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 9.652075471698113, | |
| "grad_norm": 0.4049612283706665, | |
| "learning_rate": 0.0, | |
| "loss": 0.2758, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 9.652075471698113, | |
| "step": 270, | |
| "total_flos": 99687939342336.0, | |
| "train_loss": 0.48553379895510496, | |
| "train_runtime": 12266.0559, | |
| "train_samples_per_second": 1.08, | |
| "train_steps_per_second": 0.022 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 270, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 100000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 99687939342336.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |