| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.511530398322851, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.033542976939203356, | |
| "grad_norm": 0.9615421891212463, | |
| "learning_rate": 1e-05, | |
| "loss": 1.818, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.06708595387840671, | |
| "grad_norm": 1.061348557472229, | |
| "learning_rate": 2e-05, | |
| "loss": 1.9118, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.10062893081761007, | |
| "grad_norm": 0.906833827495575, | |
| "learning_rate": 3e-05, | |
| "loss": 1.7764, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.13417190775681342, | |
| "grad_norm": 0.8332676887512207, | |
| "learning_rate": 4e-05, | |
| "loss": 1.8419, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.16771488469601678, | |
| "grad_norm": 0.6788995265960693, | |
| "learning_rate": 5e-05, | |
| "loss": 1.8896, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.20125786163522014, | |
| "grad_norm": 0.5330966711044312, | |
| "learning_rate": 4.888888888888889e-05, | |
| "loss": 1.7301, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.2348008385744235, | |
| "grad_norm": 0.4760504364967346, | |
| "learning_rate": 4.7777777777777784e-05, | |
| "loss": 1.7193, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.26834381551362685, | |
| "grad_norm": 0.3972032070159912, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 1.6832, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.3018867924528302, | |
| "grad_norm": 0.3579612672328949, | |
| "learning_rate": 4.555555555555556e-05, | |
| "loss": 1.608, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.33542976939203356, | |
| "grad_norm": 0.3818889856338501, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 1.8124, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.3689727463312369, | |
| "grad_norm": 0.3447263538837433, | |
| "learning_rate": 4.3333333333333334e-05, | |
| "loss": 1.7331, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.4025157232704403, | |
| "grad_norm": 0.323868989944458, | |
| "learning_rate": 4.222222222222222e-05, | |
| "loss": 1.7071, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.4360587002096436, | |
| "grad_norm": 0.3421488106250763, | |
| "learning_rate": 4.111111111111111e-05, | |
| "loss": 1.6768, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.469601677148847, | |
| "grad_norm": 0.3541533052921295, | |
| "learning_rate": 4e-05, | |
| "loss": 1.7081, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.5031446540880503, | |
| "grad_norm": 0.33424726128578186, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 1.536, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.5366876310272537, | |
| "grad_norm": 0.36894017457962036, | |
| "learning_rate": 3.777777777777778e-05, | |
| "loss": 1.5753, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.570230607966457, | |
| "grad_norm": 0.3404862880706787, | |
| "learning_rate": 3.6666666666666666e-05, | |
| "loss": 1.6014, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.6037735849056604, | |
| "grad_norm": 0.3427893817424774, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 1.6454, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.6373165618448637, | |
| "grad_norm": 0.3238353431224823, | |
| "learning_rate": 3.444444444444445e-05, | |
| "loss": 1.5531, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.6708595387840671, | |
| "grad_norm": 0.318460077047348, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 1.6036, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.7044025157232704, | |
| "grad_norm": 0.3185439109802246, | |
| "learning_rate": 3.222222222222223e-05, | |
| "loss": 1.5285, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.7379454926624738, | |
| "grad_norm": 0.3249723017215729, | |
| "learning_rate": 3.111111111111111e-05, | |
| "loss": 1.5633, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.7714884696016772, | |
| "grad_norm": 0.3257281184196472, | |
| "learning_rate": 3e-05, | |
| "loss": 1.5892, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.8050314465408805, | |
| "grad_norm": 0.35130995512008667, | |
| "learning_rate": 2.8888888888888888e-05, | |
| "loss": 1.6089, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.8385744234800838, | |
| "grad_norm": 0.32942768931388855, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 1.5397, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.8721174004192872, | |
| "grad_norm": 0.3389425575733185, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 1.5448, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.9056603773584906, | |
| "grad_norm": 0.322301983833313, | |
| "learning_rate": 2.5555555555555554e-05, | |
| "loss": 1.506, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.939203354297694, | |
| "grad_norm": 0.3191693127155304, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 1.5321, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.9727463312368972, | |
| "grad_norm": 0.330905020236969, | |
| "learning_rate": 2.3333333333333336e-05, | |
| "loss": 1.734, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.0335429769392033, | |
| "grad_norm": 0.6754521131515503, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 3.2346, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.0670859538784068, | |
| "grad_norm": 0.31602275371551514, | |
| "learning_rate": 2.111111111111111e-05, | |
| "loss": 1.5563, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.10062893081761, | |
| "grad_norm": 0.3237570822238922, | |
| "learning_rate": 2e-05, | |
| "loss": 1.5454, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.1341719077568135, | |
| "grad_norm": 0.30346807837486267, | |
| "learning_rate": 1.888888888888889e-05, | |
| "loss": 1.5016, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.1677148846960168, | |
| "grad_norm": 0.3014126121997833, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 1.5729, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.20125786163522, | |
| "grad_norm": 0.307090699672699, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 1.6538, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.2348008385744236, | |
| "grad_norm": 0.29638656973838806, | |
| "learning_rate": 1.5555555555555555e-05, | |
| "loss": 1.5825, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.2683438155136268, | |
| "grad_norm": 0.3165188431739807, | |
| "learning_rate": 1.4444444444444444e-05, | |
| "loss": 1.736, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.3018867924528301, | |
| "grad_norm": 0.2909906804561615, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 1.5676, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 1.3354297693920336, | |
| "grad_norm": 0.2958202064037323, | |
| "learning_rate": 1.2222222222222222e-05, | |
| "loss": 1.5789, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 1.368972746331237, | |
| "grad_norm": 0.3097296953201294, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 1.6394, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.4025157232704402, | |
| "grad_norm": 0.31482434272766113, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6685, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 1.4360587002096437, | |
| "grad_norm": 0.2970486283302307, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 1.5871, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.469601677148847, | |
| "grad_norm": 0.29967784881591797, | |
| "learning_rate": 7.777777777777777e-06, | |
| "loss": 1.508, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 1.5031446540880502, | |
| "grad_norm": 0.288617342710495, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 1.5807, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 1.5366876310272537, | |
| "grad_norm": 0.298141747713089, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 1.6094, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.570230607966457, | |
| "grad_norm": 0.300231009721756, | |
| "learning_rate": 4.444444444444445e-06, | |
| "loss": 1.6047, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 1.6037735849056602, | |
| "grad_norm": 0.3105227053165436, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 1.6895, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 1.6373165618448637, | |
| "grad_norm": 0.29686439037323, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 1.568, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.6708595387840672, | |
| "grad_norm": 0.29341980814933777, | |
| "learning_rate": 1.1111111111111112e-06, | |
| "loss": 1.4565, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.7044025157232703, | |
| "grad_norm": 0.29961156845092773, | |
| "learning_rate": 0.0, | |
| "loss": 1.5627, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.4276729559748428, | |
| "grad_norm": 0.43133091926574707, | |
| "learning_rate": 4.220338983050848e-05, | |
| "loss": 1.6502, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.4360587002096436, | |
| "grad_norm": 0.409037321805954, | |
| "learning_rate": 4.2033898305084746e-05, | |
| "loss": 1.7093, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 0.3889688551425934, | |
| "learning_rate": 4.186440677966102e-05, | |
| "loss": 1.6019, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.4528301886792453, | |
| "grad_norm": 0.4061110019683838, | |
| "learning_rate": 4.1694915254237285e-05, | |
| "loss": 1.7135, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.4612159329140461, | |
| "grad_norm": 0.4317370057106018, | |
| "learning_rate": 4.152542372881356e-05, | |
| "loss": 1.6998, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.469601677148847, | |
| "grad_norm": 0.40719956159591675, | |
| "learning_rate": 4.135593220338983e-05, | |
| "loss": 1.6249, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.4779874213836478, | |
| "grad_norm": 0.3879191279411316, | |
| "learning_rate": 4.1186440677966105e-05, | |
| "loss": 1.5039, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.4863731656184486, | |
| "grad_norm": 0.4131089448928833, | |
| "learning_rate": 4.101694915254237e-05, | |
| "loss": 1.5849, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.4947589098532495, | |
| "grad_norm": 0.3905002772808075, | |
| "learning_rate": 4.0847457627118644e-05, | |
| "loss": 1.4827, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.5031446540880503, | |
| "grad_norm": 0.4169052839279175, | |
| "learning_rate": 4.067796610169492e-05, | |
| "loss": 1.3878, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5115303983228512, | |
| "grad_norm": 0.42767494916915894, | |
| "learning_rate": 4.050847457627119e-05, | |
| "loss": 1.5039, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.519916142557652, | |
| "grad_norm": 0.42942067980766296, | |
| "learning_rate": 4.0338983050847464e-05, | |
| "loss": 1.5792, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.5283018867924528, | |
| "grad_norm": 0.45012345910072327, | |
| "learning_rate": 4.016949152542373e-05, | |
| "loss": 1.5006, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.5366876310272537, | |
| "grad_norm": 0.43249914050102234, | |
| "learning_rate": 4e-05, | |
| "loss": 1.572, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.5450733752620545, | |
| "grad_norm": 0.43645647168159485, | |
| "learning_rate": 3.983050847457627e-05, | |
| "loss": 1.7374, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.5534591194968553, | |
| "grad_norm": 0.41012486815452576, | |
| "learning_rate": 3.966101694915255e-05, | |
| "loss": 1.4895, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.5618448637316562, | |
| "grad_norm": 0.4467809796333313, | |
| "learning_rate": 3.9491525423728816e-05, | |
| "loss": 1.7326, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.570230607966457, | |
| "grad_norm": 0.4244528114795685, | |
| "learning_rate": 3.932203389830509e-05, | |
| "loss": 1.4259, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.5786163522012578, | |
| "grad_norm": 0.449232280254364, | |
| "learning_rate": 3.9152542372881355e-05, | |
| "loss": 1.6682, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.5870020964360587, | |
| "grad_norm": 0.4241749942302704, | |
| "learning_rate": 3.898305084745763e-05, | |
| "loss": 1.6646, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.5953878406708596, | |
| "grad_norm": 0.4776236116886139, | |
| "learning_rate": 3.88135593220339e-05, | |
| "loss": 1.6555, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.6037735849056604, | |
| "grad_norm": 0.4678778052330017, | |
| "learning_rate": 3.8644067796610175e-05, | |
| "loss": 1.4964, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.6121593291404612, | |
| "grad_norm": 0.4315565526485443, | |
| "learning_rate": 3.847457627118644e-05, | |
| "loss": 1.5025, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.6205450733752621, | |
| "grad_norm": 0.3997185528278351, | |
| "learning_rate": 3.8305084745762714e-05, | |
| "loss": 1.497, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.6289308176100629, | |
| "grad_norm": 0.42872926592826843, | |
| "learning_rate": 3.813559322033898e-05, | |
| "loss": 1.4873, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.6373165618448637, | |
| "grad_norm": 0.45695438981056213, | |
| "learning_rate": 3.7966101694915254e-05, | |
| "loss": 1.606, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.6457023060796646, | |
| "grad_norm": 0.4163571894168854, | |
| "learning_rate": 3.779661016949153e-05, | |
| "loss": 1.4837, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.6540880503144654, | |
| "grad_norm": 0.45837995409965515, | |
| "learning_rate": 3.76271186440678e-05, | |
| "loss": 1.5733, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.6624737945492662, | |
| "grad_norm": 0.4821924865245819, | |
| "learning_rate": 3.745762711864407e-05, | |
| "loss": 1.742, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.6708595387840671, | |
| "grad_norm": 0.4537578225135803, | |
| "learning_rate": 3.728813559322034e-05, | |
| "loss": 1.5152, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.6792452830188679, | |
| "grad_norm": 0.42695993185043335, | |
| "learning_rate": 3.711864406779661e-05, | |
| "loss": 1.5123, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.6876310272536688, | |
| "grad_norm": 0.436599463224411, | |
| "learning_rate": 3.6949152542372886e-05, | |
| "loss": 1.4442, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.6960167714884696, | |
| "grad_norm": 0.44244834780693054, | |
| "learning_rate": 3.677966101694915e-05, | |
| "loss": 1.3896, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.7044025157232704, | |
| "grad_norm": 0.4942834675312042, | |
| "learning_rate": 3.6610169491525426e-05, | |
| "loss": 1.5767, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.7127882599580713, | |
| "grad_norm": 0.4958462119102478, | |
| "learning_rate": 3.644067796610169e-05, | |
| "loss": 1.745, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.7211740041928721, | |
| "grad_norm": 0.4499577581882477, | |
| "learning_rate": 3.6271186440677965e-05, | |
| "loss": 1.4771, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.7295597484276729, | |
| "grad_norm": 0.486020565032959, | |
| "learning_rate": 3.610169491525424e-05, | |
| "loss": 1.5388, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.7379454926624738, | |
| "grad_norm": 0.44480133056640625, | |
| "learning_rate": 3.593220338983051e-05, | |
| "loss": 1.377, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.7463312368972747, | |
| "grad_norm": 0.45817309617996216, | |
| "learning_rate": 3.5762711864406785e-05, | |
| "loss": 1.4927, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.7547169811320755, | |
| "grad_norm": 0.5093894004821777, | |
| "learning_rate": 3.559322033898305e-05, | |
| "loss": 1.5837, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.7631027253668763, | |
| "grad_norm": 0.4713049829006195, | |
| "learning_rate": 3.5423728813559324e-05, | |
| "loss": 1.6117, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.7714884696016772, | |
| "grad_norm": 0.45069509744644165, | |
| "learning_rate": 3.52542372881356e-05, | |
| "loss": 1.4815, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.779874213836478, | |
| "grad_norm": 0.5270215272903442, | |
| "learning_rate": 3.508474576271187e-05, | |
| "loss": 1.5292, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.7882599580712788, | |
| "grad_norm": 0.5345816016197205, | |
| "learning_rate": 3.491525423728814e-05, | |
| "loss": 1.7355, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.7966457023060797, | |
| "grad_norm": 0.43932732939720154, | |
| "learning_rate": 3.474576271186441e-05, | |
| "loss": 1.3706, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8050314465408805, | |
| "grad_norm": 0.5658639073371887, | |
| "learning_rate": 3.4576271186440676e-05, | |
| "loss": 1.6624, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.8134171907756813, | |
| "grad_norm": 0.4958181083202362, | |
| "learning_rate": 3.440677966101695e-05, | |
| "loss": 1.446, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.8218029350104822, | |
| "grad_norm": 0.45654749870300293, | |
| "learning_rate": 3.423728813559322e-05, | |
| "loss": 1.3261, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.8301886792452831, | |
| "grad_norm": 0.5374109745025635, | |
| "learning_rate": 3.4067796610169496e-05, | |
| "loss": 1.6, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.8385744234800838, | |
| "grad_norm": 0.5511431097984314, | |
| "learning_rate": 3.389830508474576e-05, | |
| "loss": 1.623, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.8469601677148847, | |
| "grad_norm": 0.5506657361984253, | |
| "learning_rate": 3.3728813559322035e-05, | |
| "loss": 1.3841, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.8553459119496856, | |
| "grad_norm": 0.5375157594680786, | |
| "learning_rate": 3.35593220338983e-05, | |
| "loss": 1.5645, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.8637316561844863, | |
| "grad_norm": 0.4963093101978302, | |
| "learning_rate": 3.338983050847458e-05, | |
| "loss": 1.4993, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.8721174004192872, | |
| "grad_norm": 0.5070456862449646, | |
| "learning_rate": 3.322033898305085e-05, | |
| "loss": 1.4661, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.8805031446540881, | |
| "grad_norm": 0.48827776312828064, | |
| "learning_rate": 3.305084745762712e-05, | |
| "loss": 1.5453, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.5393761396408081, | |
| "learning_rate": 3.288135593220339e-05, | |
| "loss": 1.4993, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.8972746331236897, | |
| "grad_norm": 0.47431623935699463, | |
| "learning_rate": 3.271186440677966e-05, | |
| "loss": 1.3194, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.9056603773584906, | |
| "grad_norm": 0.5005940794944763, | |
| "learning_rate": 3.2542372881355934e-05, | |
| "loss": 1.4166, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.9140461215932913, | |
| "grad_norm": 0.5325838327407837, | |
| "learning_rate": 3.237288135593221e-05, | |
| "loss": 1.3792, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.9224318658280922, | |
| "grad_norm": 0.48578980565071106, | |
| "learning_rate": 3.2203389830508473e-05, | |
| "loss": 1.3679, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.9308176100628931, | |
| "grad_norm": 0.5063319206237793, | |
| "learning_rate": 3.203389830508475e-05, | |
| "loss": 1.507, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.939203354297694, | |
| "grad_norm": 0.5529047250747681, | |
| "learning_rate": 3.186440677966101e-05, | |
| "loss": 1.6027, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.9475890985324947, | |
| "grad_norm": 0.5580345392227173, | |
| "learning_rate": 3.169491525423729e-05, | |
| "loss": 1.5906, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.9559748427672956, | |
| "grad_norm": 0.5370936393737793, | |
| "learning_rate": 3.1525423728813566e-05, | |
| "loss": 1.8187, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.9643605870020965, | |
| "grad_norm": 0.5383415222167969, | |
| "learning_rate": 3.135593220338983e-05, | |
| "loss": 1.688, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.9727463312368972, | |
| "grad_norm": 0.550933837890625, | |
| "learning_rate": 3.1186440677966106e-05, | |
| "loss": 1.5902, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.9811320754716981, | |
| "grad_norm": 0.5485110878944397, | |
| "learning_rate": 3.101694915254237e-05, | |
| "loss": 1.5818, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.989517819706499, | |
| "grad_norm": 0.6686434149742126, | |
| "learning_rate": 3.0847457627118645e-05, | |
| "loss": 1.5586, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.9979035639412998, | |
| "grad_norm": 0.5468031167984009, | |
| "learning_rate": 3.067796610169492e-05, | |
| "loss": 1.7003, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.0083857442348008, | |
| "grad_norm": 1.3953214883804321, | |
| "learning_rate": 3.050847457627119e-05, | |
| "loss": 3.0796, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.0167714884696017, | |
| "grad_norm": 0.5408557057380676, | |
| "learning_rate": 3.0338983050847458e-05, | |
| "loss": 1.4953, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.0251572327044025, | |
| "grad_norm": 0.5604081749916077, | |
| "learning_rate": 3.016949152542373e-05, | |
| "loss": 1.391, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.0335429769392033, | |
| "grad_norm": 0.5473874807357788, | |
| "learning_rate": 3e-05, | |
| "loss": 1.5113, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.0419287211740043, | |
| "grad_norm": 0.5697469115257263, | |
| "learning_rate": 2.9830508474576274e-05, | |
| "loss": 1.3537, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.050314465408805, | |
| "grad_norm": 0.6473388075828552, | |
| "learning_rate": 2.9661016949152544e-05, | |
| "loss": 1.4589, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.0587002096436058, | |
| "grad_norm": 0.5580431222915649, | |
| "learning_rate": 2.9491525423728817e-05, | |
| "loss": 1.5658, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.0670859538784068, | |
| "grad_norm": 0.5432992577552795, | |
| "learning_rate": 2.9322033898305083e-05, | |
| "loss": 1.4302, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.0754716981132075, | |
| "grad_norm": 0.5975386500358582, | |
| "learning_rate": 2.915254237288136e-05, | |
| "loss": 1.5508, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.0838574423480083, | |
| "grad_norm": 0.565565288066864, | |
| "learning_rate": 2.8983050847457626e-05, | |
| "loss": 1.4234, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.0922431865828093, | |
| "grad_norm": 0.6301350593566895, | |
| "learning_rate": 2.88135593220339e-05, | |
| "loss": 1.5267, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.10062893081761, | |
| "grad_norm": 0.5254076719284058, | |
| "learning_rate": 2.864406779661017e-05, | |
| "loss": 1.3074, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.1090146750524108, | |
| "grad_norm": 0.6312873959541321, | |
| "learning_rate": 2.8474576271186442e-05, | |
| "loss": 1.4878, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.1174004192872118, | |
| "grad_norm": 0.4790211319923401, | |
| "learning_rate": 2.8305084745762712e-05, | |
| "loss": 1.3158, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.1257861635220126, | |
| "grad_norm": 0.5577117800712585, | |
| "learning_rate": 2.8135593220338985e-05, | |
| "loss": 1.4097, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.1341719077568135, | |
| "grad_norm": 0.5645062327384949, | |
| "learning_rate": 2.7966101694915255e-05, | |
| "loss": 1.4276, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.1425576519916143, | |
| "grad_norm": 0.5814913511276245, | |
| "learning_rate": 2.7796610169491528e-05, | |
| "loss": 1.401, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.150943396226415, | |
| "grad_norm": 0.5780409574508667, | |
| "learning_rate": 2.7627118644067794e-05, | |
| "loss": 1.4739, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.159329140461216, | |
| "grad_norm": 0.6315497756004333, | |
| "learning_rate": 2.7457627118644068e-05, | |
| "loss": 1.5386, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.1677148846960168, | |
| "grad_norm": 0.5676960945129395, | |
| "learning_rate": 2.7288135593220337e-05, | |
| "loss": 1.4961, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.1761006289308176, | |
| "grad_norm": 0.5943715572357178, | |
| "learning_rate": 2.711864406779661e-05, | |
| "loss": 1.6106, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.1844863731656186, | |
| "grad_norm": 0.5691059231758118, | |
| "learning_rate": 2.6949152542372884e-05, | |
| "loss": 1.5345, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.1928721174004193, | |
| "grad_norm": 0.6429669260978699, | |
| "learning_rate": 2.6779661016949153e-05, | |
| "loss": 1.6742, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.20125786163522, | |
| "grad_norm": 0.5904098749160767, | |
| "learning_rate": 2.6610169491525427e-05, | |
| "loss": 1.458, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.209643605870021, | |
| "grad_norm": 0.5914203524589539, | |
| "learning_rate": 2.6440677966101696e-05, | |
| "loss": 1.5086, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.2180293501048218, | |
| "grad_norm": 0.6000847816467285, | |
| "learning_rate": 2.627118644067797e-05, | |
| "loss": 1.4316, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.2264150943396226, | |
| "grad_norm": 0.6070534586906433, | |
| "learning_rate": 2.610169491525424e-05, | |
| "loss": 1.4388, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.2348008385744236, | |
| "grad_norm": 0.5641275644302368, | |
| "learning_rate": 2.5932203389830512e-05, | |
| "loss": 1.5318, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.2431865828092243, | |
| "grad_norm": 0.5671488642692566, | |
| "learning_rate": 2.576271186440678e-05, | |
| "loss": 1.6092, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.251572327044025, | |
| "grad_norm": 0.6899793744087219, | |
| "learning_rate": 2.5593220338983055e-05, | |
| "loss": 1.536, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.259958071278826, | |
| "grad_norm": 0.6142588257789612, | |
| "learning_rate": 2.5423728813559322e-05, | |
| "loss": 1.6281, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.2683438155136268, | |
| "grad_norm": 0.6308810114860535, | |
| "learning_rate": 2.5254237288135595e-05, | |
| "loss": 1.6989, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.2767295597484276, | |
| "grad_norm": 0.699433445930481, | |
| "learning_rate": 2.5084745762711865e-05, | |
| "loss": 1.4067, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.2851153039832286, | |
| "grad_norm": 0.6100484132766724, | |
| "learning_rate": 2.4915254237288138e-05, | |
| "loss": 1.569, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.2935010482180294, | |
| "grad_norm": 0.5674847364425659, | |
| "learning_rate": 2.4745762711864408e-05, | |
| "loss": 1.4536, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.3018867924528301, | |
| "grad_norm": 0.6240501999855042, | |
| "learning_rate": 2.457627118644068e-05, | |
| "loss": 1.3428, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.310272536687631, | |
| "grad_norm": 0.6679978370666504, | |
| "learning_rate": 2.440677966101695e-05, | |
| "loss": 1.5999, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.3186582809224319, | |
| "grad_norm": 0.5994001626968384, | |
| "learning_rate": 2.4237288135593224e-05, | |
| "loss": 1.5542, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.3270440251572326, | |
| "grad_norm": 0.6358633041381836, | |
| "learning_rate": 2.4067796610169493e-05, | |
| "loss": 1.3593, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.3354297693920336, | |
| "grad_norm": 0.5659995079040527, | |
| "learning_rate": 2.3898305084745763e-05, | |
| "loss": 1.3259, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.3438155136268344, | |
| "grad_norm": 0.7298100590705872, | |
| "learning_rate": 2.3728813559322036e-05, | |
| "loss": 1.5724, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.3522012578616351, | |
| "grad_norm": 0.6506521701812744, | |
| "learning_rate": 2.3559322033898306e-05, | |
| "loss": 1.5445, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.3605870020964361, | |
| "grad_norm": 0.6763033866882324, | |
| "learning_rate": 2.338983050847458e-05, | |
| "loss": 1.5003, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.368972746331237, | |
| "grad_norm": 0.5723408460617065, | |
| "learning_rate": 2.322033898305085e-05, | |
| "loss": 1.5313, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.3773584905660377, | |
| "grad_norm": 0.6918197870254517, | |
| "learning_rate": 2.305084745762712e-05, | |
| "loss": 1.5711, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.3857442348008386, | |
| "grad_norm": 0.6125330924987793, | |
| "learning_rate": 2.2881355932203392e-05, | |
| "loss": 1.395, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.3941299790356394, | |
| "grad_norm": 0.6379712820053101, | |
| "learning_rate": 2.271186440677966e-05, | |
| "loss": 1.684, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.4025157232704402, | |
| "grad_norm": 0.6271690726280212, | |
| "learning_rate": 2.2542372881355935e-05, | |
| "loss": 1.4623, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.4109014675052411, | |
| "grad_norm": 0.6018547415733337, | |
| "learning_rate": 2.2372881355932205e-05, | |
| "loss": 1.3288, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.419287211740042, | |
| "grad_norm": 0.6406589150428772, | |
| "learning_rate": 2.2203389830508474e-05, | |
| "loss": 1.4531, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.4276729559748427, | |
| "grad_norm": 0.6161438822746277, | |
| "learning_rate": 2.2033898305084748e-05, | |
| "loss": 1.5999, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.4360587002096437, | |
| "grad_norm": 0.614861249923706, | |
| "learning_rate": 2.1864406779661017e-05, | |
| "loss": 1.3684, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 0.6136622428894043, | |
| "learning_rate": 2.1694915254237287e-05, | |
| "loss": 1.399, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.4528301886792452, | |
| "grad_norm": 0.5674051642417908, | |
| "learning_rate": 2.152542372881356e-05, | |
| "loss": 1.274, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.4612159329140462, | |
| "grad_norm": 0.6396893858909607, | |
| "learning_rate": 2.135593220338983e-05, | |
| "loss": 1.5185, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.469601677148847, | |
| "grad_norm": 0.6016610264778137, | |
| "learning_rate": 2.1186440677966103e-05, | |
| "loss": 1.2965, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.4779874213836477, | |
| "grad_norm": 0.6875283122062683, | |
| "learning_rate": 2.1016949152542373e-05, | |
| "loss": 1.6257, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.4863731656184487, | |
| "grad_norm": 0.5814647078514099, | |
| "learning_rate": 2.0847457627118643e-05, | |
| "loss": 1.3111, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.4947589098532494, | |
| "grad_norm": 0.6307722926139832, | |
| "learning_rate": 2.0677966101694916e-05, | |
| "loss": 1.547, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.5031446540880502, | |
| "grad_norm": 0.588858962059021, | |
| "learning_rate": 2.0508474576271186e-05, | |
| "loss": 1.3264, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.5115303983228512, | |
| "grad_norm": 0.669362485408783, | |
| "learning_rate": 2.033898305084746e-05, | |
| "loss": 1.5909, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.519916142557652, | |
| "grad_norm": 0.7193084359169006, | |
| "learning_rate": 2.0169491525423732e-05, | |
| "loss": 1.5327, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.5283018867924527, | |
| "grad_norm": 0.635857105255127, | |
| "learning_rate": 2e-05, | |
| "loss": 1.4276, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.5366876310272537, | |
| "grad_norm": 0.636381208896637, | |
| "learning_rate": 1.9830508474576275e-05, | |
| "loss": 1.3813, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.5450733752620545, | |
| "grad_norm": 0.6892669796943665, | |
| "learning_rate": 1.9661016949152545e-05, | |
| "loss": 1.3461, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.5534591194968552, | |
| "grad_norm": 0.671186089515686, | |
| "learning_rate": 1.9491525423728814e-05, | |
| "loss": 1.3873, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.5618448637316562, | |
| "grad_norm": 0.5775100588798523, | |
| "learning_rate": 1.9322033898305087e-05, | |
| "loss": 1.396, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.570230607966457, | |
| "grad_norm": 0.634170651435852, | |
| "learning_rate": 1.9152542372881357e-05, | |
| "loss": 1.6692, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.5786163522012577, | |
| "grad_norm": 0.6621935963630676, | |
| "learning_rate": 1.8983050847457627e-05, | |
| "loss": 1.5905, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.5870020964360587, | |
| "grad_norm": 0.6979579329490662, | |
| "learning_rate": 1.88135593220339e-05, | |
| "loss": 1.5344, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.5953878406708597, | |
| "grad_norm": 0.6624859571456909, | |
| "learning_rate": 1.864406779661017e-05, | |
| "loss": 1.3544, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.6037735849056602, | |
| "grad_norm": 0.6619541645050049, | |
| "learning_rate": 1.8474576271186443e-05, | |
| "loss": 1.6527, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.6121593291404612, | |
| "grad_norm": 0.646507978439331, | |
| "learning_rate": 1.8305084745762713e-05, | |
| "loss": 1.4865, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.6205450733752622, | |
| "grad_norm": 0.6617197394371033, | |
| "learning_rate": 1.8135593220338983e-05, | |
| "loss": 1.5115, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.6289308176100628, | |
| "grad_norm": 0.5884259939193726, | |
| "learning_rate": 1.7966101694915256e-05, | |
| "loss": 1.2601, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.6373165618448637, | |
| "grad_norm": 0.6010658144950867, | |
| "learning_rate": 1.7796610169491526e-05, | |
| "loss": 1.408, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.6457023060796647, | |
| "grad_norm": 0.7407470941543579, | |
| "learning_rate": 1.76271186440678e-05, | |
| "loss": 1.346, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.6540880503144653, | |
| "grad_norm": 0.7493016719818115, | |
| "learning_rate": 1.745762711864407e-05, | |
| "loss": 1.421, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.6624737945492662, | |
| "grad_norm": 0.5945444107055664, | |
| "learning_rate": 1.7288135593220338e-05, | |
| "loss": 1.3774, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.6708595387840672, | |
| "grad_norm": 0.5583181977272034, | |
| "learning_rate": 1.711864406779661e-05, | |
| "loss": 1.1518, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.6792452830188678, | |
| "grad_norm": 0.6571647524833679, | |
| "learning_rate": 1.694915254237288e-05, | |
| "loss": 1.401, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.6876310272536688, | |
| "grad_norm": 0.6961767673492432, | |
| "learning_rate": 1.677966101694915e-05, | |
| "loss": 1.4618, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.6960167714884697, | |
| "grad_norm": 0.6763336062431335, | |
| "learning_rate": 1.6610169491525424e-05, | |
| "loss": 1.3297, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.7044025157232703, | |
| "grad_norm": 0.7434819340705872, | |
| "learning_rate": 1.6440677966101694e-05, | |
| "loss": 1.3833, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.7127882599580713, | |
| "grad_norm": 0.6780304908752441, | |
| "learning_rate": 1.6271186440677967e-05, | |
| "loss": 1.39, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.7211740041928723, | |
| "grad_norm": 0.6340621113777161, | |
| "learning_rate": 1.6101694915254237e-05, | |
| "loss": 1.2273, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.7295597484276728, | |
| "grad_norm": 0.6686990261077881, | |
| "learning_rate": 1.5932203389830507e-05, | |
| "loss": 1.4499, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.7379454926624738, | |
| "grad_norm": 0.7210912108421326, | |
| "learning_rate": 1.5762711864406783e-05, | |
| "loss": 1.4879, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.7463312368972748, | |
| "grad_norm": 0.7638130784034729, | |
| "learning_rate": 1.5593220338983053e-05, | |
| "loss": 1.5784, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.7547169811320755, | |
| "grad_norm": 0.7345211505889893, | |
| "learning_rate": 1.5423728813559323e-05, | |
| "loss": 1.3925, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.7631027253668763, | |
| "grad_norm": 0.5969035625457764, | |
| "learning_rate": 1.5254237288135596e-05, | |
| "loss": 1.2387, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.7714884696016773, | |
| "grad_norm": 0.6565172076225281, | |
| "learning_rate": 1.5084745762711865e-05, | |
| "loss": 1.2855, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.779874213836478, | |
| "grad_norm": 0.6907662153244019, | |
| "learning_rate": 1.4915254237288137e-05, | |
| "loss": 1.3413, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.7882599580712788, | |
| "grad_norm": 0.6184176206588745, | |
| "learning_rate": 1.4745762711864408e-05, | |
| "loss": 1.445, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.7966457023060798, | |
| "grad_norm": 0.6009007096290588, | |
| "learning_rate": 1.457627118644068e-05, | |
| "loss": 1.1403, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.8050314465408805, | |
| "grad_norm": 0.7282977104187012, | |
| "learning_rate": 1.440677966101695e-05, | |
| "loss": 1.4733, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.8134171907756813, | |
| "grad_norm": 0.6807677745819092, | |
| "learning_rate": 1.4237288135593221e-05, | |
| "loss": 1.6621, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.8218029350104823, | |
| "grad_norm": 0.5497955083847046, | |
| "learning_rate": 1.4067796610169493e-05, | |
| "loss": 1.2456, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.830188679245283, | |
| "grad_norm": 0.65602046251297, | |
| "learning_rate": 1.3898305084745764e-05, | |
| "loss": 1.3571, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.8385744234800838, | |
| "grad_norm": 0.8637228012084961, | |
| "learning_rate": 1.3728813559322034e-05, | |
| "loss": 1.5819, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.8469601677148848, | |
| "grad_norm": 0.671103298664093, | |
| "learning_rate": 1.3559322033898305e-05, | |
| "loss": 1.4916, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.8553459119496856, | |
| "grad_norm": 0.652407705783844, | |
| "learning_rate": 1.3389830508474577e-05, | |
| "loss": 1.4693, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.8637316561844863, | |
| "grad_norm": 0.6491547226905823, | |
| "learning_rate": 1.3220338983050848e-05, | |
| "loss": 1.4074, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.8721174004192873, | |
| "grad_norm": 0.6175271272659302, | |
| "learning_rate": 1.305084745762712e-05, | |
| "loss": 1.3958, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.880503144654088, | |
| "grad_norm": 0.6546741127967834, | |
| "learning_rate": 1.288135593220339e-05, | |
| "loss": 1.2658, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.7430203557014465, | |
| "learning_rate": 1.2711864406779661e-05, | |
| "loss": 1.5967, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.8972746331236898, | |
| "grad_norm": 0.7903656959533691, | |
| "learning_rate": 1.2542372881355932e-05, | |
| "loss": 1.3528, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.9056603773584906, | |
| "grad_norm": 0.712054967880249, | |
| "learning_rate": 1.2372881355932204e-05, | |
| "loss": 1.4618, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.9140461215932913, | |
| "grad_norm": 0.6519030332565308, | |
| "learning_rate": 1.2203389830508475e-05, | |
| "loss": 1.3854, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.9224318658280923, | |
| "grad_norm": 0.6560716032981873, | |
| "learning_rate": 1.2033898305084747e-05, | |
| "loss": 1.4835, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.930817610062893, | |
| "grad_norm": 0.61641526222229, | |
| "learning_rate": 1.1864406779661018e-05, | |
| "loss": 1.4169, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.9392033542976939, | |
| "grad_norm": 0.8207079172134399, | |
| "learning_rate": 1.169491525423729e-05, | |
| "loss": 1.5823, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.9475890985324948, | |
| "grad_norm": 0.6479889154434204, | |
| "learning_rate": 1.152542372881356e-05, | |
| "loss": 1.4131, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.9559748427672956, | |
| "grad_norm": 0.746671199798584, | |
| "learning_rate": 1.135593220338983e-05, | |
| "loss": 1.391, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.9643605870020964, | |
| "grad_norm": 0.7543257474899292, | |
| "learning_rate": 1.1186440677966102e-05, | |
| "loss": 1.5147, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.9727463312368974, | |
| "grad_norm": 0.6632611751556396, | |
| "learning_rate": 1.1016949152542374e-05, | |
| "loss": 1.5113, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.9811320754716981, | |
| "grad_norm": 0.6857608556747437, | |
| "learning_rate": 1.0847457627118644e-05, | |
| "loss": 1.3092, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.9895178197064989, | |
| "grad_norm": 0.6538596153259277, | |
| "learning_rate": 1.0677966101694915e-05, | |
| "loss": 1.2827, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.9979035639412999, | |
| "grad_norm": 0.6871718764305115, | |
| "learning_rate": 1.0508474576271186e-05, | |
| "loss": 1.4524, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.580600380897522, | |
| "learning_rate": 1.0338983050847458e-05, | |
| "loss": 1.3756, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.008385744234801, | |
| "grad_norm": 0.8496657609939575, | |
| "learning_rate": 1.016949152542373e-05, | |
| "loss": 1.5361, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.0167714884696015, | |
| "grad_norm": 0.6252016425132751, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2379, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.0251572327044025, | |
| "grad_norm": 0.5890762209892273, | |
| "learning_rate": 9.830508474576272e-06, | |
| "loss": 1.1365, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.0335429769392035, | |
| "grad_norm": 0.6594178080558777, | |
| "learning_rate": 9.661016949152544e-06, | |
| "loss": 1.4264, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.041928721174004, | |
| "grad_norm": 0.6303755640983582, | |
| "learning_rate": 9.491525423728814e-06, | |
| "loss": 1.5162, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.050314465408805, | |
| "grad_norm": 0.7321446537971497, | |
| "learning_rate": 9.322033898305085e-06, | |
| "loss": 1.5888, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.058700209643606, | |
| "grad_norm": 0.6928514838218689, | |
| "learning_rate": 9.152542372881356e-06, | |
| "loss": 1.5963, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.0670859538784065, | |
| "grad_norm": 0.7305393815040588, | |
| "learning_rate": 8.983050847457628e-06, | |
| "loss": 1.4996, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.0754716981132075, | |
| "grad_norm": 0.7094164490699768, | |
| "learning_rate": 8.8135593220339e-06, | |
| "loss": 1.3651, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.0838574423480085, | |
| "grad_norm": 0.6932939291000366, | |
| "learning_rate": 8.644067796610169e-06, | |
| "loss": 1.4837, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.092243186582809, | |
| "grad_norm": 0.846845269203186, | |
| "learning_rate": 8.47457627118644e-06, | |
| "loss": 1.5293, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.10062893081761, | |
| "grad_norm": 0.7128404974937439, | |
| "learning_rate": 8.305084745762712e-06, | |
| "loss": 1.3928, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.109014675052411, | |
| "grad_norm": 0.7099897861480713, | |
| "learning_rate": 8.135593220338983e-06, | |
| "loss": 1.396, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.1174004192872116, | |
| "grad_norm": 0.6866568922996521, | |
| "learning_rate": 7.966101694915253e-06, | |
| "loss": 1.5034, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.1257861635220126, | |
| "grad_norm": 0.583806574344635, | |
| "learning_rate": 7.796610169491526e-06, | |
| "loss": 1.2417, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.1341719077568135, | |
| "grad_norm": 0.6535069942474365, | |
| "learning_rate": 7.627118644067798e-06, | |
| "loss": 1.2934, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.142557651991614, | |
| "grad_norm": 0.6619601845741272, | |
| "learning_rate": 7.4576271186440685e-06, | |
| "loss": 1.2856, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.150943396226415, | |
| "grad_norm": 0.7087454795837402, | |
| "learning_rate": 7.28813559322034e-06, | |
| "loss": 1.3244, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.159329140461216, | |
| "grad_norm": 0.7019234895706177, | |
| "learning_rate": 7.1186440677966106e-06, | |
| "loss": 1.3269, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.1677148846960166, | |
| "grad_norm": 0.6695578694343567, | |
| "learning_rate": 6.949152542372882e-06, | |
| "loss": 1.2839, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.1761006289308176, | |
| "grad_norm": 0.6900045275688171, | |
| "learning_rate": 6.779661016949153e-06, | |
| "loss": 1.5439, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.1844863731656186, | |
| "grad_norm": 0.7736982107162476, | |
| "learning_rate": 6.610169491525424e-06, | |
| "loss": 1.5258, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.192872117400419, | |
| "grad_norm": 0.5855519771575928, | |
| "learning_rate": 6.440677966101695e-06, | |
| "loss": 1.1754, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.20125786163522, | |
| "grad_norm": 0.6449745893478394, | |
| "learning_rate": 6.271186440677966e-06, | |
| "loss": 1.4888, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.209643605870021, | |
| "grad_norm": 0.7780332565307617, | |
| "learning_rate": 6.101694915254238e-06, | |
| "loss": 1.5469, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.2180293501048216, | |
| "grad_norm": 0.6325747966766357, | |
| "learning_rate": 5.932203389830509e-06, | |
| "loss": 1.3144, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.2264150943396226, | |
| "grad_norm": 0.6543543338775635, | |
| "learning_rate": 5.76271186440678e-06, | |
| "loss": 1.1714, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 2.2348008385744236, | |
| "grad_norm": 0.8630987405776978, | |
| "learning_rate": 5.593220338983051e-06, | |
| "loss": 1.4851, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.243186582809224, | |
| "grad_norm": 0.7857372164726257, | |
| "learning_rate": 5.423728813559322e-06, | |
| "loss": 1.3268, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.251572327044025, | |
| "grad_norm": 0.7938205599784851, | |
| "learning_rate": 5.254237288135593e-06, | |
| "loss": 1.5033, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.259958071278826, | |
| "grad_norm": 0.6283496022224426, | |
| "learning_rate": 5.084745762711865e-06, | |
| "loss": 1.2449, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.268343815513627, | |
| "grad_norm": 0.7021183967590332, | |
| "learning_rate": 4.915254237288136e-06, | |
| "loss": 1.3424, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.2767295597484276, | |
| "grad_norm": 0.730631411075592, | |
| "learning_rate": 4.745762711864407e-06, | |
| "loss": 1.3327, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.2851153039832286, | |
| "grad_norm": 0.6509723663330078, | |
| "learning_rate": 4.576271186440678e-06, | |
| "loss": 1.1817, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.2935010482180296, | |
| "grad_norm": 0.6313263177871704, | |
| "learning_rate": 4.40677966101695e-06, | |
| "loss": 1.3395, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.30188679245283, | |
| "grad_norm": 0.7210220694541931, | |
| "learning_rate": 4.23728813559322e-06, | |
| "loss": 1.5118, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.310272536687631, | |
| "grad_norm": 0.698341429233551, | |
| "learning_rate": 4.067796610169492e-06, | |
| "loss": 1.4284, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.318658280922432, | |
| "grad_norm": 0.6756731271743774, | |
| "learning_rate": 3.898305084745763e-06, | |
| "loss": 1.5066, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.3270440251572326, | |
| "grad_norm": 0.6834630370140076, | |
| "learning_rate": 3.7288135593220342e-06, | |
| "loss": 1.476, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.3354297693920336, | |
| "grad_norm": 0.715414822101593, | |
| "learning_rate": 3.5593220338983053e-06, | |
| "loss": 1.4063, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.3438155136268346, | |
| "grad_norm": 0.6956151723861694, | |
| "learning_rate": 3.3898305084745763e-06, | |
| "loss": 1.412, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.352201257861635, | |
| "grad_norm": 0.6213716268539429, | |
| "learning_rate": 3.2203389830508473e-06, | |
| "loss": 1.2284, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.360587002096436, | |
| "grad_norm": 0.7275508642196655, | |
| "learning_rate": 3.050847457627119e-06, | |
| "loss": 1.4907, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.368972746331237, | |
| "grad_norm": 0.672480046749115, | |
| "learning_rate": 2.88135593220339e-06, | |
| "loss": 1.3039, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.3773584905660377, | |
| "grad_norm": 0.7390619516372681, | |
| "learning_rate": 2.711864406779661e-06, | |
| "loss": 1.452, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.3857442348008386, | |
| "grad_norm": 0.6363676190376282, | |
| "learning_rate": 2.5423728813559323e-06, | |
| "loss": 1.264, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.3941299790356396, | |
| "grad_norm": 0.7060114145278931, | |
| "learning_rate": 2.3728813559322034e-06, | |
| "loss": 1.438, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.40251572327044, | |
| "grad_norm": 0.7109473347663879, | |
| "learning_rate": 2.203389830508475e-06, | |
| "loss": 1.4074, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.410901467505241, | |
| "grad_norm": 0.7845531105995178, | |
| "learning_rate": 2.033898305084746e-06, | |
| "loss": 1.513, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.419287211740042, | |
| "grad_norm": 0.7582221627235413, | |
| "learning_rate": 1.8644067796610171e-06, | |
| "loss": 1.475, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.4276729559748427, | |
| "grad_norm": 0.7518870234489441, | |
| "learning_rate": 1.6949152542372882e-06, | |
| "loss": 1.5362, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.4360587002096437, | |
| "grad_norm": 0.7295182347297668, | |
| "learning_rate": 1.5254237288135594e-06, | |
| "loss": 1.3987, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 0.7670787572860718, | |
| "learning_rate": 1.3559322033898304e-06, | |
| "loss": 1.4496, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.452830188679245, | |
| "grad_norm": 0.7006129026412964, | |
| "learning_rate": 1.1864406779661017e-06, | |
| "loss": 1.366, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.461215932914046, | |
| "grad_norm": 0.6317689418792725, | |
| "learning_rate": 1.016949152542373e-06, | |
| "loss": 1.3638, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.469601677148847, | |
| "grad_norm": 0.6305463910102844, | |
| "learning_rate": 8.474576271186441e-07, | |
| "loss": 1.166, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.4779874213836477, | |
| "grad_norm": 0.7784201502799988, | |
| "learning_rate": 6.779661016949152e-07, | |
| "loss": 1.4706, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.4863731656184487, | |
| "grad_norm": 0.7264308333396912, | |
| "learning_rate": 5.084745762711865e-07, | |
| "loss": 1.4284, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.4947589098532497, | |
| "grad_norm": 0.7306190133094788, | |
| "learning_rate": 3.389830508474576e-07, | |
| "loss": 1.4214, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.50314465408805, | |
| "grad_norm": 0.743761420249939, | |
| "learning_rate": 1.694915254237288e-07, | |
| "loss": 1.5255, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.511530398322851, | |
| "grad_norm": 0.699112594127655, | |
| "learning_rate": 0.0, | |
| "loss": 1.4776, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 300, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.114333412565627e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |