| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.899521531100478, |
| "eval_steps": 500, |
| "global_step": 312, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.019138755980861243, |
| "grad_norm": 3.7146408557891846, |
| "learning_rate": 2.2222222222222223e-05, |
| "loss": 4.4869, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.03827751196172249, |
| "grad_norm": 3.3118133544921875, |
| "learning_rate": 4.4444444444444447e-05, |
| "loss": 4.1867, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.05741626794258373, |
| "grad_norm": 2.972708225250244, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 4.001, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.07655502392344497, |
| "grad_norm": 4.938202381134033, |
| "learning_rate": 8.888888888888889e-05, |
| "loss": 5.0582, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.09569377990430622, |
| "grad_norm": 3.5732812881469727, |
| "learning_rate": 0.00011111111111111112, |
| "loss": 4.5871, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.11483253588516747, |
| "grad_norm": 3.350315570831299, |
| "learning_rate": 0.00013333333333333334, |
| "loss": 4.0071, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.1339712918660287, |
| "grad_norm": 3.4415643215179443, |
| "learning_rate": 0.00015555555555555556, |
| "loss": 3.9791, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.15311004784688995, |
| "grad_norm": 2.558781385421753, |
| "learning_rate": 0.00017777777777777779, |
| "loss": 3.6497, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.1722488038277512, |
| "grad_norm": 2.3021087646484375, |
| "learning_rate": 0.0002, |
| "loss": 3.5205, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.19138755980861244, |
| "grad_norm": 2.301999568939209, |
| "learning_rate": 0.00019999462497359466, |
| "loss": 4.112, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.21052631578947367, |
| "grad_norm": 3.0552637577056885, |
| "learning_rate": 0.0001999785004721968, |
| "loss": 3.8723, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.22966507177033493, |
| "grad_norm": 2.5972537994384766, |
| "learning_rate": 0.00019995162822919883, |
| "loss": 3.8135, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.24880382775119617, |
| "grad_norm": 2.0281920433044434, |
| "learning_rate": 0.00019991401113338104, |
| "loss": 3.8702, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.2679425837320574, |
| "grad_norm": 1.7147849798202515, |
| "learning_rate": 0.00019986565322860115, |
| "loss": 3.463, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.28708133971291866, |
| "grad_norm": 2.082582473754883, |
| "learning_rate": 0.00019980655971335945, |
| "loss": 3.3816, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.3062200956937799, |
| "grad_norm": 2.1299426555633545, |
| "learning_rate": 0.00019973673694024, |
| "loss": 3.698, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.3253588516746411, |
| "grad_norm": 1.8626389503479004, |
| "learning_rate": 0.0001996561924152278, |
| "loss": 3.3583, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.3444976076555024, |
| "grad_norm": 2.452871322631836, |
| "learning_rate": 0.0001995649347969019, |
| "loss": 3.4957, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 2.265108585357666, |
| "learning_rate": 0.00019946297389550433, |
| "loss": 3.1115, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.3827751196172249, |
| "grad_norm": 1.996728777885437, |
| "learning_rate": 0.0001993503206718859, |
| "loss": 3.4159, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.4019138755980861, |
| "grad_norm": 1.913594365119934, |
| "learning_rate": 0.00019922698723632767, |
| "loss": 3.3288, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.42105263157894735, |
| "grad_norm": 2.4316132068634033, |
| "learning_rate": 0.00019909298684723904, |
| "loss": 3.4245, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.44019138755980863, |
| "grad_norm": 1.998693823814392, |
| "learning_rate": 0.00019894833390973266, |
| "loss": 3.1687, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.45933014354066987, |
| "grad_norm": 2.21382737159729, |
| "learning_rate": 0.0001987930439740757, |
| "loss": 3.4307, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.4784688995215311, |
| "grad_norm": 2.586013078689575, |
| "learning_rate": 0.0001986271337340182, |
| "loss": 3.3596, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.49760765550239233, |
| "grad_norm": 2.8244550228118896, |
| "learning_rate": 0.0001984506210249986, |
| "loss": 3.3107, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.5167464114832536, |
| "grad_norm": 2.0228700637817383, |
| "learning_rate": 0.00019826352482222638, |
| "loss": 3.1749, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.5358851674641149, |
| "grad_norm": 2.7035820484161377, |
| "learning_rate": 0.0001980658652386421, |
| "loss": 3.0995, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.5550239234449761, |
| "grad_norm": 2.119741916656494, |
| "learning_rate": 0.00019785766352275542, |
| "loss": 3.225, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5741626794258373, |
| "grad_norm": 2.5071310997009277, |
| "learning_rate": 0.00019763894205636072, |
| "loss": 3.0066, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.5933014354066986, |
| "grad_norm": 2.992201566696167, |
| "learning_rate": 0.00019740972435213115, |
| "loss": 3.0412, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.6124401913875598, |
| "grad_norm": 2.820875883102417, |
| "learning_rate": 0.00019717003505109095, |
| "loss": 3.3575, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.631578947368421, |
| "grad_norm": 2.7096059322357178, |
| "learning_rate": 0.00019691989991996663, |
| "loss": 3.3531, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.6507177033492823, |
| "grad_norm": 2.172783374786377, |
| "learning_rate": 0.00019665934584841682, |
| "loss": 3.2852, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.6698564593301436, |
| "grad_norm": 3.238025188446045, |
| "learning_rate": 0.00019638840084614182, |
| "loss": 3.3818, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.6889952153110048, |
| "grad_norm": 2.92851185798645, |
| "learning_rate": 0.00019610709403987246, |
| "loss": 3.1, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.7081339712918661, |
| "grad_norm": 2.514800786972046, |
| "learning_rate": 0.000195815455670239, |
| "loss": 3.0883, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 3.264613151550293, |
| "learning_rate": 0.0001955135170885202, |
| "loss": 2.9932, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.7464114832535885, |
| "grad_norm": 2.4111247062683105, |
| "learning_rate": 0.00019520131075327298, |
| "loss": 2.974, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.7655502392344498, |
| "grad_norm": 2.692473888397217, |
| "learning_rate": 0.00019487887022684336, |
| "loss": 2.8822, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.784688995215311, |
| "grad_norm": 3.3863365650177, |
| "learning_rate": 0.00019454623017175812, |
| "loss": 3.0255, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.8038277511961722, |
| "grad_norm": 2.2267720699310303, |
| "learning_rate": 0.0001942034263469989, |
| "loss": 3.1883, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.8229665071770335, |
| "grad_norm": 2.31858491897583, |
| "learning_rate": 0.00019385049560415794, |
| "loss": 2.9882, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 2.3098323345184326, |
| "learning_rate": 0.00019348747588347637, |
| "loss": 2.8801, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.861244019138756, |
| "grad_norm": 3.3286585807800293, |
| "learning_rate": 0.00019311440620976597, |
| "loss": 2.9762, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.8803827751196173, |
| "grad_norm": 3.1082146167755127, |
| "learning_rate": 0.00019273132668821364, |
| "loss": 2.8514, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.8995215311004785, |
| "grad_norm": 2.2908411026000977, |
| "learning_rate": 0.00019233827850007027, |
| "loss": 3.2993, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.9186602870813397, |
| "grad_norm": 2.1068387031555176, |
| "learning_rate": 0.00019193530389822363, |
| "loss": 3.0606, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.937799043062201, |
| "grad_norm": 2.951885938644409, |
| "learning_rate": 0.0001915224462026563, |
| "loss": 3.042, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.9569377990430622, |
| "grad_norm": 2.2476351261138916, |
| "learning_rate": 0.0001910997497957885, |
| "loss": 2.9928, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.9760765550239234, |
| "grad_norm": 1.9801242351531982, |
| "learning_rate": 0.00019066726011770726, |
| "loss": 2.8911, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.9952153110047847, |
| "grad_norm": 2.5246548652648926, |
| "learning_rate": 0.00019022502366128135, |
| "loss": 3.2457, |
| "step": 52 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 5.682666778564453, |
| "learning_rate": 0.0001897730879671634, |
| "loss": 2.4435, |
| "step": 53 |
| }, |
| { |
| "epoch": 1.0191387559808613, |
| "grad_norm": 2.66831374168396, |
| "learning_rate": 0.00018931150161867916, |
| "loss": 2.7807, |
| "step": 54 |
| }, |
| { |
| "epoch": 1.0382775119617225, |
| "grad_norm": 2.5246026515960693, |
| "learning_rate": 0.0001888403142366049, |
| "loss": 2.7599, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0574162679425838, |
| "grad_norm": 1.959625244140625, |
| "learning_rate": 0.00018835957647383303, |
| "loss": 2.9087, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.076555023923445, |
| "grad_norm": 2.277261257171631, |
| "learning_rate": 0.00018786934000992688, |
| "loss": 3.2283, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.0956937799043063, |
| "grad_norm": 3.0258898735046387, |
| "learning_rate": 0.00018736965754556528, |
| "loss": 3.0084, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.1148325358851674, |
| "grad_norm": 2.4277517795562744, |
| "learning_rate": 0.00018686058279687698, |
| "loss": 2.7159, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.1339712918660287, |
| "grad_norm": 3.0732321739196777, |
| "learning_rate": 0.00018634217048966637, |
| "loss": 2.9704, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.1531100478468899, |
| "grad_norm": 3.0256996154785156, |
| "learning_rate": 0.0001858144763535302, |
| "loss": 3.0254, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.1722488038277512, |
| "grad_norm": 2.7575695514678955, |
| "learning_rate": 0.00018527755711586678, |
| "loss": 2.4907, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.1913875598086126, |
| "grad_norm": 2.813037157058716, |
| "learning_rate": 0.00018473147049577774, |
| "loss": 2.8598, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.2105263157894737, |
| "grad_norm": 2.197244644165039, |
| "learning_rate": 0.00018417627519786315, |
| "loss": 2.965, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.229665071770335, |
| "grad_norm": 2.0711350440979004, |
| "learning_rate": 0.00018361203090591071, |
| "loss": 2.9582, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.2488038277511961, |
| "grad_norm": 2.7295780181884766, |
| "learning_rate": 0.00018303879827647975, |
| "loss": 2.5148, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.2679425837320575, |
| "grad_norm": 2.511603593826294, |
| "learning_rate": 0.00018245663893238075, |
| "loss": 2.7535, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.2870813397129186, |
| "grad_norm": 3.695086717605591, |
| "learning_rate": 0.00018186561545605054, |
| "loss": 2.6003, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.30622009569378, |
| "grad_norm": 3.2395761013031006, |
| "learning_rate": 0.00018126579138282503, |
| "loss": 2.7334, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.325358851674641, |
| "grad_norm": 3.004142999649048, |
| "learning_rate": 0.00018065723119410884, |
| "loss": 2.7788, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.3444976076555024, |
| "grad_norm": 2.964301824569702, |
| "learning_rate": 0.0001800400003104436, |
| "loss": 2.734, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.3636363636363638, |
| "grad_norm": 3.981093645095825, |
| "learning_rate": 0.00017941416508447536, |
| "loss": 2.7476, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.3827751196172249, |
| "grad_norm": 3.2536420822143555, |
| "learning_rate": 0.00017877979279382135, |
| "loss": 2.5386, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.401913875598086, |
| "grad_norm": 3.6163337230682373, |
| "learning_rate": 0.0001781369516338378, |
| "loss": 2.6767, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.4210526315789473, |
| "grad_norm": 3.6883926391601562, |
| "learning_rate": 0.000177485710710289, |
| "loss": 2.5656, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.4401913875598087, |
| "grad_norm": 3.5389018058776855, |
| "learning_rate": 0.00017682614003191807, |
| "loss": 2.6626, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.4593301435406698, |
| "grad_norm": 2.324506998062134, |
| "learning_rate": 0.0001761583105029213, |
| "loss": 2.6479, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.4784688995215312, |
| "grad_norm": 2.271515130996704, |
| "learning_rate": 0.00017548229391532572, |
| "loss": 2.874, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.4976076555023923, |
| "grad_norm": 3.023533821105957, |
| "learning_rate": 0.00017479816294127152, |
| "loss": 2.4017, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.5167464114832536, |
| "grad_norm": 4.101243495941162, |
| "learning_rate": 0.0001741059911251997, |
| "loss": 3.0185, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.535885167464115, |
| "grad_norm": 3.056877374649048, |
| "learning_rate": 0.00017340585287594604, |
| "loss": 2.7875, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.555023923444976, |
| "grad_norm": 3.0255823135375977, |
| "learning_rate": 0.00017269782345874203, |
| "loss": 2.8453, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.5741626794258372, |
| "grad_norm": 3.57423734664917, |
| "learning_rate": 0.00017198197898712404, |
| "loss": 2.6948, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.5933014354066986, |
| "grad_norm": 3.436167001724243, |
| "learning_rate": 0.00017125839641475072, |
| "loss": 2.6287, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.61244019138756, |
| "grad_norm": 3.1058871746063232, |
| "learning_rate": 0.00017052715352713075, |
| "loss": 2.5887, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.631578947368421, |
| "grad_norm": 2.1073200702667236, |
| "learning_rate": 0.00016978832893326074, |
| "loss": 2.9573, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.6507177033492821, |
| "grad_norm": 2.8039920330047607, |
| "learning_rate": 0.0001690420020571747, |
| "loss": 2.9652, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.6698564593301435, |
| "grad_norm": 2.8494677543640137, |
| "learning_rate": 0.00016828825312940592, |
| "loss": 2.6263, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.6889952153110048, |
| "grad_norm": 2.3521246910095215, |
| "learning_rate": 0.00016752716317836229, |
| "loss": 2.683, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.7081339712918662, |
| "grad_norm": 2.5750181674957275, |
| "learning_rate": 0.00016675881402161536, |
| "loss": 2.611, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.7272727272727273, |
| "grad_norm": 2.687619924545288, |
| "learning_rate": 0.00016598328825710533, |
| "loss": 2.523, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.7464114832535884, |
| "grad_norm": 3.112954616546631, |
| "learning_rate": 0.00016520066925426144, |
| "loss": 2.6558, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.7655502392344498, |
| "grad_norm": 3.4932713508605957, |
| "learning_rate": 0.0001644110411450398, |
| "loss": 2.6962, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.784688995215311, |
| "grad_norm": 2.564894437789917, |
| "learning_rate": 0.00016361448881487914, |
| "loss": 2.8202, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.8038277511961722, |
| "grad_norm": 3.1496503353118896, |
| "learning_rate": 0.0001628110978935756, |
| "loss": 2.2734, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.8229665071770333, |
| "grad_norm": 2.6274123191833496, |
| "learning_rate": 0.00016200095474607753, |
| "loss": 2.5264, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 2.374180555343628, |
| "learning_rate": 0.0001611841464632011, |
| "loss": 2.8034, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.861244019138756, |
| "grad_norm": 2.691254138946533, |
| "learning_rate": 0.00016036076085226814, |
| "loss": 2.5935, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.8803827751196174, |
| "grad_norm": 2.9795515537261963, |
| "learning_rate": 0.0001595308864276666, |
| "loss": 2.8707, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.8995215311004785, |
| "grad_norm": 3.1781864166259766, |
| "learning_rate": 0.0001586946124013354, |
| "loss": 2.458, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.9186602870813396, |
| "grad_norm": 2.8759453296661377, |
| "learning_rate": 0.00015785202867317407, |
| "loss": 2.5201, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.937799043062201, |
| "grad_norm": 3.2317118644714355, |
| "learning_rate": 0.00015700322582137827, |
| "loss": 2.5585, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.9569377990430623, |
| "grad_norm": 3.463688373565674, |
| "learning_rate": 0.0001561482950927029, |
| "loss": 2.4652, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.9760765550239234, |
| "grad_norm": 2.4766316413879395, |
| "learning_rate": 0.00015528732839265272, |
| "loss": 2.5966, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.9952153110047846, |
| "grad_norm": 2.8042709827423096, |
| "learning_rate": 0.00015442041827560274, |
| "loss": 2.5278, |
| "step": 105 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 8.298028945922852, |
| "learning_rate": 0.00015354765793484834, |
| "loss": 2.8732, |
| "step": 106 |
| }, |
| { |
| "epoch": 2.0191387559808613, |
| "grad_norm": 3.808393716812134, |
| "learning_rate": 0.000152669141192587, |
| "loss": 2.1442, |
| "step": 107 |
| }, |
| { |
| "epoch": 2.0382775119617227, |
| "grad_norm": 3.3381223678588867, |
| "learning_rate": 0.00015178496248983254, |
| "loss": 2.6125, |
| "step": 108 |
| }, |
| { |
| "epoch": 2.0574162679425836, |
| "grad_norm": 4.778241157531738, |
| "learning_rate": 0.00015089521687626243, |
| "loss": 2.399, |
| "step": 109 |
| }, |
| { |
| "epoch": 2.076555023923445, |
| "grad_norm": 2.613919973373413, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 2.5189, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.0956937799043063, |
| "grad_norm": 3.6656932830810547, |
| "learning_rate": 0.00014909940809733222, |
| "loss": 2.2785, |
| "step": 111 |
| }, |
| { |
| "epoch": 2.1148325358851676, |
| "grad_norm": 2.968078136444092, |
| "learning_rate": 0.00014819353798236427, |
| "loss": 2.3605, |
| "step": 112 |
| }, |
| { |
| "epoch": 2.1339712918660285, |
| "grad_norm": 2.7252070903778076, |
| "learning_rate": 0.00014728248703661182, |
| "loss": 2.173, |
| "step": 113 |
| }, |
| { |
| "epoch": 2.15311004784689, |
| "grad_norm": 3.9389491081237793, |
| "learning_rate": 0.00014636635319853275, |
| "loss": 2.457, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.172248803827751, |
| "grad_norm": 3.658862590789795, |
| "learning_rate": 0.00014544523495299842, |
| "loss": 2.6971, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.1913875598086126, |
| "grad_norm": 3.303403377532959, |
| "learning_rate": 0.0001445192313207067, |
| "loss": 2.7851, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.2105263157894735, |
| "grad_norm": 3.910428047180176, |
| "learning_rate": 0.00014358844184753712, |
| "loss": 2.1422, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.229665071770335, |
| "grad_norm": 3.3043367862701416, |
| "learning_rate": 0.00014265296659384956, |
| "loss": 2.5404, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.248803827751196, |
| "grad_norm": 2.9098987579345703, |
| "learning_rate": 0.0001417129061237278, |
| "loss": 2.567, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.2679425837320575, |
| "grad_norm": 4.142232894897461, |
| "learning_rate": 0.00014076836149416887, |
| "loss": 2.4179, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.287081339712919, |
| "grad_norm": 2.110104560852051, |
| "learning_rate": 0.00013981943424421932, |
| "loss": 2.4976, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.3062200956937797, |
| "grad_norm": 2.6828229427337646, |
| "learning_rate": 0.00013886622638405952, |
| "loss": 2.5762, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.325358851674641, |
| "grad_norm": 3.0066471099853516, |
| "learning_rate": 0.00013790884038403795, |
| "loss": 2.2882, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.3444976076555024, |
| "grad_norm": 3.791444778442383, |
| "learning_rate": 0.00013694737916365517, |
| "loss": 2.1788, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.3636363636363638, |
| "grad_norm": 2.78275203704834, |
| "learning_rate": 0.0001359819460805001, |
| "loss": 2.6037, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.382775119617225, |
| "grad_norm": 4.18953275680542, |
| "learning_rate": 0.00013501264491913906, |
| "loss": 2.3284, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.401913875598086, |
| "grad_norm": 2.925140142440796, |
| "learning_rate": 0.00013403957987995882, |
| "loss": 2.4364, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.4210526315789473, |
| "grad_norm": 4.545037746429443, |
| "learning_rate": 0.00013306285556796495, |
| "loss": 2.4096, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.4401913875598087, |
| "grad_norm": 3.785428524017334, |
| "learning_rate": 0.00013208257698153677, |
| "loss": 2.1047, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.45933014354067, |
| "grad_norm": 3.6228346824645996, |
| "learning_rate": 0.00013109884950114007, |
| "loss": 2.5744, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.478468899521531, |
| "grad_norm": 2.9221742153167725, |
| "learning_rate": 0.00013011177887799845, |
| "loss": 2.5266, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.4976076555023923, |
| "grad_norm": 3.659484386444092, |
| "learning_rate": 0.00012912147122272523, |
| "loss": 2.3707, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.5167464114832536, |
| "grad_norm": 3.5442514419555664, |
| "learning_rate": 0.00012812803299391628, |
| "loss": 2.4695, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.535885167464115, |
| "grad_norm": 3.1291420459747314, |
| "learning_rate": 0.0001271315709867059, |
| "loss": 2.687, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.555023923444976, |
| "grad_norm": 4.138225078582764, |
| "learning_rate": 0.00012613219232128608, |
| "loss": 2.2378, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.574162679425837, |
| "grad_norm": 2.8483548164367676, |
| "learning_rate": 0.00012513000443139112, |
| "loss": 2.4044, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.5933014354066986, |
| "grad_norm": 2.434741497039795, |
| "learning_rate": 0.00012412511505274844, |
| "loss": 2.5664, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.61244019138756, |
| "grad_norm": 3.9319725036621094, |
| "learning_rate": 0.000123117632211497, |
| "loss": 2.2586, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 3.4802486896514893, |
| "learning_rate": 0.0001221076642125742, |
| "loss": 2.0743, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.650717703349282, |
| "grad_norm": 3.1535286903381348, |
| "learning_rate": 0.00012109531962807332, |
| "loss": 2.302, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.6698564593301435, |
| "grad_norm": 2.9818458557128906, |
| "learning_rate": 0.00012008070728557186, |
| "loss": 2.4418, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.688995215311005, |
| "grad_norm": 4.8768630027771, |
| "learning_rate": 0.00011906393625643244, |
| "loss": 2.5083, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.708133971291866, |
| "grad_norm": 3.8520619869232178, |
| "learning_rate": 0.00011804511584407763, |
| "loss": 1.9994, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 3.784248113632202, |
| "learning_rate": 0.00011702435557223987, |
| "loss": 2.2376, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.7464114832535884, |
| "grad_norm": 4.1650800704956055, |
| "learning_rate": 0.00011600176517318741, |
| "loss": 2.3886, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.7655502392344498, |
| "grad_norm": 4.099468231201172, |
| "learning_rate": 0.00011497745457592816, |
| "loss": 2.5978, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.784688995215311, |
| "grad_norm": 4.268674850463867, |
| "learning_rate": 0.00011395153389439233, |
| "loss": 2.4882, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.803827751196172, |
| "grad_norm": 4.081464767456055, |
| "learning_rate": 0.0001129241134155949, |
| "loss": 2.6547, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.8229665071770333, |
| "grad_norm": 3.1537716388702393, |
| "learning_rate": 0.00011189530358778005, |
| "loss": 2.5361, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.8421052631578947, |
| "grad_norm": 4.182295322418213, |
| "learning_rate": 0.00011086521500854745, |
| "loss": 2.385, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.861244019138756, |
| "grad_norm": 2.5511474609375, |
| "learning_rate": 0.00010983395841296348, |
| "loss": 2.4617, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.8803827751196174, |
| "grad_norm": 3.1007962226867676, |
| "learning_rate": 0.00010880164466165674, |
| "loss": 2.5788, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.8995215311004783, |
| "grad_norm": 4.509490966796875, |
| "learning_rate": 0.00010776838472890065, |
| "loss": 2.1361, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.9186602870813396, |
| "grad_norm": 2.6765851974487305, |
| "learning_rate": 0.00010673428969068364, |
| "loss": 2.3922, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.937799043062201, |
| "grad_norm": 3.704310894012451, |
| "learning_rate": 0.00010569947071276847, |
| "loss": 2.6924, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.9569377990430623, |
| "grad_norm": 3.935804843902588, |
| "learning_rate": 0.00010466403903874176, |
| "loss": 2.2886, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.9760765550239237, |
| "grad_norm": 4.105613708496094, |
| "learning_rate": 0.00010362810597805526, |
| "loss": 2.3865, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.9952153110047846, |
| "grad_norm": 3.669766664505005, |
| "learning_rate": 0.00010259178289406011, |
| "loss": 2.2158, |
| "step": 158 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 8.930411338806152, |
| "learning_rate": 0.0001015551811920351, |
| "loss": 2.5214, |
| "step": 159 |
| }, |
| { |
| "epoch": 3.0191387559808613, |
| "grad_norm": 3.3217484951019287, |
| "learning_rate": 0.00010051841230721065, |
| "loss": 2.5409, |
| "step": 160 |
| }, |
| { |
| "epoch": 3.0382775119617227, |
| "grad_norm": 3.8041253089904785, |
| "learning_rate": 9.948158769278939e-05, |
| "loss": 1.8309, |
| "step": 161 |
| }, |
| { |
| "epoch": 3.0574162679425836, |
| "grad_norm": 3.892636775970459, |
| "learning_rate": 9.844481880796491e-05, |
| "loss": 2.0955, |
| "step": 162 |
| }, |
| { |
| "epoch": 3.076555023923445, |
| "grad_norm": 3.4822261333465576, |
| "learning_rate": 9.740821710593989e-05, |
| "loss": 2.2865, |
| "step": 163 |
| }, |
| { |
| "epoch": 3.0956937799043063, |
| "grad_norm": 3.033822774887085, |
| "learning_rate": 9.637189402194476e-05, |
| "loss": 2.3693, |
| "step": 164 |
| }, |
| { |
| "epoch": 3.1148325358851676, |
| "grad_norm": 3.693204641342163, |
| "learning_rate": 9.533596096125825e-05, |
| "loss": 1.984, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.1339712918660285, |
| "grad_norm": 3.3877508640289307, |
| "learning_rate": 9.430052928723153e-05, |
| "loss": 2.0891, |
| "step": 166 |
| }, |
| { |
| "epoch": 3.15311004784689, |
| "grad_norm": 4.376189708709717, |
| "learning_rate": 9.326571030931637e-05, |
| "loss": 2.1974, |
| "step": 167 |
| }, |
| { |
| "epoch": 3.172248803827751, |
| "grad_norm": 3.557032823562622, |
| "learning_rate": 9.223161527109937e-05, |
| "loss": 1.9757, |
| "step": 168 |
| }, |
| { |
| "epoch": 3.1913875598086126, |
| "grad_norm": 2.733353853225708, |
| "learning_rate": 9.119835533834331e-05, |
| "loss": 2.4171, |
| "step": 169 |
| }, |
| { |
| "epoch": 3.2105263157894735, |
| "grad_norm": 2.7016165256500244, |
| "learning_rate": 9.016604158703654e-05, |
| "loss": 2.2992, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.229665071770335, |
| "grad_norm": 3.997654438018799, |
| "learning_rate": 8.913478499145254e-05, |
| "loss": 2.1117, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.248803827751196, |
| "grad_norm": 4.044878005981445, |
| "learning_rate": 8.810469641222001e-05, |
| "loss": 2.245, |
| "step": 172 |
| }, |
| { |
| "epoch": 3.2679425837320575, |
| "grad_norm": 3.080991506576538, |
| "learning_rate": 8.707588658440511e-05, |
| "loss": 2.1612, |
| "step": 173 |
| }, |
| { |
| "epoch": 3.287081339712919, |
| "grad_norm": 3.295807123184204, |
| "learning_rate": 8.604846610560771e-05, |
| "loss": 2.3217, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.3062200956937797, |
| "grad_norm": 3.5904176235198975, |
| "learning_rate": 8.502254542407186e-05, |
| "loss": 2.3138, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.325358851674641, |
| "grad_norm": 4.395754814147949, |
| "learning_rate": 8.399823482681262e-05, |
| "loss": 1.9074, |
| "step": 176 |
| }, |
| { |
| "epoch": 3.3444976076555024, |
| "grad_norm": 3.2221572399139404, |
| "learning_rate": 8.297564442776014e-05, |
| "loss": 2.2977, |
| "step": 177 |
| }, |
| { |
| "epoch": 3.3636363636363638, |
| "grad_norm": 2.9927215576171875, |
| "learning_rate": 8.195488415592238e-05, |
| "loss": 2.3785, |
| "step": 178 |
| }, |
| { |
| "epoch": 3.382775119617225, |
| "grad_norm": 3.9036011695861816, |
| "learning_rate": 8.093606374356759e-05, |
| "loss": 1.9962, |
| "step": 179 |
| }, |
| { |
| "epoch": 3.401913875598086, |
| "grad_norm": 4.485937595367432, |
| "learning_rate": 7.991929271442817e-05, |
| "loss": 1.7251, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.4210526315789473, |
| "grad_norm": 4.750828742980957, |
| "learning_rate": 7.89046803719267e-05, |
| "loss": 2.1263, |
| "step": 181 |
| }, |
| { |
| "epoch": 3.4401913875598087, |
| "grad_norm": 4.138678550720215, |
| "learning_rate": 7.789233578742582e-05, |
| "loss": 2.0091, |
| "step": 182 |
| }, |
| { |
| "epoch": 3.45933014354067, |
| "grad_norm": 3.6726274490356445, |
| "learning_rate": 7.688236778850306e-05, |
| "loss": 2.3806, |
| "step": 183 |
| }, |
| { |
| "epoch": 3.478468899521531, |
| "grad_norm": 4.481295108795166, |
| "learning_rate": 7.587488494725157e-05, |
| "loss": 2.1338, |
| "step": 184 |
| }, |
| { |
| "epoch": 3.4976076555023923, |
| "grad_norm": 3.9401016235351562, |
| "learning_rate": 7.48699955686089e-05, |
| "loss": 2.1403, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.5167464114832536, |
| "grad_norm": 4.227544784545898, |
| "learning_rate": 7.386780767871397e-05, |
| "loss": 2.3207, |
| "step": 186 |
| }, |
| { |
| "epoch": 3.535885167464115, |
| "grad_norm": 3.4885573387145996, |
| "learning_rate": 7.286842901329412e-05, |
| "loss": 2.2671, |
| "step": 187 |
| }, |
| { |
| "epoch": 3.555023923444976, |
| "grad_norm": 4.438218593597412, |
| "learning_rate": 7.187196700608373e-05, |
| "loss": 2.0748, |
| "step": 188 |
| }, |
| { |
| "epoch": 3.574162679425837, |
| "grad_norm": 3.766284465789795, |
| "learning_rate": 7.087852877727481e-05, |
| "loss": 2.5101, |
| "step": 189 |
| }, |
| { |
| "epoch": 3.5933014354066986, |
| "grad_norm": 4.027716636657715, |
| "learning_rate": 6.988822112200156e-05, |
| "loss": 2.3361, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.61244019138756, |
| "grad_norm": 4.409999370574951, |
| "learning_rate": 6.890115049885994e-05, |
| "loss": 2.2492, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.6315789473684212, |
| "grad_norm": 3.596459150314331, |
| "learning_rate": 6.791742301846326e-05, |
| "loss": 2.2855, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.650717703349282, |
| "grad_norm": 4.667017459869385, |
| "learning_rate": 6.693714443203507e-05, |
| "loss": 2.083, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.6698564593301435, |
| "grad_norm": 4.831173896789551, |
| "learning_rate": 6.59604201200412e-05, |
| "loss": 2.1568, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.688995215311005, |
| "grad_norm": 3.5013201236724854, |
| "learning_rate": 6.498735508086093e-05, |
| "loss": 2.108, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.708133971291866, |
| "grad_norm": 4.176932334899902, |
| "learning_rate": 6.40180539194999e-05, |
| "loss": 1.8315, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.7272727272727275, |
| "grad_norm": 5.187565803527832, |
| "learning_rate": 6.305262083634488e-05, |
| "loss": 2.3541, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.7464114832535884, |
| "grad_norm": 4.090083599090576, |
| "learning_rate": 6.209115961596208e-05, |
| "loss": 2.0691, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.7655502392344498, |
| "grad_norm": 3.806030750274658, |
| "learning_rate": 6.113377361594049e-05, |
| "loss": 2.0471, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.784688995215311, |
| "grad_norm": 4.668728828430176, |
| "learning_rate": 6.018056575578075e-05, |
| "loss": 2.335, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.803827751196172, |
| "grad_norm": 4.811546325683594, |
| "learning_rate": 5.923163850583113e-05, |
| "loss": 2.2242, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.8229665071770333, |
| "grad_norm": 5.359763145446777, |
| "learning_rate": 5.828709387627218e-05, |
| "loss": 2.3298, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.8421052631578947, |
| "grad_norm": 3.5501046180725098, |
| "learning_rate": 5.73470334061505e-05, |
| "loss": 2.1297, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.861244019138756, |
| "grad_norm": 3.4878952503204346, |
| "learning_rate": 5.6411558152462894e-05, |
| "loss": 2.3615, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.8803827751196174, |
| "grad_norm": 4.381737232208252, |
| "learning_rate": 5.54807686792933e-05, |
| "loss": 2.0084, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.8995215311004783, |
| "grad_norm": 5.2298359870910645, |
| "learning_rate": 5.4554765047001613e-05, |
| "loss": 2.362, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.9186602870813396, |
| "grad_norm": 3.3613922595977783, |
| "learning_rate": 5.363364680146725e-05, |
| "loss": 2.1292, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.937799043062201, |
| "grad_norm": 4.079115867614746, |
| "learning_rate": 5.271751296338823e-05, |
| "loss": 2.3561, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.9569377990430623, |
| "grad_norm": 4.030163764953613, |
| "learning_rate": 5.180646201763577e-05, |
| "loss": 2.1954, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.9760765550239237, |
| "grad_norm": 4.383935928344727, |
| "learning_rate": 5.090059190266779e-05, |
| "loss": 2.0793, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.9952153110047846, |
| "grad_norm": 5.1565775871276855, |
| "learning_rate": 5.000000000000002e-05, |
| "loss": 2.0134, |
| "step": 211 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 8.855152130126953, |
| "learning_rate": 4.9104783123737566e-05, |
| "loss": 2.4084, |
| "step": 212 |
| }, |
| { |
| "epoch": 4.019138755980861, |
| "grad_norm": 3.997187614440918, |
| "learning_rate": 4.821503751016746e-05, |
| "loss": 1.978, |
| "step": 213 |
| }, |
| { |
| "epoch": 4.038277511961723, |
| "grad_norm": 5.068262100219727, |
| "learning_rate": 4.733085880741301e-05, |
| "loss": 2.199, |
| "step": 214 |
| }, |
| { |
| "epoch": 4.057416267942584, |
| "grad_norm": 3.602715015411377, |
| "learning_rate": 4.645234206515171e-05, |
| "loss": 2.0417, |
| "step": 215 |
| }, |
| { |
| "epoch": 4.076555023923445, |
| "grad_norm": 4.461487293243408, |
| "learning_rate": 4.5579581724397255e-05, |
| "loss": 1.7777, |
| "step": 216 |
| }, |
| { |
| "epoch": 4.095693779904306, |
| "grad_norm": 3.1041159629821777, |
| "learning_rate": 4.471267160734731e-05, |
| "loss": 2.1061, |
| "step": 217 |
| }, |
| { |
| "epoch": 4.114832535885167, |
| "grad_norm": 3.8727328777313232, |
| "learning_rate": 4.385170490729712e-05, |
| "loss": 2.4006, |
| "step": 218 |
| }, |
| { |
| "epoch": 4.133971291866029, |
| "grad_norm": 3.868997097015381, |
| "learning_rate": 4.2996774178621736e-05, |
| "loss": 2.0803, |
| "step": 219 |
| }, |
| { |
| "epoch": 4.15311004784689, |
| "grad_norm": 3.5627689361572266, |
| "learning_rate": 4.2147971326825966e-05, |
| "loss": 2.0853, |
| "step": 220 |
| }, |
| { |
| "epoch": 4.172248803827751, |
| "grad_norm": 4.383954048156738, |
| "learning_rate": 4.130538759866457e-05, |
| "loss": 1.7176, |
| "step": 221 |
| }, |
| { |
| "epoch": 4.1913875598086126, |
| "grad_norm": 4.526845932006836, |
| "learning_rate": 4.046911357233343e-05, |
| "loss": 2.0527, |
| "step": 222 |
| }, |
| { |
| "epoch": 4.2105263157894735, |
| "grad_norm": 2.923349380493164, |
| "learning_rate": 3.963923914773187e-05, |
| "loss": 2.1541, |
| "step": 223 |
| }, |
| { |
| "epoch": 4.229665071770335, |
| "grad_norm": 4.575229167938232, |
| "learning_rate": 3.8815853536798904e-05, |
| "loss": 1.986, |
| "step": 224 |
| }, |
| { |
| "epoch": 4.248803827751196, |
| "grad_norm": 3.529787540435791, |
| "learning_rate": 3.79990452539225e-05, |
| "loss": 2.0539, |
| "step": 225 |
| }, |
| { |
| "epoch": 4.267942583732057, |
| "grad_norm": 4.581504821777344, |
| "learning_rate": 3.7188902106424416e-05, |
| "loss": 1.9526, |
| "step": 226 |
| }, |
| { |
| "epoch": 4.287081339712919, |
| "grad_norm": 3.2781484127044678, |
| "learning_rate": 3.638551118512089e-05, |
| "loss": 2.1485, |
| "step": 227 |
| }, |
| { |
| "epoch": 4.30622009569378, |
| "grad_norm": 3.7174124717712402, |
| "learning_rate": 3.558895885496023e-05, |
| "loss": 2.3293, |
| "step": 228 |
| }, |
| { |
| "epoch": 4.3253588516746415, |
| "grad_norm": 4.923449993133545, |
| "learning_rate": 3.479933074573858e-05, |
| "loss": 2.0144, |
| "step": 229 |
| }, |
| { |
| "epoch": 4.344497607655502, |
| "grad_norm": 4.567214488983154, |
| "learning_rate": 3.401671174289469e-05, |
| "loss": 1.8614, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.363636363636363, |
| "grad_norm": 3.7625460624694824, |
| "learning_rate": 3.324118597838464e-05, |
| "loss": 2.1933, |
| "step": 231 |
| }, |
| { |
| "epoch": 4.382775119617225, |
| "grad_norm": 5.30003023147583, |
| "learning_rate": 3.2472836821637744e-05, |
| "loss": 2.0038, |
| "step": 232 |
| }, |
| { |
| "epoch": 4.401913875598086, |
| "grad_norm": 4.20980167388916, |
| "learning_rate": 3.1711746870594086e-05, |
| "loss": 1.9264, |
| "step": 233 |
| }, |
| { |
| "epoch": 4.421052631578947, |
| "grad_norm": 4.678532600402832, |
| "learning_rate": 3.0957997942825336e-05, |
| "loss": 1.9475, |
| "step": 234 |
| }, |
| { |
| "epoch": 4.440191387559809, |
| "grad_norm": 4.418569564819336, |
| "learning_rate": 3.021167106673928e-05, |
| "loss": 2.062, |
| "step": 235 |
| }, |
| { |
| "epoch": 4.45933014354067, |
| "grad_norm": 4.576781272888184, |
| "learning_rate": 2.9472846472869298e-05, |
| "loss": 2.2673, |
| "step": 236 |
| }, |
| { |
| "epoch": 4.478468899521531, |
| "grad_norm": 5.059473037719727, |
| "learning_rate": 2.874160358524931e-05, |
| "loss": 2.2399, |
| "step": 237 |
| }, |
| { |
| "epoch": 4.497607655502392, |
| "grad_norm": 5.032463073730469, |
| "learning_rate": 2.8018021012875994e-05, |
| "loss": 1.8512, |
| "step": 238 |
| }, |
| { |
| "epoch": 4.516746411483254, |
| "grad_norm": 4.410358428955078, |
| "learning_rate": 2.7302176541257986e-05, |
| "loss": 1.8909, |
| "step": 239 |
| }, |
| { |
| "epoch": 4.535885167464115, |
| "grad_norm": 4.2732319831848145, |
| "learning_rate": 2.659414712405398e-05, |
| "loss": 1.833, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.555023923444976, |
| "grad_norm": 4.440384387969971, |
| "learning_rate": 2.5894008874800325e-05, |
| "loss": 1.8964, |
| "step": 241 |
| }, |
| { |
| "epoch": 4.574162679425838, |
| "grad_norm": 4.8430891036987305, |
| "learning_rate": 2.5201837058728505e-05, |
| "loss": 1.7943, |
| "step": 242 |
| }, |
| { |
| "epoch": 4.5933014354066986, |
| "grad_norm": 3.676851987838745, |
| "learning_rate": 2.451770608467432e-05, |
| "loss": 2.0328, |
| "step": 243 |
| }, |
| { |
| "epoch": 4.6124401913875595, |
| "grad_norm": 4.80816650390625, |
| "learning_rate": 2.3841689497078746e-05, |
| "loss": 2.1791, |
| "step": 244 |
| }, |
| { |
| "epoch": 4.631578947368421, |
| "grad_norm": 4.105157852172852, |
| "learning_rate": 2.3173859968081944e-05, |
| "loss": 2.2402, |
| "step": 245 |
| }, |
| { |
| "epoch": 4.650717703349282, |
| "grad_norm": 5.055697441101074, |
| "learning_rate": 2.251428928971102e-05, |
| "loss": 2.2174, |
| "step": 246 |
| }, |
| { |
| "epoch": 4.669856459330144, |
| "grad_norm": 5.220304012298584, |
| "learning_rate": 2.1863048366162208e-05, |
| "loss": 2.163, |
| "step": 247 |
| }, |
| { |
| "epoch": 4.688995215311005, |
| "grad_norm": 5.349198818206787, |
| "learning_rate": 2.1220207206178688e-05, |
| "loss": 1.8591, |
| "step": 248 |
| }, |
| { |
| "epoch": 4.708133971291866, |
| "grad_norm": 3.800992012023926, |
| "learning_rate": 2.058583491552465e-05, |
| "loss": 2.1511, |
| "step": 249 |
| }, |
| { |
| "epoch": 4.7272727272727275, |
| "grad_norm": 4.178462982177734, |
| "learning_rate": 1.995999968955641e-05, |
| "loss": 2.2553, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.746411483253588, |
| "grad_norm": 5.495607852935791, |
| "learning_rate": 1.9342768805891178e-05, |
| "loss": 2.022, |
| "step": 251 |
| }, |
| { |
| "epoch": 4.76555023923445, |
| "grad_norm": 4.614135265350342, |
| "learning_rate": 1.8734208617174988e-05, |
| "loss": 2.1751, |
| "step": 252 |
| }, |
| { |
| "epoch": 4.784688995215311, |
| "grad_norm": 3.8945748805999756, |
| "learning_rate": 1.8134384543949478e-05, |
| "loss": 2.0986, |
| "step": 253 |
| }, |
| { |
| "epoch": 4.803827751196172, |
| "grad_norm": 5.491265773773193, |
| "learning_rate": 1.754336106761927e-05, |
| "loss": 1.8482, |
| "step": 254 |
| }, |
| { |
| "epoch": 4.822966507177034, |
| "grad_norm": 5.249953269958496, |
| "learning_rate": 1.696120172352025e-05, |
| "loss": 1.8416, |
| "step": 255 |
| }, |
| { |
| "epoch": 4.842105263157895, |
| "grad_norm": 4.254781246185303, |
| "learning_rate": 1.6387969094089316e-05, |
| "loss": 2.0863, |
| "step": 256 |
| }, |
| { |
| "epoch": 4.861244019138756, |
| "grad_norm": 5.3179779052734375, |
| "learning_rate": 1.5823724802136865e-05, |
| "loss": 2.2049, |
| "step": 257 |
| }, |
| { |
| "epoch": 4.880382775119617, |
| "grad_norm": 5.007632732391357, |
| "learning_rate": 1.526852950422226e-05, |
| "loss": 1.9023, |
| "step": 258 |
| }, |
| { |
| "epoch": 4.899521531100478, |
| "grad_norm": 3.3414082527160645, |
| "learning_rate": 1.4722442884133214e-05, |
| "loss": 2.0638, |
| "step": 259 |
| }, |
| { |
| "epoch": 4.91866028708134, |
| "grad_norm": 4.421596050262451, |
| "learning_rate": 1.4185523646469822e-05, |
| "loss": 2.0366, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.937799043062201, |
| "grad_norm": 5.466579914093018, |
| "learning_rate": 1.3657829510333654e-05, |
| "loss": 1.8725, |
| "step": 261 |
| }, |
| { |
| "epoch": 4.956937799043062, |
| "grad_norm": 3.617340326309204, |
| "learning_rate": 1.3139417203123027e-05, |
| "loss": 2.0564, |
| "step": 262 |
| }, |
| { |
| "epoch": 4.976076555023924, |
| "grad_norm": 3.936239719390869, |
| "learning_rate": 1.263034245443473e-05, |
| "loss": 1.8447, |
| "step": 263 |
| }, |
| { |
| "epoch": 4.9952153110047846, |
| "grad_norm": 5.094753265380859, |
| "learning_rate": 1.2130659990073146e-05, |
| "loss": 1.7893, |
| "step": 264 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 8.643269538879395, |
| "learning_rate": 1.1640423526166988e-05, |
| "loss": 1.6151, |
| "step": 265 |
| }, |
| { |
| "epoch": 5.019138755980861, |
| "grad_norm": 3.2149994373321533, |
| "learning_rate": 1.1159685763395111e-05, |
| "loss": 2.0046, |
| "step": 266 |
| }, |
| { |
| "epoch": 5.038277511961723, |
| "grad_norm": 4.97629976272583, |
| "learning_rate": 1.0688498381320855e-05, |
| "loss": 1.8201, |
| "step": 267 |
| }, |
| { |
| "epoch": 5.057416267942584, |
| "grad_norm": 3.756946325302124, |
| "learning_rate": 1.0226912032836611e-05, |
| "loss": 2.0823, |
| "step": 268 |
| }, |
| { |
| "epoch": 5.076555023923445, |
| "grad_norm": 3.5998761653900146, |
| "learning_rate": 9.774976338718677e-06, |
| "loss": 2.1409, |
| "step": 269 |
| }, |
| { |
| "epoch": 5.095693779904306, |
| "grad_norm": 4.599725246429443, |
| "learning_rate": 9.332739882292752e-06, |
| "loss": 1.9638, |
| "step": 270 |
| }, |
| { |
| "epoch": 5.114832535885167, |
| "grad_norm": 5.403920650482178, |
| "learning_rate": 8.900250204211514e-06, |
| "loss": 1.9577, |
| "step": 271 |
| }, |
| { |
| "epoch": 5.133971291866029, |
| "grad_norm": 4.915902137756348, |
| "learning_rate": 8.47755379734373e-06, |
| "loss": 2.0126, |
| "step": 272 |
| }, |
| { |
| "epoch": 5.15311004784689, |
| "grad_norm": 5.105212688446045, |
| "learning_rate": 8.064696101776358e-06, |
| "loss": 1.9987, |
| "step": 273 |
| }, |
| { |
| "epoch": 5.172248803827751, |
| "grad_norm": 4.95185661315918, |
| "learning_rate": 7.661721499929753e-06, |
| "loss": 1.7857, |
| "step": 274 |
| }, |
| { |
| "epoch": 5.1913875598086126, |
| "grad_norm": 4.504748344421387, |
| "learning_rate": 7.2686733117863784e-06, |
| "loss": 1.8317, |
| "step": 275 |
| }, |
| { |
| "epoch": 5.2105263157894735, |
| "grad_norm": 4.897287845611572, |
| "learning_rate": 6.8855937902340576e-06, |
| "loss": 1.8711, |
| "step": 276 |
| }, |
| { |
| "epoch": 5.229665071770335, |
| "grad_norm": 4.072137355804443, |
| "learning_rate": 6.512524116523633e-06, |
| "loss": 2.0629, |
| "step": 277 |
| }, |
| { |
| "epoch": 5.248803827751196, |
| "grad_norm": 3.6332151889801025, |
| "learning_rate": 6.149504395842087e-06, |
| "loss": 2.1024, |
| "step": 278 |
| }, |
| { |
| "epoch": 5.267942583732057, |
| "grad_norm": 3.8086438179016113, |
| "learning_rate": 5.7965736530010916e-06, |
| "loss": 2.247, |
| "step": 279 |
| }, |
| { |
| "epoch": 5.287081339712919, |
| "grad_norm": 3.1464338302612305, |
| "learning_rate": 5.453769828241872e-06, |
| "loss": 2.205, |
| "step": 280 |
| }, |
| { |
| "epoch": 5.30622009569378, |
| "grad_norm": 4.133326530456543, |
| "learning_rate": 5.121129773156663e-06, |
| "loss": 1.9466, |
| "step": 281 |
| }, |
| { |
| "epoch": 5.3253588516746415, |
| "grad_norm": 3.292668342590332, |
| "learning_rate": 4.798689246727006e-06, |
| "loss": 2.1296, |
| "step": 282 |
| }, |
| { |
| "epoch": 5.344497607655502, |
| "grad_norm": 3.0857577323913574, |
| "learning_rate": 4.486482911479839e-06, |
| "loss": 2.1429, |
| "step": 283 |
| }, |
| { |
| "epoch": 5.363636363636363, |
| "grad_norm": 3.311474084854126, |
| "learning_rate": 4.184544329761009e-06, |
| "loss": 1.9996, |
| "step": 284 |
| }, |
| { |
| "epoch": 5.382775119617225, |
| "grad_norm": 4.887283802032471, |
| "learning_rate": 3.892905960127546e-06, |
| "loss": 1.9941, |
| "step": 285 |
| }, |
| { |
| "epoch": 5.401913875598086, |
| "grad_norm": 4.46961784362793, |
| "learning_rate": 3.611599153858214e-06, |
| "loss": 2.0069, |
| "step": 286 |
| }, |
| { |
| "epoch": 5.421052631578947, |
| "grad_norm": 4.479908466339111, |
| "learning_rate": 3.3406541515832003e-06, |
| "loss": 1.9726, |
| "step": 287 |
| }, |
| { |
| "epoch": 5.440191387559809, |
| "grad_norm": 4.271525859832764, |
| "learning_rate": 3.0801000800333877e-06, |
| "loss": 2.0064, |
| "step": 288 |
| }, |
| { |
| "epoch": 5.45933014354067, |
| "grad_norm": 4.767016410827637, |
| "learning_rate": 2.8299649489090475e-06, |
| "loss": 1.7359, |
| "step": 289 |
| }, |
| { |
| "epoch": 5.478468899521531, |
| "grad_norm": 4.151036262512207, |
| "learning_rate": 2.590275647868867e-06, |
| "loss": 1.9747, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.497607655502392, |
| "grad_norm": 5.108813285827637, |
| "learning_rate": 2.3610579436393e-06, |
| "loss": 1.9633, |
| "step": 291 |
| }, |
| { |
| "epoch": 5.516746411483254, |
| "grad_norm": 5.201232433319092, |
| "learning_rate": 2.1423364772445887e-06, |
| "loss": 1.9408, |
| "step": 292 |
| }, |
| { |
| "epoch": 5.535885167464115, |
| "grad_norm": 3.8995492458343506, |
| "learning_rate": 1.9341347613579087e-06, |
| "loss": 1.9715, |
| "step": 293 |
| }, |
| { |
| "epoch": 5.555023923444976, |
| "grad_norm": 5.370357036590576, |
| "learning_rate": 1.7364751777736332e-06, |
| "loss": 1.9897, |
| "step": 294 |
| }, |
| { |
| "epoch": 5.574162679425838, |
| "grad_norm": 3.702716588973999, |
| "learning_rate": 1.5493789750014031e-06, |
| "loss": 1.8739, |
| "step": 295 |
| }, |
| { |
| "epoch": 5.5933014354066986, |
| "grad_norm": 4.484430313110352, |
| "learning_rate": 1.3728662659818204e-06, |
| "loss": 2.0162, |
| "step": 296 |
| }, |
| { |
| "epoch": 5.6124401913875595, |
| "grad_norm": 4.100718975067139, |
| "learning_rate": 1.2069560259243328e-06, |
| "loss": 2.0228, |
| "step": 297 |
| }, |
| { |
| "epoch": 5.631578947368421, |
| "grad_norm": 4.891741752624512, |
| "learning_rate": 1.0516660902673448e-06, |
| "loss": 1.9772, |
| "step": 298 |
| }, |
| { |
| "epoch": 5.650717703349282, |
| "grad_norm": 4.323902130126953, |
| "learning_rate": 9.070131527609604e-07, |
| "loss": 1.8804, |
| "step": 299 |
| }, |
| { |
| "epoch": 5.669856459330144, |
| "grad_norm": 5.198728561401367, |
| "learning_rate": 7.730127636723539e-07, |
| "loss": 1.6451, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.688995215311005, |
| "grad_norm": 4.846747398376465, |
| "learning_rate": 6.496793281141056e-07, |
| "loss": 1.9766, |
| "step": 301 |
| }, |
| { |
| "epoch": 5.708133971291866, |
| "grad_norm": 5.043095588684082, |
| "learning_rate": 5.370261044956971e-07, |
| "loss": 2.1393, |
| "step": 302 |
| }, |
| { |
| "epoch": 5.7272727272727275, |
| "grad_norm": 4.933630466461182, |
| "learning_rate": 4.3506520309813947e-07, |
| "loss": 1.8928, |
| "step": 303 |
| }, |
| { |
| "epoch": 5.746411483253588, |
| "grad_norm": 4.211745738983154, |
| "learning_rate": 3.4380758477219333e-07, |
| "loss": 1.9026, |
| "step": 304 |
| }, |
| { |
| "epoch": 5.76555023923445, |
| "grad_norm": 5.295810222625732, |
| "learning_rate": 2.6326305976001055e-07, |
| "loss": 2.0479, |
| "step": 305 |
| }, |
| { |
| "epoch": 5.784688995215311, |
| "grad_norm": 4.567193984985352, |
| "learning_rate": 1.9344028664056713e-07, |
| "loss": 2.1354, |
| "step": 306 |
| }, |
| { |
| "epoch": 5.803827751196172, |
| "grad_norm": 4.380620002746582, |
| "learning_rate": 1.3434677139885222e-07, |
| "loss": 1.952, |
| "step": 307 |
| }, |
| { |
| "epoch": 5.822966507177034, |
| "grad_norm": 4.634738922119141, |
| "learning_rate": 8.598886661895788e-08, |
| "loss": 1.6821, |
| "step": 308 |
| }, |
| { |
| "epoch": 5.842105263157895, |
| "grad_norm": 4.653122901916504, |
| "learning_rate": 4.837177080119215e-08, |
| "loss": 1.8293, |
| "step": 309 |
| }, |
| { |
| "epoch": 5.861244019138756, |
| "grad_norm": 5.309375286102295, |
| "learning_rate": 2.1499527803214846e-08, |
| "loss": 1.7619, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.880382775119617, |
| "grad_norm": 5.267163276672363, |
| "learning_rate": 5.375026405352035e-09, |
| "loss": 1.9069, |
| "step": 311 |
| }, |
| { |
| "epoch": 5.899521531100478, |
| "grad_norm": 3.548534870147705, |
| "learning_rate": 0.0, |
| "loss": 2.1263, |
| "step": 312 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 312, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 876589621248000.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|