{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 418,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0023923444976076554,
      "grad_norm": 20.119582297751847,
      "learning_rate": 0.0,
      "loss": 2.1821,
      "num_tokens": 274125.0,
      "step": 1
    },
    {
      "epoch": 0.004784688995215311,
      "grad_norm": 23.170097201666195,
      "learning_rate": 7.692307692307694e-07,
      "loss": 2.2358,
      "num_tokens": 493377.0,
      "step": 2
    },
    {
      "epoch": 0.007177033492822967,
      "grad_norm": 23.45399110433363,
      "learning_rate": 1.5384615384615387e-06,
      "loss": 2.2012,
      "num_tokens": 686897.0,
      "step": 3
    },
    {
      "epoch": 0.009569377990430622,
      "grad_norm": 19.228005860305963,
      "learning_rate": 2.307692307692308e-06,
      "loss": 2.2219,
      "num_tokens": 914354.0,
      "step": 4
    },
    {
      "epoch": 0.011961722488038277,
      "grad_norm": 18.548172181534362,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 2.1958,
      "num_tokens": 1087390.0,
      "step": 5
    },
    {
      "epoch": 0.014354066985645933,
      "grad_norm": 13.728999411657618,
      "learning_rate": 3.846153846153847e-06,
      "loss": 2.1247,
      "num_tokens": 1268762.0,
      "step": 6
    },
    {
      "epoch": 0.01674641148325359,
      "grad_norm": 8.505233076733274,
      "learning_rate": 4.615384615384616e-06,
      "loss": 2.0737,
      "num_tokens": 1433561.0,
      "step": 7
    },
    {
      "epoch": 0.019138755980861243,
      "grad_norm": 4.1867059667712025,
      "learning_rate": 5.384615384615385e-06,
      "loss": 1.938,
      "num_tokens": 1655898.0,
      "step": 8
    },
    {
      "epoch": 0.0215311004784689,
      "grad_norm": 3.3689397757728203,
      "learning_rate": 6.153846153846155e-06,
      "loss": 1.8762,
      "num_tokens": 1904754.0,
      "step": 9
    },
    {
      "epoch": 0.023923444976076555,
      "grad_norm": 2.6965544763096254,
      "learning_rate": 6.923076923076923e-06,
      "loss": 1.8481,
      "num_tokens": 2100951.0,
      "step": 10
    },
    {
      "epoch": 0.02631578947368421,
      "grad_norm": 2.0908111385220045,
      "learning_rate": 7.692307692307694e-06,
      "loss": 1.7457,
      "num_tokens": 2264681.0,
      "step": 11
    },
    {
      "epoch": 0.028708133971291867,
      "grad_norm": 2.170718723726301,
      "learning_rate": 8.461538461538462e-06,
      "loss": 1.7225,
      "num_tokens": 2459076.0,
      "step": 12
    },
    {
      "epoch": 0.03110047846889952,
      "grad_norm": 2.16857982636961,
      "learning_rate": 9.230769230769232e-06,
      "loss": 1.6537,
      "num_tokens": 2606612.0,
      "step": 13
    },
    {
      "epoch": 0.03349282296650718,
      "grad_norm": 1.5656854944876009,
      "learning_rate": 1e-05,
      "loss": 1.6801,
      "num_tokens": 2766328.0,
      "step": 14
    },
    {
      "epoch": 0.03588516746411483,
      "grad_norm": 1.6899464949924934,
      "learning_rate": 9.999864615158956e-06,
      "loss": 1.3963,
      "num_tokens": 2939734.0,
      "step": 15
    },
    {
      "epoch": 0.03827751196172249,
      "grad_norm": 1.2147889414450102,
      "learning_rate": 9.999458468782065e-06,
      "loss": 1.6588,
      "num_tokens": 3209741.0,
      "step": 16
    },
    {
      "epoch": 0.04066985645933014,
      "grad_norm": 1.3059422864639767,
      "learning_rate": 9.998781585307577e-06,
      "loss": 1.2028,
      "num_tokens": 3331253.0,
      "step": 17
    },
    {
      "epoch": 0.0430622009569378,
      "grad_norm": 0.8168354152517865,
      "learning_rate": 9.997834005464281e-06,
      "loss": 1.5119,
      "num_tokens": 3550942.0,
      "step": 18
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 0.7578450765410201,
      "learning_rate": 9.996615786269036e-06,
      "loss": 1.5165,
      "num_tokens": 3734184.0,
      "step": 19
    },
    {
      "epoch": 0.04784688995215311,
      "grad_norm": 0.772984535484589,
      "learning_rate": 9.995127001023362e-06,
      "loss": 1.4925,
      "num_tokens": 3923612.0,
      "step": 20
    },
    {
      "epoch": 0.050239234449760764,
      "grad_norm": 0.7657276095351829,
      "learning_rate": 9.993367739309013e-06,
      "loss": 1.3945,
      "num_tokens": 4090661.0,
      "step": 21
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 0.6839298661119211,
      "learning_rate": 9.991338106982598e-06,
      "loss": 1.46,
      "num_tokens": 4300333.0,
      "step": 22
    },
    {
      "epoch": 0.05502392344497608,
      "grad_norm": 0.7054066291049598,
      "learning_rate": 9.98903822616921e-06,
      "loss": 1.3554,
      "num_tokens": 4483986.0,
      "step": 23
    },
    {
      "epoch": 0.05741626794258373,
      "grad_norm": 0.7193972470009606,
      "learning_rate": 9.986468235255065e-06,
      "loss": 1.4998,
      "num_tokens": 4682593.0,
      "step": 24
    },
    {
      "epoch": 0.05980861244019139,
      "grad_norm": 0.6625723448730417,
      "learning_rate": 9.983628288879193e-06,
      "loss": 1.4898,
      "num_tokens": 4880940.0,
      "step": 25
    },
    {
      "epoch": 0.06220095693779904,
      "grad_norm": 0.7404539912651659,
      "learning_rate": 9.98051855792412e-06,
      "loss": 1.3321,
      "num_tokens": 5074700.0,
      "step": 26
    },
    {
      "epoch": 0.0645933014354067,
      "grad_norm": 0.7881264974132591,
      "learning_rate": 9.977139229505596e-06,
      "loss": 1.2212,
      "num_tokens": 5225193.0,
      "step": 27
    },
    {
      "epoch": 0.06698564593301436,
      "grad_norm": 0.6060089446257308,
      "learning_rate": 9.973490506961326e-06,
      "loss": 1.5731,
      "num_tokens": 5447459.0,
      "step": 28
    },
    {
      "epoch": 0.06937799043062201,
      "grad_norm": 0.618254776059864,
      "learning_rate": 9.969572609838745e-06,
      "loss": 1.4722,
      "num_tokens": 5676623.0,
      "step": 29
    },
    {
      "epoch": 0.07177033492822966,
      "grad_norm": 0.6304080009866732,
      "learning_rate": 9.965385773881795e-06,
      "loss": 1.3474,
      "num_tokens": 5898924.0,
      "step": 30
    },
    {
      "epoch": 0.07416267942583732,
      "grad_norm": 0.6104465608230878,
      "learning_rate": 9.960930251016752e-06,
      "loss": 1.4138,
      "num_tokens": 6089369.0,
      "step": 31
    },
    {
      "epoch": 0.07655502392344497,
      "grad_norm": 0.6581355504876419,
      "learning_rate": 9.956206309337067e-06,
      "loss": 1.4661,
      "num_tokens": 6294065.0,
      "step": 32
    },
    {
      "epoch": 0.07894736842105263,
      "grad_norm": 0.5866617107994286,
      "learning_rate": 9.951214233087223e-06,
      "loss": 1.4306,
      "num_tokens": 6515957.0,
      "step": 33
    },
    {
      "epoch": 0.08133971291866028,
      "grad_norm": 0.605393818271364,
      "learning_rate": 9.945954322645643e-06,
      "loss": 1.3046,
      "num_tokens": 6725025.0,
      "step": 34
    },
    {
      "epoch": 0.08373205741626795,
      "grad_norm": 0.5778342378194031,
      "learning_rate": 9.940426894506608e-06,
      "loss": 1.4363,
      "num_tokens": 6949955.0,
      "step": 35
    },
    {
      "epoch": 0.0861244019138756,
      "grad_norm": 0.6258805596031615,
      "learning_rate": 9.934632281261221e-06,
      "loss": 1.3519,
      "num_tokens": 7152815.0,
      "step": 36
    },
    {
      "epoch": 0.08851674641148326,
      "grad_norm": 0.5788764918533683,
      "learning_rate": 9.928570831577396e-06,
      "loss": 1.4289,
      "num_tokens": 7365760.0,
      "step": 37
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 0.6104478143341243,
      "learning_rate": 9.922242910178862e-06,
      "loss": 1.4927,
      "num_tokens": 7619917.0,
      "step": 38
    },
    {
      "epoch": 0.09330143540669857,
      "grad_norm": 0.666007518151506,
      "learning_rate": 9.915648897823232e-06,
      "loss": 1.1965,
      "num_tokens": 7772797.0,
      "step": 39
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 0.7244485739284531,
      "learning_rate": 9.908789191279093e-06,
      "loss": 1.3198,
      "num_tokens": 7978612.0,
      "step": 40
    },
    {
      "epoch": 0.09808612440191387,
      "grad_norm": 0.6500652663575426,
      "learning_rate": 9.901664203302126e-06,
      "loss": 1.3692,
      "num_tokens": 8181944.0,
      "step": 41
    },
    {
      "epoch": 0.10047846889952153,
      "grad_norm": 0.6523516464098081,
      "learning_rate": 9.89427436261027e-06,
      "loss": 1.2651,
      "num_tokens": 8349921.0,
      "step": 42
    },
    {
      "epoch": 0.10287081339712918,
      "grad_norm": 0.6141096849362858,
      "learning_rate": 9.886620113857926e-06,
      "loss": 1.1674,
      "num_tokens": 8513062.0,
      "step": 43
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.5176000363276883,
      "learning_rate": 9.878701917609208e-06,
      "loss": 1.3363,
      "num_tokens": 8739362.0,
      "step": 44
    },
    {
      "epoch": 0.1076555023923445,
      "grad_norm": 0.6496907081327192,
      "learning_rate": 9.870520250310223e-06,
      "loss": 1.2051,
      "num_tokens": 8882227.0,
      "step": 45
    },
    {
      "epoch": 0.11004784688995216,
      "grad_norm": 0.5781609822463768,
      "learning_rate": 9.862075604260402e-06,
      "loss": 1.4038,
      "num_tokens": 9101362.0,
      "step": 46
    },
    {
      "epoch": 0.11244019138755981,
      "grad_norm": 0.7191639780141069,
      "learning_rate": 9.853368487582888e-06,
      "loss": 1.1333,
      "num_tokens": 9286876.0,
      "step": 47
    },
    {
      "epoch": 0.11483253588516747,
      "grad_norm": 0.6406116951034948,
      "learning_rate": 9.84439942419395e-06,
      "loss": 1.4121,
      "num_tokens": 9459192.0,
      "step": 48
    },
    {
      "epoch": 0.11722488038277512,
      "grad_norm": 0.5661996222062946,
      "learning_rate": 9.835168953771463e-06,
      "loss": 1.322,
      "num_tokens": 9724803.0,
      "step": 49
    },
    {
      "epoch": 0.11961722488038277,
      "grad_norm": 0.5715728086031884,
      "learning_rate": 9.825677631722436e-06,
      "loss": 1.3516,
      "num_tokens": 9933571.0,
      "step": 50
    },
    {
      "epoch": 0.12200956937799043,
      "grad_norm": 0.6325774615690734,
      "learning_rate": 9.815926029149593e-06,
      "loss": 1.258,
      "num_tokens": 10136490.0,
      "step": 51
    },
    {
      "epoch": 0.12440191387559808,
      "grad_norm": 0.5904482238857803,
      "learning_rate": 9.805914732817007e-06,
      "loss": 1.293,
      "num_tokens": 10340564.0,
      "step": 52
    },
    {
      "epoch": 0.12679425837320574,
      "grad_norm": 0.5710320806437825,
      "learning_rate": 9.795644345114796e-06,
      "loss": 1.2765,
      "num_tokens": 10553400.0,
      "step": 53
    },
    {
      "epoch": 0.1291866028708134,
      "grad_norm": 0.622309054620362,
      "learning_rate": 9.78511548402287e-06,
      "loss": 1.123,
      "num_tokens": 10758112.0,
      "step": 54
    },
    {
      "epoch": 0.13157894736842105,
      "grad_norm": 0.7557997838257337,
      "learning_rate": 9.77432878307376e-06,
      "loss": 1.1149,
      "num_tokens": 10934718.0,
      "step": 55
    },
    {
      "epoch": 0.1339712918660287,
      "grad_norm": 0.4774648627893749,
      "learning_rate": 9.763284891314481e-06,
      "loss": 1.4329,
      "num_tokens": 11227923.0,
      "step": 56
    },
    {
      "epoch": 0.13636363636363635,
      "grad_norm": 0.6518939385243675,
      "learning_rate": 9.751984473267498e-06,
      "loss": 1.2629,
      "num_tokens": 11417535.0,
      "step": 57
    },
    {
      "epoch": 0.13875598086124402,
      "grad_norm": 0.5370370863120535,
      "learning_rate": 9.740428208890716e-06,
      "loss": 1.3426,
      "num_tokens": 11651380.0,
      "step": 58
    },
    {
      "epoch": 0.14114832535885166,
      "grad_norm": 0.5696851508370838,
      "learning_rate": 9.728616793536588e-06,
      "loss": 1.125,
      "num_tokens": 11830736.0,
      "step": 59
    },
    {
      "epoch": 0.14354066985645933,
      "grad_norm": 0.5644132429290988,
      "learning_rate": 9.716550937910268e-06,
      "loss": 1.2145,
      "num_tokens": 12023638.0,
      "step": 60
    },
    {
      "epoch": 0.145933014354067,
      "grad_norm": 0.7461647382617252,
      "learning_rate": 9.70423136802684e-06,
      "loss": 1.204,
      "num_tokens": 12234061.0,
      "step": 61
    },
    {
      "epoch": 0.14832535885167464,
      "grad_norm": 0.5086888568285274,
      "learning_rate": 9.691658825167641e-06,
      "loss": 1.3124,
      "num_tokens": 12472421.0,
      "step": 62
    },
    {
      "epoch": 0.1507177033492823,
      "grad_norm": 0.5053241954118645,
      "learning_rate": 9.67883406583566e-06,
      "loss": 1.3634,
      "num_tokens": 12734106.0,
      "step": 63
    },
    {
      "epoch": 0.15311004784688995,
      "grad_norm": 0.5179034670964426,
      "learning_rate": 9.665757861710008e-06,
      "loss": 1.3053,
      "num_tokens": 12960684.0,
      "step": 64
    },
    {
      "epoch": 0.15550239234449761,
      "grad_norm": 0.5461947358982723,
      "learning_rate": 9.652430999599491e-06,
      "loss": 1.2969,
      "num_tokens": 13170331.0,
      "step": 65
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 0.6423563162262463,
      "learning_rate": 9.638854281395271e-06,
      "loss": 1.3541,
      "num_tokens": 13397481.0,
      "step": 66
    },
    {
      "epoch": 0.16028708133971292,
      "grad_norm": 0.5755576573234283,
      "learning_rate": 9.625028524022606e-06,
      "loss": 1.2183,
      "num_tokens": 13638917.0,
      "step": 67
    },
    {
      "epoch": 0.16267942583732056,
      "grad_norm": 0.6393096708849371,
      "learning_rate": 9.610954559391704e-06,
      "loss": 1.2774,
      "num_tokens": 13845779.0,
      "step": 68
    },
    {
      "epoch": 0.16507177033492823,
      "grad_norm": 0.6238780043211961,
      "learning_rate": 9.596633234347661e-06,
      "loss": 1.0493,
      "num_tokens": 14015645.0,
      "step": 69
    },
    {
      "epoch": 0.1674641148325359,
      "grad_norm": 0.6004590974749275,
      "learning_rate": 9.582065410619503e-06,
      "loss": 1.1128,
      "num_tokens": 14174170.0,
      "step": 70
    },
    {
      "epoch": 0.16985645933014354,
      "grad_norm": 0.5353801191806298,
      "learning_rate": 9.567251964768343e-06,
      "loss": 1.2534,
      "num_tokens": 14391398.0,
      "step": 71
    },
    {
      "epoch": 0.1722488038277512,
      "grad_norm": 0.5703356560477955,
      "learning_rate": 9.55219378813463e-06,
      "loss": 1.2457,
      "num_tokens": 14610731.0,
      "step": 72
    },
    {
      "epoch": 0.17464114832535885,
      "grad_norm": 0.5213842592670314,
      "learning_rate": 9.53689178678452e-06,
      "loss": 1.3794,
      "num_tokens": 14858252.0,
      "step": 73
    },
    {
      "epoch": 0.17703349282296652,
      "grad_norm": 0.5665738251245545,
      "learning_rate": 9.521346881455356e-06,
      "loss": 1.3718,
      "num_tokens": 15084332.0,
      "step": 74
    },
    {
      "epoch": 0.17942583732057416,
      "grad_norm": 0.5432851738944047,
      "learning_rate": 9.505560007500263e-06,
      "loss": 1.2429,
      "num_tokens": 15352232.0,
      "step": 75
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 0.6029856670534988,
      "learning_rate": 9.489532114831876e-06,
      "loss": 1.1883,
      "num_tokens": 15574514.0,
      "step": 76
    },
    {
      "epoch": 0.18421052631578946,
      "grad_norm": 0.5636116286831033,
      "learning_rate": 9.473264167865172e-06,
      "loss": 1.1939,
      "num_tokens": 15788273.0,
      "step": 77
    },
    {
      "epoch": 0.18660287081339713,
      "grad_norm": 0.5273294226554239,
      "learning_rate": 9.456757145459445e-06,
      "loss": 1.3284,
      "num_tokens": 16058083.0,
      "step": 78
    },
    {
      "epoch": 0.18899521531100477,
      "grad_norm": 0.6091499871383838,
      "learning_rate": 9.44001204085941e-06,
      "loss": 1.1578,
      "num_tokens": 16222078.0,
      "step": 79
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 0.5729867351707406,
      "learning_rate": 9.423029861635431e-06,
      "loss": 1.1448,
      "num_tokens": 16452197.0,
      "step": 80
    },
    {
      "epoch": 0.1937799043062201,
      "grad_norm": 0.5753208503065251,
      "learning_rate": 9.405811629622904e-06,
      "loss": 1.3236,
      "num_tokens": 16678106.0,
      "step": 81
    },
    {
      "epoch": 0.19617224880382775,
      "grad_norm": 0.613469703266833,
      "learning_rate": 9.388358380860763e-06,
      "loss": 1.1021,
      "num_tokens": 16908054.0,
      "step": 82
    },
    {
      "epoch": 0.19856459330143542,
      "grad_norm": 0.6002222062441086,
      "learning_rate": 9.370671165529146e-06,
      "loss": 1.1476,
      "num_tokens": 17140981.0,
      "step": 83
    },
    {
      "epoch": 0.20095693779904306,
      "grad_norm": 0.5295041630429093,
      "learning_rate": 9.3527510478862e-06,
      "loss": 1.2725,
      "num_tokens": 17364693.0,
      "step": 84
    },
    {
      "epoch": 0.20334928229665072,
      "grad_norm": 0.5369203542352684,
      "learning_rate": 9.334599106204051e-06,
      "loss": 1.2895,
      "num_tokens": 17563578.0,
      "step": 85
    },
    {
      "epoch": 0.20574162679425836,
      "grad_norm": 0.5193929587177428,
      "learning_rate": 9.316216432703918e-06,
      "loss": 1.2499,
      "num_tokens": 17740374.0,
      "step": 86
    },
    {
      "epoch": 0.20813397129186603,
      "grad_norm": 0.49812886005887325,
      "learning_rate": 9.29760413349039e-06,
      "loss": 1.3455,
      "num_tokens": 18015806.0,
      "step": 87
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.5190241504997857,
      "learning_rate": 9.278763328484875e-06,
      "loss": 1.0828,
      "num_tokens": 18245485.0,
      "step": 88
    },
    {
      "epoch": 0.21291866028708134,
      "grad_norm": 0.534699634820348,
      "learning_rate": 9.259695151358215e-06,
      "loss": 1.2029,
      "num_tokens": 18441471.0,
      "step": 89
    },
    {
      "epoch": 0.215311004784689,
      "grad_norm": 0.5368146817909797,
      "learning_rate": 9.240400749462467e-06,
      "loss": 1.13,
      "num_tokens": 18659186.0,
      "step": 90
    },
    {
      "epoch": 0.21770334928229665,
      "grad_norm": 0.6643155654192867,
      "learning_rate": 9.220881283761868e-06,
      "loss": 1.1626,
      "num_tokens": 18811916.0,
      "step": 91
    },
    {
      "epoch": 0.22009569377990432,
      "grad_norm": 0.5953751009151461,
      "learning_rate": 9.20113792876298e-06,
      "loss": 1.1446,
      "num_tokens": 18974285.0,
      "step": 92
    },
    {
      "epoch": 0.22248803827751196,
      "grad_norm": 0.6067628035324104,
      "learning_rate": 9.181171872444015e-06,
      "loss": 1.2417,
      "num_tokens": 19182034.0,
      "step": 93
    },
    {
      "epoch": 0.22488038277511962,
      "grad_norm": 0.6396322460129866,
      "learning_rate": 9.160984316183354e-06,
      "loss": 1.0376,
      "num_tokens": 19324593.0,
      "step": 94
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.5167898612058803,
      "learning_rate": 9.140576474687263e-06,
      "loss": 1.0627,
      "num_tokens": 19559212.0,
      "step": 95
    },
    {
      "epoch": 0.22966507177033493,
      "grad_norm": 0.6898506829068124,
      "learning_rate": 9.1199495759168e-06,
      "loss": 1.0682,
      "num_tokens": 19734777.0,
      "step": 96
    },
    {
      "epoch": 0.23205741626794257,
      "grad_norm": 0.5632751758217261,
      "learning_rate": 9.099104861013922e-06,
      "loss": 1.2069,
      "num_tokens": 19924776.0,
      "step": 97
    },
    {
      "epoch": 0.23444976076555024,
      "grad_norm": 0.4975676948616479,
      "learning_rate": 9.078043584226816e-06,
      "loss": 1.2944,
      "num_tokens": 20166431.0,
      "step": 98
    },
    {
      "epoch": 0.23684210526315788,
      "grad_norm": 0.5811862630357938,
      "learning_rate": 9.056767012834417e-06,
      "loss": 1.2261,
      "num_tokens": 20342559.0,
      "step": 99
    },
    {
      "epoch": 0.23923444976076555,
      "grad_norm": 0.6205394909309613,
      "learning_rate": 9.035276427070166e-06,
      "loss": 1.1827,
      "num_tokens": 20528647.0,
      "step": 100
    },
    {
      "epoch": 0.24162679425837322,
      "grad_norm": 0.6101249338540917,
      "learning_rate": 9.013573120044968e-06,
      "loss": 1.0195,
      "num_tokens": 20735927.0,
      "step": 101
    },
    {
      "epoch": 0.24401913875598086,
      "grad_norm": 0.5589655982664236,
      "learning_rate": 8.991658397669384e-06,
      "loss": 1.2941,
      "num_tokens": 20973055.0,
      "step": 102
    },
    {
      "epoch": 0.24641148325358853,
      "grad_norm": 0.602415461668376,
      "learning_rate": 8.96953357857507e-06,
      "loss": 0.9238,
      "num_tokens": 21131698.0,
      "step": 103
    },
    {
      "epoch": 0.24880382775119617,
      "grad_norm": 0.4635975776481471,
      "learning_rate": 8.947199994035402e-06,
      "loss": 1.206,
      "num_tokens": 21426277.0,
      "step": 104
    },
    {
      "epoch": 0.2511961722488038,
      "grad_norm": 0.5416414335210736,
      "learning_rate": 8.924658987885403e-06,
      "loss": 1.1863,
      "num_tokens": 21629826.0,
      "step": 105
    },
    {
      "epoch": 0.2535885167464115,
      "grad_norm": 0.703889948074174,
      "learning_rate": 8.901911916440867e-06,
      "loss": 1.0592,
      "num_tokens": 21805342.0,
      "step": 106
    },
    {
      "epoch": 0.25598086124401914,
      "grad_norm": 0.5638998814508404,
      "learning_rate": 8.878960148416747e-06,
      "loss": 1.2387,
      "num_tokens": 21993750.0,
      "step": 107
    },
    {
      "epoch": 0.2583732057416268,
      "grad_norm": 0.5224818527209029,
      "learning_rate": 8.855805064844808e-06,
      "loss": 1.3391,
      "num_tokens": 22182974.0,
      "step": 108
    },
    {
      "epoch": 0.2607655502392344,
      "grad_norm": 0.5975570946282182,
      "learning_rate": 8.832448058990522e-06,
      "loss": 1.1119,
      "num_tokens": 22406584.0,
      "step": 109
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 0.5342575640517132,
      "learning_rate": 8.80889053626923e-06,
      "loss": 1.1556,
      "num_tokens": 22591986.0,
      "step": 110
    },
    {
      "epoch": 0.26555023923444976,
      "grad_norm": 0.6463928995023777,
      "learning_rate": 8.785133914161586e-06,
      "loss": 1.0927,
      "num_tokens": 22755674.0,
      "step": 111
    },
    {
      "epoch": 0.2679425837320574,
      "grad_norm": 0.5540394516081272,
      "learning_rate": 8.761179622128264e-06,
      "loss": 1.1932,
      "num_tokens": 22979344.0,
      "step": 112
    },
    {
      "epoch": 0.2703349282296651,
      "grad_norm": 0.5639562135925512,
      "learning_rate": 8.737029101523931e-06,
      "loss": 1.1062,
      "num_tokens": 23213393.0,
      "step": 113
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 0.47416665855465817,
      "learning_rate": 8.712683805510547e-06,
      "loss": 1.0925,
      "num_tokens": 23440736.0,
      "step": 114
    },
    {
      "epoch": 0.2751196172248804,
      "grad_norm": 0.6750642922896175,
      "learning_rate": 8.6881451989699e-06,
      "loss": 1.2461,
      "num_tokens": 23595366.0,
      "step": 115
    },
    {
      "epoch": 0.27751196172248804,
      "grad_norm": 0.5459520630146212,
      "learning_rate": 8.66341475841548e-06,
      "loss": 1.1222,
      "num_tokens": 23807492.0,
      "step": 116
    },
    {
      "epoch": 0.2799043062200957,
      "grad_norm": 0.5301705350454893,
      "learning_rate": 8.638493971903621e-06,
      "loss": 1.3022,
      "num_tokens": 24019959.0,
      "step": 117
    },
    {
      "epoch": 0.2822966507177033,
      "grad_norm": 0.6424194649582932,
      "learning_rate": 8.613384338943982e-06,
      "loss": 1.0574,
      "num_tokens": 24205265.0,
      "step": 118
    },
    {
      "epoch": 0.284688995215311,
      "grad_norm": 0.5546308776167657,
      "learning_rate": 8.588087370409303e-06,
      "loss": 1.2411,
      "num_tokens": 24429509.0,
      "step": 119
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 0.480470812260585,
      "learning_rate": 8.562604588444498e-06,
      "loss": 1.2674,
      "num_tokens": 24680453.0,
      "step": 120
    },
    {
      "epoch": 0.2894736842105263,
      "grad_norm": 0.5297827708710372,
      "learning_rate": 8.536937526375075e-06,
      "loss": 1.2252,
      "num_tokens": 24893378.0,
      "step": 121
    },
    {
      "epoch": 0.291866028708134,
      "grad_norm": 0.770470928681588,
      "learning_rate": 8.511087728614863e-06,
      "loss": 1.0353,
      "num_tokens": 25020898.0,
      "step": 122
    },
    {
      "epoch": 0.2942583732057416,
      "grad_norm": 0.5337837938457338,
      "learning_rate": 8.485056750573088e-06,
      "loss": 1.2966,
      "num_tokens": 25273187.0,
      "step": 123
    },
    {
      "epoch": 0.2966507177033493,
      "grad_norm": 0.592552325078839,
      "learning_rate": 8.458846158560787e-06,
      "loss": 1.1754,
      "num_tokens": 25469601.0,
      "step": 124
    },
    {
      "epoch": 0.29904306220095694,
      "grad_norm": 0.5958320399693818,
      "learning_rate": 8.43245752969655e-06,
      "loss": 1.069,
      "num_tokens": 25648408.0,
      "step": 125
    },
    {
      "epoch": 0.3014354066985646,
      "grad_norm": 0.624744711279868,
      "learning_rate": 8.40589245181163e-06,
      "loss": 1.1037,
      "num_tokens": 25866106.0,
      "step": 126
    },
    {
      "epoch": 0.3038277511961722,
      "grad_norm": 0.6392805038022229,
      "learning_rate": 8.379152523354407e-06,
      "loss": 1.1845,
      "num_tokens": 26058009.0,
      "step": 127
    },
    {
      "epoch": 0.3062200956937799,
      "grad_norm": 0.5505337156956458,
      "learning_rate": 8.352239353294196e-06,
      "loss": 1.245,
      "num_tokens": 26327152.0,
      "step": 128
    },
    {
      "epoch": 0.30861244019138756,
      "grad_norm": 0.5429338635678093,
      "learning_rate": 8.325154561024445e-06,
      "loss": 1.3208,
      "num_tokens": 26559334.0,
      "step": 129
    },
    {
      "epoch": 0.31100478468899523,
      "grad_norm": 0.5543720622642925,
      "learning_rate": 8.29789977626528e-06,
      "loss": 1.217,
      "num_tokens": 26754982.0,
      "step": 130
    },
    {
      "epoch": 0.3133971291866029,
      "grad_norm": 0.6525624593414054,
      "learning_rate": 8.270476638965463e-06,
      "loss": 1.0719,
      "num_tokens": 26887851.0,
      "step": 131
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.6284711216463389,
      "learning_rate": 8.242886799203696e-06,
      "loss": 1.1727,
      "num_tokens": 27042502.0,
      "step": 132
    },
    {
      "epoch": 0.3181818181818182,
      "grad_norm": 0.5632325030743454,
      "learning_rate": 8.215131917089342e-06,
      "loss": 1.1525,
      "num_tokens": 27248040.0,
      "step": 133
    },
    {
      "epoch": 0.32057416267942584,
      "grad_norm": 0.6252698594109136,
      "learning_rate": 8.187213662662539e-06,
      "loss": 1.0868,
      "num_tokens": 27463386.0,
      "step": 134
    },
    {
      "epoch": 0.3229665071770335,
      "grad_norm": 0.55667567195552,
      "learning_rate": 8.159133715793701e-06,
      "loss": 1.1098,
      "num_tokens": 27684485.0,
      "step": 135
    },
    {
      "epoch": 0.3253588516746411,
      "grad_norm": 0.5109763125317217,
      "learning_rate": 8.13089376608245e-06,
      "loss": 1.1185,
      "num_tokens": 27901192.0,
      "step": 136
    },
    {
      "epoch": 0.3277511961722488,
      "grad_norm": 0.5657322857245803,
      "learning_rate": 8.102495512755939e-06,
      "loss": 1.3105,
      "num_tokens": 28138162.0,
      "step": 137
    },
    {
      "epoch": 0.33014354066985646,
      "grad_norm": 0.5063120233634636,
      "learning_rate": 8.073940664566623e-06,
      "loss": 1.2374,
      "num_tokens": 28355174.0,
      "step": 138
    },
    {
      "epoch": 0.33253588516746413,
      "grad_norm": 0.5701958065694588,
      "learning_rate": 8.045230939689425e-06,
      "loss": 1.1063,
      "num_tokens": 28521259.0,
      "step": 139
    },
    {
      "epoch": 0.3349282296650718,
      "grad_norm": 0.540247926031648,
      "learning_rate": 8.016368065618361e-06,
      "loss": 1.0551,
      "num_tokens": 28746191.0,
      "step": 140
    },
    {
      "epoch": 0.3373205741626794,
      "grad_norm": 0.5340355745257312,
      "learning_rate": 7.987353779062598e-06,
      "loss": 1.235,
      "num_tokens": 29022355.0,
      "step": 141
    },
    {
      "epoch": 0.3397129186602871,
      "grad_norm": 0.5292859186809687,
      "learning_rate": 7.958189825841942e-06,
      "loss": 1.1531,
      "num_tokens": 29238427.0,
      "step": 142
    },
    {
      "epoch": 0.34210526315789475,
      "grad_norm": 0.7322544316739465,
      "learning_rate": 7.928877960781808e-06,
      "loss": 0.9135,
      "num_tokens": 29379111.0,
      "step": 143
    },
    {
      "epoch": 0.3444976076555024,
      "grad_norm": 0.5080774575481332,
      "learning_rate": 7.899419947607611e-06,
      "loss": 1.2097,
      "num_tokens": 29627097.0,
      "step": 144
    },
    {
      "epoch": 0.34688995215311,
      "grad_norm": 0.5832151085081759,
      "learning_rate": 7.869817558838654e-06,
      "loss": 1.0816,
      "num_tokens": 29832123.0,
      "step": 145
    },
    {
      "epoch": 0.3492822966507177,
      "grad_norm": 0.5206108052264397,
      "learning_rate": 7.840072575681468e-06,
      "loss": 1.108,
      "num_tokens": 30048644.0,
      "step": 146
    },
    {
      "epoch": 0.35167464114832536,
      "grad_norm": 0.5570271309488313,
      "learning_rate": 7.810186787922645e-06,
      "loss": 1.1653,
      "num_tokens": 30247851.0,
      "step": 147
    },
    {
      "epoch": 0.35406698564593303,
      "grad_norm": 0.4918371375990957,
      "learning_rate": 7.78016199382112e-06,
      "loss": 1.1408,
      "num_tokens": 30527686.0,
      "step": 148
    },
    {
      "epoch": 0.35645933014354064,
      "grad_norm": 0.5481932300046403,
      "learning_rate": 7.75e-06,
      "loss": 1.2044,
      "num_tokens": 30723713.0,
      "step": 149
    },
    {
      "epoch": 0.3588516746411483,
      "grad_norm": 0.6651847229876482,
      "learning_rate": 7.719702621337834e-06,
      "loss": 1.0119,
      "num_tokens": 30898218.0,
      "step": 150
    },
    {
      "epoch": 0.361244019138756,
      "grad_norm": 0.46633215220880386,
      "learning_rate": 7.68927168085942e-06,
      "loss": 1.1705,
      "num_tokens": 31126739.0,
      "step": 151
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.5876480626961219,
      "learning_rate": 7.658709009626109e-06,
      "loss": 0.9351,
      "num_tokens": 31301729.0,
      "step": 152
    },
    {
      "epoch": 0.3660287081339713,
      "grad_norm": 0.49945896590659167,
      "learning_rate": 7.628016446625626e-06,
      "loss": 1.2641,
      "num_tokens": 31531161.0,
      "step": 153
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 0.5384303848101453,
      "learning_rate": 7.597195838661426e-06,
      "loss": 1.1977,
      "num_tokens": 31785635.0,
      "step": 154
    },
    {
      "epoch": 0.3708133971291866,
      "grad_norm": 0.6031598977170286,
      "learning_rate": 7.566249040241553e-06,
      "loss": 1.0982,
      "num_tokens": 32017995.0,
      "step": 155
    },
    {
      "epoch": 0.37320574162679426,
      "grad_norm": 0.5114284004215709,
      "learning_rate": 7.53517791346707e-06,
      "loss": 1.2633,
      "num_tokens": 32246103.0,
      "step": 156
    },
    {
      "epoch": 0.37559808612440193,
      "grad_norm": 0.511553264808467,
      "learning_rate": 7.503984327920003e-06,
      "loss": 1.1566,
      "num_tokens": 32461173.0,
      "step": 157
    },
    {
      "epoch": 0.37799043062200954,
      "grad_norm": 0.4861428494553005,
      "learning_rate": 7.472670160550849e-06,
      "loss": 1.2219,
      "num_tokens": 32710394.0,
      "step": 158
    },
    {
      "epoch": 0.3803827751196172,
      "grad_norm": 0.591981436959529,
      "learning_rate": 7.441237295565642e-06,
      "loss": 1.275,
      "num_tokens": 32910997.0,
      "step": 159
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 0.5171815810924354,
      "learning_rate": 7.409687624312569e-06,
      "loss": 1.2906,
      "num_tokens": 33191166.0,
      "step": 160
    },
    {
      "epoch": 0.38516746411483255,
      "grad_norm": 0.6093674065623558,
      "learning_rate": 7.378023045168181e-06,
      "loss": 1.1703,
      "num_tokens": 33380845.0,
      "step": 161
    },
    {
      "epoch": 0.3875598086124402,
      "grad_norm": 0.5521223923681069,
      "learning_rate": 7.346245463423148e-06,
      "loss": 1.1532,
      "num_tokens": 33553617.0,
      "step": 162
    },
    {
      "epoch": 0.38995215311004783,
      "grad_norm": 0.5177157946810159,
      "learning_rate": 7.314356791167626e-06,
      "loss": 1.1612,
      "num_tokens": 33785498.0,
      "step": 163
    },
    {
      "epoch": 0.3923444976076555,
      "grad_norm": 0.5060522779515988,
      "learning_rate": 7.282358947176207e-06,
      "loss": 1.3366,
      "num_tokens": 34019728.0,
      "step": 164
    },
    {
      "epoch": 0.39473684210526316,
      "grad_norm": 0.5610143836266379,
      "learning_rate": 7.250253856792452e-06,
      "loss": 1.2572,
      "num_tokens": 34236289.0,
      "step": 165
    },
    {
      "epoch": 0.39712918660287083,
      "grad_norm": 0.5606343028811931,
      "learning_rate": 7.218043451813058e-06,
      "loss": 1.0956,
      "num_tokens": 34415700.0,
      "step": 166
    },
    {
      "epoch": 0.39952153110047844,
      "grad_norm": 0.5775794108416966,
      "learning_rate": 7.185729670371605e-06,
      "loss": 1.015,
      "num_tokens": 34605985.0,
      "step": 167
    },
    {
      "epoch": 0.4019138755980861,
      "grad_norm": 0.6312411170295402,
      "learning_rate": 7.153314456821942e-06,
      "loss": 0.922,
      "num_tokens": 34748670.0,
      "step": 168
    },
    {
      "epoch": 0.4043062200956938,
      "grad_norm": 0.5132788880980301,
      "learning_rate": 7.120799761621198e-06,
      "loss": 1.2394,
      "num_tokens": 34976413.0,
      "step": 169
    },
    {
      "epoch": 0.40669856459330145,
      "grad_norm": 0.5618840133734496,
      "learning_rate": 7.08818754121241e-06,
      "loss": 1.0443,
      "num_tokens": 35182351.0,
      "step": 170
    },
    {
      "epoch": 0.4090909090909091,
      "grad_norm": 0.5771799652861468,
      "learning_rate": 7.0554797579068155e-06,
      "loss": 1.0114,
      "num_tokens": 35384554.0,
      "step": 171
    },
    {
      "epoch": 0.41148325358851673,
      "grad_norm": 0.4649122455940863,
      "learning_rate": 7.022678379765766e-06,
      "loss": 1.2349,
      "num_tokens": 35658712.0,
      "step": 172
    },
    {
      "epoch": 0.4138755980861244,
      "grad_norm": 0.57386723032485,
      "learning_rate": 6.989785380482313e-06,
      "loss": 1.0024,
      "num_tokens": 35853348.0,
      "step": 173
    },
    {
      "epoch": 0.41626794258373206,
      "grad_norm": 0.5785841074184913,
      "learning_rate": 6.956802739262446e-06,
      "loss": 1.1307,
      "num_tokens": 36048889.0,
      "step": 174
    },
    {
      "epoch": 0.41866028708133973,
      "grad_norm": 0.5209762559962196,
      "learning_rate": 6.923732440706005e-06,
      "loss": 1.032,
      "num_tokens": 36250421.0,
      "step": 175
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.49999578979845366,
      "learning_rate": 6.890576474687264e-06,
      "loss": 1.3027,
      "num_tokens": 36467223.0,
      "step": 176
    },
    {
      "epoch": 0.423444976076555,
      "grad_norm": 0.44607951021905534,
      "learning_rate": 6.857336836235195e-06,
      "loss": 1.2908,
      "num_tokens": 36786228.0,
      "step": 177
    },
    {
      "epoch": 0.4258373205741627,
      "grad_norm": 0.5405149465909439,
      "learning_rate": 6.824015525413428e-06,
      "loss": 1.2206,
      "num_tokens": 36987436.0,
      "step": 178
    },
    {
      "epoch": 0.42822966507177035,
      "grad_norm": 0.5101094166751247,
      "learning_rate": 6.790614547199908e-06,
      "loss": 1.3338,
      "num_tokens": 37173969.0,
      "step": 179
    },
    {
      "epoch": 0.430622009569378,
      "grad_norm": 0.5018404262587114,
      "learning_rate": 6.7571359113662405e-06,
      "loss": 0.9635,
      "num_tokens": 37430838.0,
      "step": 180
    },
    {
      "epoch": 0.43301435406698563,
      "grad_norm": 0.5186179578093245,
      "learning_rate": 6.723581632356783e-06,
      "loss": 1.1317,
      "num_tokens": 37614321.0,
      "step": 181
    },
    {
      "epoch": 0.4354066985645933,
      "grad_norm": 0.5092089036024817,
      "learning_rate": 6.689953729167411e-06,
      "loss": 1.1989,
      "num_tokens": 37828436.0,
      "step": 182
    },
    {
      "epoch": 0.43779904306220097,
      "grad_norm": 0.5779182575588276,
      "learning_rate": 6.65625422522405e-06,
      "loss": 1.0699,
      "num_tokens": 37994173.0,
      "step": 183
    },
    {
      "epoch": 0.44019138755980863,
      "grad_norm": 0.5213748156719571,
      "learning_rate": 6.622485148260916e-06,
      "loss": 1.142,
      "num_tokens": 38226513.0,
      "step": 184
    },
    {
      "epoch": 0.44258373205741625,
      "grad_norm": 0.5124918281868935,
      "learning_rate": 6.588648530198505e-06,
      "loss": 1.0789,
      "num_tokens": 38424535.0,
      "step": 185
    },
    {
      "epoch": 0.4449760765550239,
      "grad_norm": 0.4965284532552029,
      "learning_rate": 6.554746407021332e-06,
      "loss": 1.2216,
      "num_tokens": 38662320.0,
      "step": 186
    },
    {
      "epoch": 0.4473684210526316,
      "grad_norm": 0.5776130784552208,
      "learning_rate": 6.520780818655421e-06,
      "loss": 1.2425,
      "num_tokens": 38852666.0,
      "step": 187
    },
    {
      "epoch": 0.44976076555023925,
      "grad_norm": 0.5433597025027418,
      "learning_rate": 6.486753808845565e-06,
      "loss": 1.1762,
      "num_tokens": 39020645.0,
      "step": 188
    },
    {
      "epoch": 0.45215311004784686,
      "grad_norm": 0.5851211313289845,
      "learning_rate": 6.45266742503235e-06,
      "loss": 1.1301,
      "num_tokens": 39229647.0,
      "step": 189
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.5580553839960908,
      "learning_rate": 6.418523718228952e-06,
      "loss": 1.1287,
      "num_tokens": 39423404.0,
      "step": 190
    },
    {
      "epoch": 0.4569377990430622,
      "grad_norm": 0.5702668222438311,
      "learning_rate": 6.3843247428977365e-06,
      "loss": 1.1402,
      "num_tokens": 39603933.0,
      "step": 191
    },
    {
      "epoch": 0.45933014354066987,
      "grad_norm": 0.5524617168766218,
      "learning_rate": 6.350072556826632e-06,
      "loss": 1.0908,
      "num_tokens": 39799631.0,
      "step": 192
    },
    {
      "epoch": 0.46172248803827753,
      "grad_norm": 0.5054083920538464,
      "learning_rate": 6.315769221005313e-06,
      "loss": 1.1696,
      "num_tokens": 40042491.0,
      "step": 193
    },
    {
      "epoch": 0.46411483253588515,
      "grad_norm": 0.4984596483043875,
      "learning_rate": 6.281416799501188e-06,
      "loss": 0.9211,
      "num_tokens": 40228565.0,
      "step": 194
    },
    {
      "epoch": 0.4665071770334928,
      "grad_norm": 0.5341608488908804,
      "learning_rate": 6.247017359335199e-06,
      "loss": 1.2083,
      "num_tokens": 40410247.0,
      "step": 195
    },
    {
      "epoch": 0.4688995215311005,
      "grad_norm": 0.5046486493573384,
      "learning_rate": 6.2125729703574534e-06,
      "loss": 1.2149,
      "num_tokens": 40651771.0,
      "step": 196
    },
    {
      "epoch": 0.47129186602870815,
      "grad_norm": 0.6097314899954371,
      "learning_rate": 6.178085705122675e-06,
      "loss": 1.0858,
      "num_tokens": 40855435.0,
      "step": 197
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 0.5774665348623625,
      "learning_rate": 6.143557638765494e-06,
      "loss": 1.122,
      "num_tokens": 41030495.0,
      "step": 198
    },
    {
      "epoch": 0.47607655502392343,
      "grad_norm": 0.48860350341505726,
      "learning_rate": 6.108990848875591e-06,
      "loss": 1.3412,
      "num_tokens": 41277045.0,
      "step": 199
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 0.5361962907700251,
      "learning_rate": 6.074387415372677e-06,
      "loss": 1.0927,
      "num_tokens": 41500279.0,
      "step": 200
    },
    {
      "epoch": 0.48086124401913877,
      "grad_norm": 0.6039231448287091,
      "learning_rate": 6.039749420381349e-06,
      "loss": 1.1362,
      "num_tokens": 41677455.0,
      "step": 201
    },
    {
      "epoch": 0.48325358851674644,
      "grad_norm": 0.5131741268531921,
      "learning_rate": 6.005078948105808e-06,
      "loss": 1.2406,
      "num_tokens": 41894065.0,
      "step": 202
    },
    {
      "epoch": 0.48564593301435405,
      "grad_norm": 0.47724842296291775,
      "learning_rate": 5.970378084704441e-06,
      "loss": 1.0304,
      "num_tokens": 42128139.0,
      "step": 203
    },
    {
      "epoch": 0.4880382775119617,
      "grad_norm": 0.5240356233196276,
      "learning_rate": 5.935648918164308e-06,
      "loss": 1.0814,
      "num_tokens": 42333521.0,
      "step": 204
    },
    {
      "epoch": 0.4904306220095694,
      "grad_norm": 0.5251041508662586,
      "learning_rate": 5.90089353817549e-06,
      "loss": 1.1679,
      "num_tokens": 42533301.0,
      "step": 205
    },
    {
      "epoch": 0.49282296650717705,
      "grad_norm": 0.6532050533136743,
      "learning_rate": 5.866114036005363e-06,
      "loss": 0.9818,
      "num_tokens": 42694701.0,
      "step": 206
    },
    {
      "epoch": 0.49521531100478466,
      "grad_norm": 0.6836388656935797,
      "learning_rate": 5.831312504372762e-06,
      "loss": 1.0012,
      "num_tokens": 42809151.0,
      "step": 207
    },
    {
      "epoch": 0.49760765550239233,
      "grad_norm": 0.5030489700232146,
      "learning_rate": 5.796491037322054e-06,
      "loss": 1.1244,
      "num_tokens": 43035639.0,
      "step": 208
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5562880150972886,
      "learning_rate": 5.761651730097142e-06,
      "loss": 1.3298,
      "num_tokens": 43207069.0,
      "step": 209
    },
    {
      "epoch": 0.5023923444976076,
      "grad_norm": 0.5324885775750403,
      "learning_rate": 5.726796679015392e-06,
      "loss": 1.3305,
      "num_tokens": 43475398.0,
      "step": 210
    },
    {
      "epoch": 0.5047846889952153,
      "grad_norm": 0.6085427119073632,
      "learning_rate": 5.691927981341488e-06,
      "loss": 1.0097,
      "num_tokens": 43641183.0,
      "step": 211
    },
    {
      "epoch": 0.507177033492823,
      "grad_norm": 0.6541524113634078,
      "learning_rate": 5.657047735161256e-06,
      "loss": 0.7888,
      "num_tokens": 43820730.0,
      "step": 212
    },
    {
      "epoch": 0.5095693779904307,
      "grad_norm": 0.5724267971985464,
      "learning_rate": 5.622158039255394e-06,
      "loss": 1.1429,
      "num_tokens": 44013162.0,
      "step": 213
    },
    {
      "epoch": 0.5119617224880383,
      "grad_norm": 0.4888491874482519,
      "learning_rate": 5.58726099297321e-06,
      "loss": 1.0386,
      "num_tokens": 44259910.0,
      "step": 214
    },
    {
      "epoch": 0.5143540669856459,
      "grad_norm": 0.5678338260313958,
      "learning_rate": 5.552358696106288e-06,
      "loss": 1.175,
      "num_tokens": 44480685.0,
      "step": 215
    },
    {
      "epoch": 0.5167464114832536,
      "grad_norm": 0.5262339117176533,
      "learning_rate": 5.517453248762142e-06,
      "loss": 1.233,
      "num_tokens": 44690652.0,
      "step": 216
    },
    {
      "epoch": 0.5191387559808612,
      "grad_norm": 0.5686946242510297,
      "learning_rate": 5.482546751237859e-06,
      "loss": 0.9377,
      "num_tokens": 44905510.0,
      "step": 217
    },
    {
      "epoch": 0.5215311004784688,
      "grad_norm": 0.5096154075649568,
      "learning_rate": 5.447641303893715e-06,
      "loss": 0.9606,
      "num_tokens": 45121618.0,
      "step": 218
    },
    {
      "epoch": 0.5239234449760766,
      "grad_norm": 0.5027532121976238,
      "learning_rate": 5.412739007026791e-06,
      "loss": 1.3208,
      "num_tokens": 45328957.0,
      "step": 219
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.5955398795356434,
      "learning_rate": 5.377841960744607e-06,
      "loss": 1.0519,
      "num_tokens": 45470498.0,
      "step": 220
    },
    {
      "epoch": 0.5287081339712919,
      "grad_norm": 0.5632402633040062,
      "learning_rate": 5.342952264838748e-06,
      "loss": 1.0009,
      "num_tokens": 45690586.0,
      "step": 221
    },
    {
      "epoch": 0.5311004784688995,
      "grad_norm": 0.5530392228322656,
      "learning_rate": 5.308072018658512e-06,
      "loss": 1.0197,
      "num_tokens": 45915829.0,
      "step": 222
    },
    {
      "epoch": 0.5334928229665071,
      "grad_norm": 0.5560740776916706,
      "learning_rate": 5.273203320984611e-06,
      "loss": 1.0086,
      "num_tokens": 46125336.0,
      "step": 223
    },
    {
      "epoch": 0.5358851674641149,
      "grad_norm": 0.47936312873685966,
      "learning_rate": 5.23834826990286e-06,
      "loss": 1.2004,
      "num_tokens": 46386175.0,
      "step": 224
    },
    {
      "epoch": 0.5382775119617225,
      "grad_norm": 0.5451628089579803,
      "learning_rate": 5.203508962677947e-06,
      "loss": 1.1559,
      "num_tokens": 46618828.0,
      "step": 225
    },
    {
      "epoch": 0.5406698564593302,
      "grad_norm": 0.5352825379096331,
      "learning_rate": 5.168687495627239e-06,
      "loss": 1.1977,
      "num_tokens": 46873878.0,
      "step": 226
    },
    {
      "epoch": 0.5430622009569378,
      "grad_norm": 0.5328607455361074,
      "learning_rate": 5.1338859639946396e-06,
      "loss": 1.0719,
      "num_tokens": 47110612.0,
      "step": 227
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 0.5355655159606746,
      "learning_rate": 5.099106461824513e-06,
      "loss": 1.1536,
      "num_tokens": 47297604.0,
      "step": 228
    },
    {
      "epoch": 0.5478468899521531,
      "grad_norm": 0.652585538954601,
      "learning_rate": 5.064351081835695e-06,
      "loss": 1.1744,
      "num_tokens": 47508300.0,
      "step": 229
    },
    {
      "epoch": 0.5502392344497608,
      "grad_norm": 0.5726602885947132,
      "learning_rate": 5.02962191529556e-06,
      "loss": 0.9178,
      "num_tokens": 47674186.0,
      "step": 230
    },
    {
      "epoch": 0.5526315789473685,
      "grad_norm": 0.5227349746690181,
      "learning_rate": 4.9949210518941945e-06,
      "loss": 1.0537,
      "num_tokens": 47869064.0,
      "step": 231
    },
    {
      "epoch": 0.5550239234449761,
      "grad_norm": 0.5270482777761917,
      "learning_rate": 4.960250579618652e-06,
      "loss": 1.1318,
      "num_tokens": 48073543.0,
      "step": 232
    },
    {
      "epoch": 0.5574162679425837,
      "grad_norm": 0.5628820736414913,
      "learning_rate": 4.925612584627325e-06,
      "loss": 1.0542,
      "num_tokens": 48249518.0,
      "step": 233
    },
    {
      "epoch": 0.5598086124401914,
      "grad_norm": 0.5460319678028444,
      "learning_rate": 4.8910091511244115e-06,
      "loss": 1.0131,
      "num_tokens": 48471001.0,
      "step": 234
    },
    {
      "epoch": 0.562200956937799,
      "grad_norm": 0.5503254822986171,
      "learning_rate": 4.856442361234507e-06,
      "loss": 1.0773,
      "num_tokens": 48720980.0,
      "step": 235
    },
    {
      "epoch": 0.5645933014354066,
      "grad_norm": 0.5091545126296911,
      "learning_rate": 4.821914294877327e-06,
      "loss": 1.1478,
      "num_tokens": 48922782.0,
      "step": 236
    },
    {
      "epoch": 0.5669856459330144,
      "grad_norm": 0.5074108889085012,
      "learning_rate": 4.787427029642549e-06,
      "loss": 1.2534,
      "num_tokens": 49149522.0,
      "step": 237
    },
    {
      "epoch": 0.569377990430622,
      "grad_norm": 0.5849957930987398,
      "learning_rate": 4.752982640664804e-06,
      "loss": 1.0202,
      "num_tokens": 49321177.0,
      "step": 238
    },
    {
      "epoch": 0.5717703349282297,
      "grad_norm": 0.5347992211342384,
      "learning_rate": 4.718583200498814e-06,
      "loss": 1.2032,
      "num_tokens": 49544634.0,
      "step": 239
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 0.5280959102930131,
      "learning_rate": 4.684230778994688e-06,
      "loss": 1.1751,
      "num_tokens": 49724091.0,
      "step": 240
    },
    {
      "epoch": 0.5765550239234449,
      "grad_norm": 0.5164476203177735,
      "learning_rate": 4.64992744317337e-06,
      "loss": 1.1098,
      "num_tokens": 49929099.0,
      "step": 241
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 0.6667023806983443,
      "learning_rate": 4.615675257102265e-06,
      "loss": 0.9402,
      "num_tokens": 50081941.0,
      "step": 242
    },
    {
      "epoch": 0.5813397129186603,
      "grad_norm": 0.5023784414967131,
      "learning_rate": 4.58147628177105e-06,
      "loss": 1.01,
      "num_tokens": 50306579.0,
      "step": 243
    },
    {
      "epoch": 0.583732057416268,
      "grad_norm": 0.5370878293075974,
      "learning_rate": 4.547332574967653e-06,
      "loss": 1.079,
      "num_tokens": 50544895.0,
      "step": 244
    },
    {
      "epoch": 0.5861244019138756,
      "grad_norm": 0.5090426584844939,
      "learning_rate": 4.513246191154434e-06,
      "loss": 1.1825,
      "num_tokens": 50788203.0,
      "step": 245
    },
    {
      "epoch": 0.5885167464114832,
      "grad_norm": 0.4792828066902539,
      "learning_rate": 4.479219181344579e-06,
      "loss": 1.2301,
      "num_tokens": 51053982.0,
      "step": 246
    },
    {
      "epoch": 0.5909090909090909,
      "grad_norm": 0.49219719144165075,
      "learning_rate": 4.44525359297867e-06,
      "loss": 1.1711,
      "num_tokens": 51259911.0,
      "step": 247
    },
    {
      "epoch": 0.5933014354066986,
      "grad_norm": 0.5340406735561365,
      "learning_rate": 4.4113514698014955e-06,
      "loss": 1.1956,
      "num_tokens": 51473886.0,
      "step": 248
    },
    {
      "epoch": 0.5956937799043063,
      "grad_norm": 0.5702889032524951,
      "learning_rate": 4.377514851739085e-06,
      "loss": 1.1091,
      "num_tokens": 51735586.0,
      "step": 249
    },
    {
      "epoch": 0.5980861244019139,
      "grad_norm": 0.5115029340630267,
      "learning_rate": 4.3437457747759515e-06,
      "loss": 1.1343,
      "num_tokens": 51923001.0,
      "step": 250
    },
    {
      "epoch": 0.6004784688995215,
      "grad_norm": 0.4738251807559482,
      "learning_rate": 4.310046270832592e-06,
      "loss": 1.07,
      "num_tokens": 52167211.0,
      "step": 251
    },
    {
      "epoch": 0.6028708133971292,
      "grad_norm": 0.562569354089248,
      "learning_rate": 4.276418367643218e-06,
      "loss": 0.9359,
      "num_tokens": 52345300.0,
      "step": 252
    },
    {
      "epoch": 0.6052631578947368,
      "grad_norm": 0.6492878859321651,
      "learning_rate": 4.242864088633762e-06,
      "loss": 0.8908,
      "num_tokens": 52537210.0,
      "step": 253
    },
    {
      "epoch": 0.6076555023923444,
      "grad_norm": 0.6078233345214087,
      "learning_rate": 4.2093854528000955e-06,
      "loss": 0.8913,
      "num_tokens": 52695428.0,
      "step": 254
    },
    {
      "epoch": 0.6100478468899522,
      "grad_norm": 0.5115019352055596,
      "learning_rate": 4.175984474586572e-06,
      "loss": 1.0335,
      "num_tokens": 52945131.0,
      "step": 255
    },
    {
      "epoch": 0.6124401913875598,
      "grad_norm": 0.5875660189403787,
      "learning_rate": 4.142663163764806e-06,
      "loss": 0.941,
      "num_tokens": 53101160.0,
      "step": 256
    },
    {
      "epoch": 0.6148325358851675,
      "grad_norm": 0.5230885907461125,
      "learning_rate": 4.109423525312738e-06,
      "loss": 1.1472,
      "num_tokens": 53341330.0,
      "step": 257
    },
    {
      "epoch": 0.6172248803827751,
      "grad_norm": 0.676100542426314,
      "learning_rate": 4.076267559293996e-06,
      "loss": 0.9226,
      "num_tokens": 53477820.0,
      "step": 258
    },
    {
      "epoch": 0.6196172248803827,
      "grad_norm": 0.6027764896908601,
      "learning_rate": 4.043197260737556e-06,
      "loss": 1.1615,
      "num_tokens": 53655177.0,
      "step": 259
    },
    {
      "epoch": 0.6220095693779905,
      "grad_norm": 0.5114599101755669,
      "learning_rate": 4.0102146195176895e-06,
      "loss": 1.0848,
      "num_tokens": 53871093.0,
      "step": 260
    },
    {
      "epoch": 0.6244019138755981,
      "grad_norm": 0.5616877393452973,
      "learning_rate": 3.977321620234236e-06,
      "loss": 1.1293,
      "num_tokens": 54051884.0,
      "step": 261
    },
    {
      "epoch": 0.6267942583732058,
      "grad_norm": 0.5951828000342995,
      "learning_rate": 3.944520242093186e-06,
      "loss": 1.1116,
      "num_tokens": 54243302.0,
      "step": 262
    },
    {
      "epoch": 0.6291866028708134,
      "grad_norm": 0.5533241097093147,
      "learning_rate": 3.911812458787592e-06,
      "loss": 1.0339,
      "num_tokens": 54449587.0,
      "step": 263
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.6391714671501187,
      "learning_rate": 3.8792002383788044e-06,
      "loss": 1.0188,
      "num_tokens": 54573282.0,
      "step": 264
    },
    {
      "epoch": 0.6339712918660287,
      "grad_norm": 0.48381850337769244,
      "learning_rate": 3.846685543178058e-06,
      "loss": 1.2549,
      "num_tokens": 54826368.0,
      "step": 265
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 0.49990948075130837,
      "learning_rate": 3.8142703296283954e-06,
      "loss": 1.1331,
      "num_tokens": 55080391.0,
      "step": 266
    },
    {
      "epoch": 0.638755980861244,
      "grad_norm": 0.5427808072503959,
      "learning_rate": 3.7819565481869426e-06,
      "loss": 1.1618,
      "num_tokens": 55285642.0,
      "step": 267
    },
    {
      "epoch": 0.6411483253588517,
      "grad_norm": 0.5747721632491769,
      "learning_rate": 3.7497461432075477e-06,
      "loss": 1.1053,
      "num_tokens": 55481520.0,
      "step": 268
    },
    {
      "epoch": 0.6435406698564593,
      "grad_norm": 0.5301204962544379,
      "learning_rate": 3.717641052823795e-06,
      "loss": 1.1108,
      "num_tokens": 55706780.0,
      "step": 269
    },
    {
      "epoch": 0.645933014354067,
      "grad_norm": 0.5775776454615925,
      "learning_rate": 3.6856432088323746e-06,
      "loss": 1.1119,
      "num_tokens": 55902431.0,
      "step": 270
    },
    {
      "epoch": 0.6483253588516746,
      "grad_norm": 0.5001600002488803,
      "learning_rate": 3.6537545365768543e-06,
      "loss": 0.9535,
      "num_tokens": 56104220.0,
      "step": 271
    },
    {
      "epoch": 0.6507177033492823,
      "grad_norm": 0.5699808255124916,
      "learning_rate": 3.6219769548318205e-06,
      "loss": 1.0524,
      "num_tokens": 56257950.0,
      "step": 272
    },
    {
      "epoch": 0.65311004784689,
      "grad_norm": 0.5003276838892392,
      "learning_rate": 3.5903123756874315e-06,
      "loss": 1.1485,
      "num_tokens": 56488654.0,
      "step": 273
    },
    {
      "epoch": 0.6555023923444976,
      "grad_norm": 0.6033119191336221,
      "learning_rate": 3.558762704434361e-06,
      "loss": 1.024,
      "num_tokens": 56686270.0,
      "step": 274
    },
    {
      "epoch": 0.6578947368421053,
      "grad_norm": 0.4693280395015428,
      "learning_rate": 3.527329839449152e-06,
      "loss": 1.136,
      "num_tokens": 56931317.0,
      "step": 275
    },
    {
      "epoch": 0.6602870813397129,
      "grad_norm": 0.5278398302464965,
      "learning_rate": 3.496015672079998e-06,
      "loss": 1.1571,
      "num_tokens": 57127263.0,
      "step": 276
    },
    {
      "epoch": 0.6626794258373205,
      "grad_norm": 0.49190545922349904,
      "learning_rate": 3.4648220865329312e-06,
      "loss": 1.0427,
      "num_tokens": 57354122.0,
      "step": 277
    },
    {
      "epoch": 0.6650717703349283,
      "grad_norm": 0.4934205228618601,
      "learning_rate": 3.4337509597584466e-06,
      "loss": 1.2705,
      "num_tokens": 57579975.0,
      "step": 278
    },
    {
      "epoch": 0.6674641148325359,
      "grad_norm": 0.6046200272271364,
      "learning_rate": 3.402804161338577e-06,
      "loss": 0.9143,
      "num_tokens": 57767139.0,
      "step": 279
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 0.5256841221145759,
      "learning_rate": 3.371983553374375e-06,
      "loss": 1.0864,
      "num_tokens": 57969542.0,
      "step": 280
    },
    {
      "epoch": 0.6722488038277512,
      "grad_norm": 0.5879727234811725,
      "learning_rate": 3.3412909903738937e-06,
      "loss": 0.9625,
      "num_tokens": 58145028.0,
      "step": 281
    },
    {
      "epoch": 0.6746411483253588,
      "grad_norm": 0.6263377798428889,
      "learning_rate": 3.310728319140581e-06,
      "loss": 0.9234,
      "num_tokens": 58312705.0,
      "step": 282
    },
    {
      "epoch": 0.6770334928229665,
      "grad_norm": 0.5407307381090947,
      "learning_rate": 3.2802973786621665e-06,
      "loss": 1.0687,
      "num_tokens": 58527623.0,
      "step": 283
    },
    {
      "epoch": 0.6794258373205742,
      "grad_norm": 0.5502001614125057,
      "learning_rate": 3.2500000000000015e-06,
      "loss": 1.1427,
      "num_tokens": 58772116.0,
      "step": 284
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.5686855066649326,
      "learning_rate": 3.2198380061788803e-06,
      "loss": 1.031,
      "num_tokens": 58948693.0,
      "step": 285
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 0.56427208726594,
      "learning_rate": 3.1898132120773566e-06,
      "loss": 1.0001,
      "num_tokens": 59160106.0,
      "step": 286
    },
    {
      "epoch": 0.6866028708133971,
      "grad_norm": 0.5178015680501699,
      "learning_rate": 3.1599274243185314e-06,
      "loss": 1.2459,
      "num_tokens": 59393828.0,
      "step": 287
    },
    {
      "epoch": 0.6889952153110048,
      "grad_norm": 0.6161696867803992,
      "learning_rate": 3.1301824411613473e-06,
      "loss": 1.077,
      "num_tokens": 59592707.0,
      "step": 288
    },
| { | |
| "epoch": 0.6913875598086124, | |
| "grad_norm": 0.49780237640470854, | |
| "learning_rate": 3.1005800523923906e-06, | |
| "loss": 1.1431, | |
| "num_tokens": 59812582.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.69377990430622, | |
| "grad_norm": 0.5031207474545651, | |
| "learning_rate": 3.071122039218194e-06, | |
| "loss": 1.1467, | |
| "num_tokens": 60043641.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6961722488038278, | |
| "grad_norm": 0.574254924525526, | |
| "learning_rate": 3.0418101741580586e-06, | |
| "loss": 1.1918, | |
| "num_tokens": 60234442.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.6985645933014354, | |
| "grad_norm": 0.5016769304104969, | |
| "learning_rate": 3.012646220937403e-06, | |
| "loss": 1.31, | |
| "num_tokens": 60456123.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.7009569377990431, | |
| "grad_norm": 0.5058935049560537, | |
| "learning_rate": 2.98363193438164e-06, | |
| "loss": 0.9371, | |
| "num_tokens": 60672710.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.7033492822966507, | |
| "grad_norm": 0.5351125304814696, | |
| "learning_rate": 2.9547690603105774e-06, | |
| "loss": 1.0698, | |
| "num_tokens": 60894772.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.7057416267942583, | |
| "grad_norm": 0.5128628418090031, | |
| "learning_rate": 2.926059335433378e-06, | |
| "loss": 1.2298, | |
| "num_tokens": 61142587.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.7081339712918661, | |
| "grad_norm": 0.5144613524379172, | |
| "learning_rate": 2.897504487244061e-06, | |
| "loss": 0.9337, | |
| "num_tokens": 61352129.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.7105263157894737, | |
| "grad_norm": 0.5861410143772018, | |
| "learning_rate": 2.8691062339175512e-06, | |
| "loss": 0.9923, | |
| "num_tokens": 61498549.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.7129186602870813, | |
| "grad_norm": 0.5481256980886055, | |
| "learning_rate": 2.8408662842063002e-06, | |
| "loss": 1.0957, | |
| "num_tokens": 61687826.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.715311004784689, | |
| "grad_norm": 0.5582805882931381, | |
| "learning_rate": 2.8127863373374637e-06, | |
| "loss": 1.09, | |
| "num_tokens": 61877628.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.7177033492822966, | |
| "grad_norm": 0.5983921444578938, | |
| "learning_rate": 2.7848680829106602e-06, | |
| "loss": 1.0968, | |
| "num_tokens": 62078858.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7200956937799043, | |
| "grad_norm": 0.5339997006585953, | |
| "learning_rate": 2.7571132007963074e-06, | |
| "loss": 1.1891, | |
| "num_tokens": 62265457.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.722488038277512, | |
| "grad_norm": 0.5449456499746453, | |
| "learning_rate": 2.7295233610345384e-06, | |
| "loss": 1.0269, | |
| "num_tokens": 62488733.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.7248803827751196, | |
| "grad_norm": 0.5699604526936535, | |
| "learning_rate": 2.7021002237347206e-06, | |
| "loss": 1.1336, | |
| "num_tokens": 62714416.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 0.6413790402904914, | |
| "learning_rate": 2.6748454389755576e-06, | |
| "loss": 0.9382, | |
| "num_tokens": 62890365.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.7296650717703349, | |
| "grad_norm": 0.5390387726292147, | |
| "learning_rate": 2.647760646705804e-06, | |
| "loss": 1.0829, | |
| "num_tokens": 63120765.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.7320574162679426, | |
| "grad_norm": 0.5984653976738545, | |
| "learning_rate": 2.620847476645594e-06, | |
| "loss": 0.9221, | |
| "num_tokens": 63320228.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.7344497607655502, | |
| "grad_norm": 0.5801251118440074, | |
| "learning_rate": 2.5941075481883705e-06, | |
| "loss": 1.1212, | |
| "num_tokens": 63509873.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 0.5636489099209283, | |
| "learning_rate": 2.567542470303452e-06, | |
| "loss": 1.078, | |
| "num_tokens": 63745029.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.7392344497607656, | |
| "grad_norm": 0.48725639119647585, | |
| "learning_rate": 2.5411538414392146e-06, | |
| "loss": 1.2125, | |
| "num_tokens": 63953310.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.7416267942583732, | |
| "grad_norm": 0.549253240822144, | |
| "learning_rate": 2.5149432494269134e-06, | |
| "loss": 1.1192, | |
| "num_tokens": 64147381.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7440191387559809, | |
| "grad_norm": 0.5491580770023559, | |
| "learning_rate": 2.4889122713851397e-06, | |
| "loss": 0.9919, | |
| "num_tokens": 64340436.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.7464114832535885, | |
| "grad_norm": 0.5164385106756677, | |
| "learning_rate": 2.463062473624927e-06, | |
| "loss": 1.0476, | |
| "num_tokens": 64568538.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.7488038277511961, | |
| "grad_norm": 0.583840880433391, | |
| "learning_rate": 2.437395411555504e-06, | |
| "loss": 1.1016, | |
| "num_tokens": 64759586.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.7511961722488039, | |
| "grad_norm": 0.5528719370540063, | |
| "learning_rate": 2.4119126295906997e-06, | |
| "loss": 1.1974, | |
| "num_tokens": 64942864.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.7535885167464115, | |
| "grad_norm": 0.6028168080715274, | |
| "learning_rate": 2.3866156610560186e-06, | |
| "loss": 1.0019, | |
| "num_tokens": 65142788.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.7559808612440191, | |
| "grad_norm": 0.5816986940686796, | |
| "learning_rate": 2.3615060280963797e-06, | |
| "loss": 1.2118, | |
| "num_tokens": 65362360.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.7583732057416268, | |
| "grad_norm": 0.5809244671898545, | |
| "learning_rate": 2.3365852415845225e-06, | |
| "loss": 1.1267, | |
| "num_tokens": 65547922.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.7607655502392344, | |
| "grad_norm": 0.5262370165475527, | |
| "learning_rate": 2.3118548010301015e-06, | |
| "loss": 1.1893, | |
| "num_tokens": 65731553.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.7631578947368421, | |
| "grad_norm": 0.5357040610680347, | |
| "learning_rate": 2.2873161944894552e-06, | |
| "loss": 1.1869, | |
| "num_tokens": 65951250.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.7655502392344498, | |
| "grad_norm": 0.5570433795031379, | |
| "learning_rate": 2.262970898476071e-06, | |
| "loss": 0.9916, | |
| "num_tokens": 66175000.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7679425837320574, | |
| "grad_norm": 0.604494546666767, | |
| "learning_rate": 2.2388203778717407e-06, | |
| "loss": 1.1347, | |
| "num_tokens": 66357517.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.7703349282296651, | |
| "grad_norm": 0.5827904281357608, | |
| "learning_rate": 2.2148660858384147e-06, | |
| "loss": 1.0356, | |
| "num_tokens": 66566078.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.7727272727272727, | |
| "grad_norm": 0.5218976553836495, | |
| "learning_rate": 2.1911094637307715e-06, | |
| "loss": 1.1124, | |
| "num_tokens": 66784937.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.7751196172248804, | |
| "grad_norm": 0.49417380874831474, | |
| "learning_rate": 2.1675519410094803e-06, | |
| "loss": 1.1203, | |
| "num_tokens": 67057361.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.777511961722488, | |
| "grad_norm": 0.6319926280044286, | |
| "learning_rate": 2.144194935155192e-06, | |
| "loss": 1.038, | |
| "num_tokens": 67276459.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7799043062200957, | |
| "grad_norm": 0.553450207558276, | |
| "learning_rate": 2.121039851583254e-06, | |
| "loss": 1.0843, | |
| "num_tokens": 67454638.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.7822966507177034, | |
| "grad_norm": 0.5159208111364086, | |
| "learning_rate": 2.098088083559135e-06, | |
| "loss": 0.9358, | |
| "num_tokens": 67667938.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.784688995215311, | |
| "grad_norm": 0.5059115925994171, | |
| "learning_rate": 2.0753410121145984e-06, | |
| "loss": 1.1579, | |
| "num_tokens": 67859669.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.7870813397129187, | |
| "grad_norm": 0.5613491350937895, | |
| "learning_rate": 2.0528000059646e-06, | |
| "loss": 1.0022, | |
| "num_tokens": 68056005.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 0.5374042116513947, | |
| "learning_rate": 2.0304664214249326e-06, | |
| "loss": 1.0718, | |
| "num_tokens": 68255467.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7918660287081339, | |
| "grad_norm": 0.503580387927313, | |
| "learning_rate": 2.0083416023306163e-06, | |
| "loss": 1.1493, | |
| "num_tokens": 68469900.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.7942583732057417, | |
| "grad_norm": 0.5884447457044938, | |
| "learning_rate": 1.986426879955034e-06, | |
| "loss": 0.9502, | |
| "num_tokens": 68685343.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.7966507177033493, | |
| "grad_norm": 0.6834427409407543, | |
| "learning_rate": 1.9647235729298346e-06, | |
| "loss": 0.9018, | |
| "num_tokens": 68834514.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.7990430622009569, | |
| "grad_norm": 0.5189288186456062, | |
| "learning_rate": 1.9432329871655837e-06, | |
| "loss": 1.2691, | |
| "num_tokens": 69046003.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.8014354066985646, | |
| "grad_norm": 0.516776960640009, | |
| "learning_rate": 1.9219564157731848e-06, | |
| "loss": 1.0057, | |
| "num_tokens": 69272731.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.8038277511961722, | |
| "grad_norm": 0.4831598833288486, | |
| "learning_rate": 1.9008951389860785e-06, | |
| "loss": 1.1143, | |
| "num_tokens": 69508303.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.80622009569378, | |
| "grad_norm": 0.5753229158728437, | |
| "learning_rate": 1.8800504240832012e-06, | |
| "loss": 1.1146, | |
| "num_tokens": 69706781.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.8086124401913876, | |
| "grad_norm": 0.5983941033127453, | |
| "learning_rate": 1.8594235253127373e-06, | |
| "loss": 1.1979, | |
| "num_tokens": 69926110.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.8110047846889952, | |
| "grad_norm": 0.5114846230853078, | |
| "learning_rate": 1.8390156838166464e-06, | |
| "loss": 1.016, | |
| "num_tokens": 70133509.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.8133971291866029, | |
| "grad_norm": 0.5260668256751079, | |
| "learning_rate": 1.8188281275559866e-06, | |
| "loss": 1.0266, | |
| "num_tokens": 70365768.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8157894736842105, | |
| "grad_norm": 0.5595038468322735, | |
| "learning_rate": 1.7988620712370197e-06, | |
| "loss": 1.1005, | |
| "num_tokens": 70548685.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.8181818181818182, | |
| "grad_norm": 0.6890712705743423, | |
| "learning_rate": 1.7791187162381325e-06, | |
| "loss": 1.0739, | |
| "num_tokens": 70725591.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.8205741626794258, | |
| "grad_norm": 0.5344037158436257, | |
| "learning_rate": 1.759599250537534e-06, | |
| "loss": 1.1548, | |
| "num_tokens": 70943507.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.8229665071770335, | |
| "grad_norm": 0.5589105656078766, | |
| "learning_rate": 1.740304848641787e-06, | |
| "loss": 1.0402, | |
| "num_tokens": 71137045.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.8253588516746412, | |
| "grad_norm": 0.5768929116638776, | |
| "learning_rate": 1.7212366715151263e-06, | |
| "loss": 0.9768, | |
| "num_tokens": 71350643.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.8277511961722488, | |
| "grad_norm": 0.6276817700534357, | |
| "learning_rate": 1.702395866509612e-06, | |
| "loss": 0.9183, | |
| "num_tokens": 71539784.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.8301435406698564, | |
| "grad_norm": 0.5484078243741392, | |
| "learning_rate": 1.6837835672960834e-06, | |
| "loss": 1.1514, | |
| "num_tokens": 71742614.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.8325358851674641, | |
| "grad_norm": 0.5193578245554346, | |
| "learning_rate": 1.6654008937959498e-06, | |
| "loss": 0.9674, | |
| "num_tokens": 71994797.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.8349282296650717, | |
| "grad_norm": 0.528358256622246, | |
| "learning_rate": 1.6472489521138016e-06, | |
| "loss": 1.108, | |
| "num_tokens": 72191401.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.8373205741626795, | |
| "grad_norm": 0.5611551275004363, | |
| "learning_rate": 1.629328834470857e-06, | |
| "loss": 1.1481, | |
| "num_tokens": 72346485.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8397129186602871, | |
| "grad_norm": 0.4671315072196002, | |
| "learning_rate": 1.611641619139238e-06, | |
| "loss": 1.1736, | |
| "num_tokens": 72601665.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.5555560185216512, | |
| "learning_rate": 1.5941883703770968e-06, | |
| "loss": 1.1533, | |
| "num_tokens": 72836095.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.8444976076555024, | |
| "grad_norm": 0.5288816745801785, | |
| "learning_rate": 1.57697013836457e-06, | |
| "loss": 1.0494, | |
| "num_tokens": 73049430.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.84688995215311, | |
| "grad_norm": 0.6233482042563366, | |
| "learning_rate": 1.5599879591405917e-06, | |
| "loss": 1.0147, | |
| "num_tokens": 73196007.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.8492822966507177, | |
| "grad_norm": 0.5363849538121136, | |
| "learning_rate": 1.5432428545405554e-06, | |
| "loss": 1.1694, | |
| "num_tokens": 73396469.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.8516746411483254, | |
| "grad_norm": 0.5932100916233094, | |
| "learning_rate": 1.526735832134829e-06, | |
| "loss": 1.0174, | |
| "num_tokens": 73584128.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.854066985645933, | |
| "grad_norm": 0.6127092810753643, | |
| "learning_rate": 1.5104678851681253e-06, | |
| "loss": 0.8168, | |
| "num_tokens": 73717071.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.8564593301435407, | |
| "grad_norm": 0.6293206669166083, | |
| "learning_rate": 1.4944399924997372e-06, | |
| "loss": 0.7752, | |
| "num_tokens": 73883367.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.8588516746411483, | |
| "grad_norm": 0.531317141972036, | |
| "learning_rate": 1.4786531185446455e-06, | |
| "loss": 1.1077, | |
| "num_tokens": 74123207.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.861244019138756, | |
| "grad_norm": 0.44768314533679704, | |
| "learning_rate": 1.4631082132154806e-06, | |
| "loss": 1.2024, | |
| "num_tokens": 74395731.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8636363636363636, | |
| "grad_norm": 0.4788316306745224, | |
| "learning_rate": 1.4478062118653703e-06, | |
| "loss": 1.1751, | |
| "num_tokens": 74663304.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.8660287081339713, | |
| "grad_norm": 0.4783192674308249, | |
| "learning_rate": 1.4327480352316581e-06, | |
| "loss": 1.1805, | |
| "num_tokens": 74907925.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.868421052631579, | |
| "grad_norm": 0.5707901460896949, | |
| "learning_rate": 1.417934589380498e-06, | |
| "loss": 1.0742, | |
| "num_tokens": 75130243.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.8708133971291866, | |
| "grad_norm": 0.6017414939136261, | |
| "learning_rate": 1.4033667656523405e-06, | |
| "loss": 0.9557, | |
| "num_tokens": 75352077.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.8732057416267942, | |
| "grad_norm": 0.4853066070350836, | |
| "learning_rate": 1.389045440608296e-06, | |
| "loss": 1.08, | |
| "num_tokens": 75592089.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8755980861244019, | |
| "grad_norm": 0.5253451321715548, | |
| "learning_rate": 1.374971475977394e-06, | |
| "loss": 1.2071, | |
| "num_tokens": 75818956.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.8779904306220095, | |
| "grad_norm": 0.5659204983119508, | |
| "learning_rate": 1.361145718604731e-06, | |
| "loss": 1.1936, | |
| "num_tokens": 76017603.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.8803827751196173, | |
| "grad_norm": 0.479841142759106, | |
| "learning_rate": 1.3475690004005098e-06, | |
| "loss": 1.191, | |
| "num_tokens": 76290864.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.8827751196172249, | |
| "grad_norm": 0.5872255230326239, | |
| "learning_rate": 1.3342421382899936e-06, | |
| "loss": 1.0301, | |
| "num_tokens": 76529427.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.8851674641148325, | |
| "grad_norm": 0.5029097871572791, | |
| "learning_rate": 1.3211659341643412e-06, | |
| "loss": 1.2066, | |
| "num_tokens": 76742589.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8875598086124402, | |
| "grad_norm": 0.584840618113796, | |
| "learning_rate": 1.308341174832359e-06, | |
| "loss": 0.9768, | |
| "num_tokens": 76939827.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.8899521531100478, | |
| "grad_norm": 0.44994308377297715, | |
| "learning_rate": 1.2957686319731623e-06, | |
| "loss": 1.2925, | |
| "num_tokens": 77190390.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.8923444976076556, | |
| "grad_norm": 0.614291349507059, | |
| "learning_rate": 1.2834490620897342e-06, | |
| "loss": 1.0009, | |
| "num_tokens": 77368607.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.8947368421052632, | |
| "grad_norm": 0.5540701345571359, | |
| "learning_rate": 1.2713832064634127e-06, | |
| "loss": 1.281, | |
| "num_tokens": 77595326.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.8971291866028708, | |
| "grad_norm": 0.5930336764639087, | |
| "learning_rate": 1.259571791109285e-06, | |
| "loss": 1.1882, | |
| "num_tokens": 77757257.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8995215311004785, | |
| "grad_norm": 0.5601557384818509, | |
| "learning_rate": 1.2480155267325039e-06, | |
| "loss": 0.9335, | |
| "num_tokens": 77966559.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.9019138755980861, | |
| "grad_norm": 0.5146670174651209, | |
| "learning_rate": 1.2367151086855187e-06, | |
| "loss": 1.1928, | |
| "num_tokens": 78180912.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.9043062200956937, | |
| "grad_norm": 0.585671381043156, | |
| "learning_rate": 1.2256712169262415e-06, | |
| "loss": 1.0569, | |
| "num_tokens": 78336709.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.9066985645933014, | |
| "grad_norm": 0.5144842875674174, | |
| "learning_rate": 1.2148845159771311e-06, | |
| "loss": 1.0092, | |
| "num_tokens": 78603450.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 0.594728768695324, | |
| "learning_rate": 1.2043556548852065e-06, | |
| "loss": 1.0245, | |
| "num_tokens": 78852293.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9114832535885168, | |
| "grad_norm": 0.46010783326706295, | |
| "learning_rate": 1.1940852671829938e-06, | |
| "loss": 1.2352, | |
| "num_tokens": 79112672.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.9138755980861244, | |
| "grad_norm": 0.601262109893317, | |
| "learning_rate": 1.184073970850408e-06, | |
| "loss": 1.1504, | |
| "num_tokens": 79319617.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.916267942583732, | |
| "grad_norm": 0.5038692624203227, | |
| "learning_rate": 1.174322368277565e-06, | |
| "loss": 1.1967, | |
| "num_tokens": 79549771.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.9186602870813397, | |
| "grad_norm": 0.5753103173201497, | |
| "learning_rate": 1.1648310462285386e-06, | |
| "loss": 1.1225, | |
| "num_tokens": 79738016.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.9210526315789473, | |
| "grad_norm": 0.614917920007612, | |
| "learning_rate": 1.1556005758060517e-06, | |
| "loss": 0.9872, | |
| "num_tokens": 79913100.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.9234449760765551, | |
| "grad_norm": 0.5342918968914316, | |
| "learning_rate": 1.146631512417113e-06, | |
| "loss": 1.0676, | |
| "num_tokens": 80103047.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.9258373205741627, | |
| "grad_norm": 0.5439716109099237, | |
| "learning_rate": 1.1379243957395987e-06, | |
| "loss": 1.0585, | |
| "num_tokens": 80292737.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.9282296650717703, | |
| "grad_norm": 0.5342393003750865, | |
| "learning_rate": 1.1294797496897786e-06, | |
| "loss": 1.1836, | |
| "num_tokens": 80512263.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.930622009569378, | |
| "grad_norm": 0.4855841313887977, | |
| "learning_rate": 1.121298082390793e-06, | |
| "loss": 1.0198, | |
| "num_tokens": 80713362.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.9330143540669856, | |
| "grad_norm": 0.5404438942427807, | |
| "learning_rate": 1.113379886142075e-06, | |
| "loss": 0.9669, | |
| "num_tokens": 80921168.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9354066985645934, | |
| "grad_norm": 0.5507820902601309, | |
| "learning_rate": 1.105725637389732e-06, | |
| "loss": 1.0652, | |
| "num_tokens": 81149885.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.937799043062201, | |
| "grad_norm": 0.5015294273795851, | |
| "learning_rate": 1.0983357966978747e-06, | |
| "loss": 1.1452, | |
| "num_tokens": 81384820.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.9401913875598086, | |
| "grad_norm": 0.5530079510762682, | |
| "learning_rate": 1.0912108087209075e-06, | |
| "loss": 1.0865, | |
| "num_tokens": 81577699.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.9425837320574163, | |
| "grad_norm": 0.49796992979545124, | |
| "learning_rate": 1.084351102176769e-06, | |
| "loss": 0.9428, | |
| "num_tokens": 81803396.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.9449760765550239, | |
| "grad_norm": 0.5777758192642776, | |
| "learning_rate": 1.0777570898211406e-06, | |
| "loss": 1.0373, | |
| "num_tokens": 81968827.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 0.5754456579892182, | |
| "learning_rate": 1.0714291684226054e-06, | |
| "loss": 1.0265, | |
| "num_tokens": 82166516.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.9497607655502392, | |
| "grad_norm": 0.558633769969428, | |
| "learning_rate": 1.0653677187387787e-06, | |
| "loss": 1.0473, | |
| "num_tokens": 82338824.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.9521531100478469, | |
| "grad_norm": 0.6176260102445734, | |
| "learning_rate": 1.0595731054933937e-06, | |
| "loss": 1.0043, | |
| "num_tokens": 82531186.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.9545454545454546, | |
| "grad_norm": 0.5126700946523376, | |
| "learning_rate": 1.0540456773543596e-06, | |
| "loss": 1.2646, | |
| "num_tokens": 82735927.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.9569377990430622, | |
| "grad_norm": 0.5671634428425157, | |
| "learning_rate": 1.0487857669127782e-06, | |
| "loss": 1.1623, | |
| "num_tokens": 82904745.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9593301435406698, | |
| "grad_norm": 0.559489922062985, | |
| "learning_rate": 1.0437936906629336e-06, | |
| "loss": 1.0435, | |
| "num_tokens": 83074515.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.9617224880382775, | |
| "grad_norm": 0.5577904608135668, | |
| "learning_rate": 1.039069748983248e-06, | |
| "loss": 0.7559, | |
| "num_tokens": 83243340.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.9641148325358851, | |
| "grad_norm": 0.5215879777836743, | |
| "learning_rate": 1.0346142261182064e-06, | |
| "loss": 1.1583, | |
| "num_tokens": 83474214.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.9665071770334929, | |
| "grad_norm": 0.5509462473469403, | |
| "learning_rate": 1.0304273901612566e-06, | |
| "loss": 1.0304, | |
| "num_tokens": 83644954.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.9688995215311005, | |
| "grad_norm": 0.536818549153514, | |
| "learning_rate": 1.0265094930386741e-06, | |
| "loss": 1.2204, | |
| "num_tokens": 83861919.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.9712918660287081, | |
| "grad_norm": 0.5740452675590582, | |
| "learning_rate": 1.0228607704944048e-06, | |
| "loss": 0.9858, | |
| "num_tokens": 84024816.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.9736842105263158, | |
| "grad_norm": 0.5261150137396471, | |
| "learning_rate": 1.0194814420758806e-06, | |
| "loss": 1.1349, | |
| "num_tokens": 84239403.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.9760765550239234, | |
| "grad_norm": 0.6448679502450355, | |
| "learning_rate": 1.0163717111208086e-06, | |
| "loss": 0.9748, | |
| "num_tokens": 84432507.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.9784688995215312, | |
| "grad_norm": 0.5218518317378777, | |
| "learning_rate": 1.0135317647449362e-06, | |
| "loss": 0.9739, | |
| "num_tokens": 84644408.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.9808612440191388, | |
| "grad_norm": 0.5596368200732923, | |
| "learning_rate": 1.0109617738307914e-06, | |
| "loss": 1.0414, | |
| "num_tokens": 84854304.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9832535885167464, | |
| "grad_norm": 0.5348740586634487, | |
| "learning_rate": 1.0086618930174011e-06, | |
| "loss": 1.1507, | |
| "num_tokens": 85056365.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.9856459330143541, | |
| "grad_norm": 0.554299617798691, | |
| "learning_rate": 1.006632260690988e-06, | |
| "loss": 1.0713, | |
| "num_tokens": 85211462.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.9880382775119617, | |
| "grad_norm": 0.5659307655892759, | |
| "learning_rate": 1.0048729989766396e-06, | |
| "loss": 0.9576, | |
| "num_tokens": 85413979.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.9904306220095693, | |
| "grad_norm": 0.6180230319552571, | |
| "learning_rate": 1.0033842137309649e-06, | |
| "loss": 0.9867, | |
| "num_tokens": 85564746.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.992822966507177, | |
| "grad_norm": 0.49348429130589355, | |
| "learning_rate": 1.0021659945357202e-06, | |
| "loss": 1.2502, | |
| "num_tokens": 85821465.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9952153110047847, | |
| "grad_norm": 0.5397948420594149, | |
| "learning_rate": 1.0012184146924225e-06, | |
| "loss": 1.1626, | |
| "num_tokens": 86064119.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.9976076555023924, | |
| "grad_norm": 0.7144358111953418, | |
| "learning_rate": 1.0005415312179367e-06, | |
| "loss": 0.8718, | |
| "num_tokens": 86205361.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.546093577829937, | |
| "learning_rate": 1.0001353848410461e-06, | |
| "loss": 1.0204, | |
| "num_tokens": 86399088.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.6695132851600647, | |
| "eval_num_tokens": 86399088.0, | |
| "eval_runtime": 101.4457, | |
| "eval_samples_per_second": 29.296, | |
| "eval_steps_per_second": 3.667, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 418, | |
| "total_flos": 290901703622656.0, | |
| "train_loss": 1.1731020922295785, | |
| "train_runtime": 3083.067, | |
| "train_samples_per_second": 8.674, | |
| "train_steps_per_second": 0.136 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 418, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 290901703622656.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |