{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023923444976076554, "grad_norm": 6.9691807760923075, "learning_rate": 0.0, "loss": 1.7686, "num_tokens": 224939.0, "step": 1 }, { "epoch": 0.004784688995215311, "grad_norm": 5.696067520260917, "learning_rate": 1.9607843137254904e-07, "loss": 1.7074, "num_tokens": 466553.0, "step": 2 }, { "epoch": 0.007177033492822967, "grad_norm": 7.218883969227019, "learning_rate": 3.921568627450981e-07, "loss": 1.7878, "num_tokens": 649823.0, "step": 3 }, { "epoch": 0.009569377990430622, "grad_norm": 6.599319930344236, "learning_rate": 5.882352941176471e-07, "loss": 1.776, "num_tokens": 862983.0, "step": 4 }, { "epoch": 0.011961722488038277, "grad_norm": 6.600002893691311, "learning_rate": 7.843137254901962e-07, "loss": 1.8101, "num_tokens": 1090598.0, "step": 5 }, { "epoch": 0.014354066985645933, "grad_norm": 6.1947461987357215, "learning_rate": 9.80392156862745e-07, "loss": 1.7604, "num_tokens": 1309816.0, "step": 6 }, { "epoch": 0.01674641148325359, "grad_norm": 7.322579343479808, "learning_rate": 1.1764705882352942e-06, "loss": 1.7636, "num_tokens": 1501898.0, "step": 7 }, { "epoch": 0.019138755980861243, "grad_norm": 5.565852999878923, "learning_rate": 1.3725490196078434e-06, "loss": 1.747, "num_tokens": 1690771.0, "step": 8 }, { "epoch": 0.0215311004784689, "grad_norm": 6.293532003483547, "learning_rate": 1.5686274509803923e-06, "loss": 1.745, "num_tokens": 1900964.0, "step": 9 }, { "epoch": 0.023923444976076555, "grad_norm": 6.163171111681885, "learning_rate": 1.7647058823529414e-06, "loss": 1.744, "num_tokens": 2072545.0, "step": 10 }, { "epoch": 0.02631578947368421, "grad_norm": 4.642248066242216, "learning_rate": 1.96078431372549e-06, "loss": 1.6107, "num_tokens": 2230117.0, "step": 11 }, { "epoch": 0.028708133971291867, "grad_norm": 2.938871677963385, "learning_rate": 2.1568627450980393e-06, "loss": 1.6337, "num_tokens": 2418536.0, "step": 12 }, { "epoch": 0.03110047846889952, "grad_norm": 3.352461014645353, "learning_rate": 2.3529411764705885e-06, "loss": 1.6248, "num_tokens": 2610436.0, "step": 13 }, { "epoch": 0.03349282296650718, "grad_norm": 2.4423792859075024, "learning_rate": 2.549019607843137e-06, "loss": 1.5582, "num_tokens": 2840410.0, "step": 14 }, { "epoch": 0.03588516746411483, "grad_norm": 3.6326614999386093, "learning_rate": 2.7450980392156867e-06, "loss": 1.4153, "num_tokens": 3039241.0, "step": 15 }, { "epoch": 0.03827751196172249, "grad_norm": 2.309721427859542, "learning_rate": 2.9411764705882355e-06, "loss": 1.4088, "num_tokens": 3284539.0, "step": 16 }, { "epoch": 0.04066985645933014, "grad_norm": 2.8925752012878223, "learning_rate": 3.1372549019607846e-06, "loss": 1.5011, "num_tokens": 3485634.0, "step": 17 }, { "epoch": 0.0430622009569378, "grad_norm": 2.4261545230518626, "learning_rate": 3.3333333333333333e-06, "loss": 1.4527, "num_tokens": 3672944.0, "step": 18 }, { "epoch": 0.045454545454545456, "grad_norm": 4.355994016148595, "learning_rate": 3.529411764705883e-06, "loss": 1.2766, "num_tokens": 3843273.0, "step": 19 }, { "epoch": 0.04784688995215311, "grad_norm": 4.628861544958714, "learning_rate": 3.7254901960784316e-06, "loss": 1.2748, "num_tokens": 4009616.0, "step": 20 }, { "epoch": 0.050239234449760764, "grad_norm": 2.2063220407104116, "learning_rate": 3.92156862745098e-06, "loss": 1.284, "num_tokens": 4196040.0, "step": 21 }, { "epoch": 0.05263157894736842, "grad_norm": 2.083766163581579, "learning_rate": 4.11764705882353e-06, "loss": 1.3234, "num_tokens": 4426192.0, "step": 22 }, { "epoch": 0.05502392344497608, "grad_norm": 1.4244871277633704, "learning_rate": 4.313725490196079e-06, "loss": 1.2551, "num_tokens": 4692793.0, "step": 23 }, { "epoch": 0.05741626794258373, "grad_norm": 1.4458640545451162, "learning_rate": 4.509803921568628e-06, "loss": 1.2925, "num_tokens": 4922578.0, "step": 24 }, { "epoch": 0.05980861244019139, "grad_norm": 1.2401785717364968, "learning_rate": 4.705882352941177e-06, "loss": 1.2988, "num_tokens": 5103273.0, "step": 25 }, { "epoch": 0.06220095693779904, "grad_norm": 1.2308788644379343, "learning_rate": 4.901960784313726e-06, "loss": 1.1625, "num_tokens": 5321035.0, "step": 26 }, { "epoch": 0.0645933014354067, "grad_norm": 1.070148340321906, "learning_rate": 5.098039215686274e-06, "loss": 1.1461, "num_tokens": 5559536.0, "step": 27 }, { "epoch": 0.06698564593301436, "grad_norm": 1.03341304889324, "learning_rate": 5.294117647058824e-06, "loss": 1.1232, "num_tokens": 5746590.0, "step": 28 }, { "epoch": 0.06937799043062201, "grad_norm": 1.0269837046140493, "learning_rate": 5.4901960784313735e-06, "loss": 1.1695, "num_tokens": 5978411.0, "step": 29 }, { "epoch": 0.07177033492822966, "grad_norm": 1.188053126912019, "learning_rate": 5.686274509803922e-06, "loss": 1.1733, "num_tokens": 6204023.0, "step": 30 }, { "epoch": 0.07416267942583732, "grad_norm": 1.3792751984185245, "learning_rate": 5.882352941176471e-06, "loss": 1.1271, "num_tokens": 6418366.0, "step": 31 }, { "epoch": 0.07655502392344497, "grad_norm": 1.2984724277479058, "learning_rate": 6.07843137254902e-06, "loss": 1.0451, "num_tokens": 6613243.0, "step": 32 }, { "epoch": 0.07894736842105263, "grad_norm": 0.9146841559197153, "learning_rate": 6.274509803921569e-06, "loss": 1.2392, "num_tokens": 6833041.0, "step": 33 }, { "epoch": 0.08133971291866028, "grad_norm": 1.0033903976372076, "learning_rate": 6.470588235294119e-06, "loss": 1.079, "num_tokens": 7030942.0, "step": 34 }, { "epoch": 0.08373205741626795, "grad_norm": 1.1388077906522334, "learning_rate": 6.666666666666667e-06, "loss": 1.0549, "num_tokens": 7237519.0, "step": 35 }, { "epoch": 0.0861244019138756, "grad_norm": 0.9842459198671744, "learning_rate": 6.862745098039216e-06, "loss": 1.0202, "num_tokens": 7435884.0, "step": 36 }, { "epoch": 0.08851674641148326, "grad_norm": 284.7649201492139, "learning_rate": 7.058823529411766e-06, "loss": 1.0647, "num_tokens": 7635319.0, "step": 37 }, { "epoch": 0.09090909090909091, "grad_norm": 0.9557715780751038, "learning_rate": 7.2549019607843145e-06, "loss": 1.1426, "num_tokens": 7862270.0, "step": 38 }, { "epoch": 0.09330143540669857, "grad_norm": 0.915320008451191, "learning_rate": 7.450980392156863e-06, "loss": 1.1608, "num_tokens": 8052102.0, "step": 39 }, { "epoch": 0.09569377990430622, "grad_norm": 0.8989126142994759, "learning_rate": 7.647058823529411e-06, "loss": 1.0364, "num_tokens": 8239372.0, "step": 40 }, { "epoch": 0.09808612440191387, "grad_norm": 0.8329537974290705, "learning_rate": 7.84313725490196e-06, "loss": 1.1433, "num_tokens": 8465829.0, "step": 41 }, { "epoch": 0.10047846889952153, "grad_norm": 0.8398997141542524, "learning_rate": 8.03921568627451e-06, "loss": 1.156, "num_tokens": 8642972.0, "step": 42 }, { "epoch": 0.10287081339712918, "grad_norm": 0.7585422097649339, "learning_rate": 8.23529411764706e-06, "loss": 1.0503, "num_tokens": 8847697.0, "step": 43 }, { "epoch": 0.10526315789473684, "grad_norm": 0.7622670839009824, "learning_rate": 8.43137254901961e-06, "loss": 0.9813, "num_tokens": 9066935.0, "step": 44 }, { "epoch": 0.1076555023923445, "grad_norm": 0.8193765838996492, "learning_rate": 8.627450980392157e-06, "loss": 1.0981, "num_tokens": 9284399.0, "step": 45 }, { "epoch": 0.11004784688995216, "grad_norm": 0.7925794880440407, "learning_rate": 8.823529411764707e-06, "loss": 1.0236, "num_tokens": 9508366.0, "step": 46 }, { "epoch": 0.11244019138755981, "grad_norm": 1.2045774927138393, "learning_rate": 9.019607843137256e-06, "loss": 1.0526, "num_tokens": 9694057.0, "step": 47 }, { "epoch": 0.11483253588516747, "grad_norm": 0.9084407634375898, "learning_rate": 9.215686274509804e-06, "loss": 1.0955, "num_tokens": 9929798.0, "step": 48 }, { "epoch": 0.11722488038277512, "grad_norm": 0.8240735702666425, "learning_rate": 9.411764705882354e-06, "loss": 0.9807, "num_tokens": 10180029.0, "step": 49 }, { "epoch": 0.11961722488038277, "grad_norm": 0.8097524297757899, "learning_rate": 9.607843137254903e-06, "loss": 1.1265, "num_tokens": 10419750.0, "step": 50 }, { "epoch": 0.12200956937799043, "grad_norm": 0.8353633432968253, "learning_rate": 9.803921568627451e-06, "loss": 1.1419, "num_tokens": 10605541.0, "step": 51 }, { "epoch": 0.12440191387559808, "grad_norm": 0.817154674246075, "learning_rate": 1e-05, "loss": 1.0961, "num_tokens": 10814676.0, "step": 52 }, { "epoch": 0.12679425837320574, "grad_norm": 0.8612859746077591, "learning_rate": 9.999991548844401e-06, "loss": 1.1244, "num_tokens": 11035241.0, "step": 53 }, { "epoch": 0.1291866028708134, "grad_norm": 0.8154431514013905, "learning_rate": 9.999966195409343e-06, "loss": 0.9722, "num_tokens": 11195242.0, "step": 54 }, { "epoch": 0.13157894736842105, "grad_norm": 0.7860112239586271, "learning_rate": 9.99992393979006e-06, "loss": 1.0957, "num_tokens": 11401885.0, "step": 55 }, { "epoch": 0.1339712918660287, "grad_norm": 0.7767268725169872, "learning_rate": 9.999864782145264e-06, "loss": 0.901, "num_tokens": 11587966.0, "step": 56 }, { "epoch": 0.13636363636363635, "grad_norm": 0.9737729561858515, "learning_rate": 9.999788722697156e-06, "loss": 1.2394, "num_tokens": 11765601.0, "step": 57 }, { "epoch": 0.13875598086124402, "grad_norm": 0.8847294438412543, "learning_rate": 9.999695761731418e-06, "loss": 0.9228, "num_tokens": 11942390.0, "step": 58 }, { "epoch": 0.14114832535885166, "grad_norm": 0.7337378919952546, "learning_rate": 9.999585899597224e-06, "loss": 1.0978, "num_tokens": 12165372.0, "step": 59 }, { "epoch": 0.14354066985645933, "grad_norm": 0.7638475691265573, "learning_rate": 9.999459136707217e-06, "loss": 0.9959, "num_tokens": 12414971.0, "step": 60 }, { "epoch": 0.145933014354067, "grad_norm": 0.7949985672748343, "learning_rate": 9.999315473537532e-06, "loss": 1.0083, "num_tokens": 12595239.0, "step": 61 }, { "epoch": 0.14832535885167464, "grad_norm": 0.8626640964385176, "learning_rate": 9.999154910627776e-06, "loss": 1.1222, "num_tokens": 12789753.0, "step": 62 }, { "epoch": 0.1507177033492823, "grad_norm": 0.7389387597807686, "learning_rate": 9.998977448581031e-06, "loss": 0.9927, "num_tokens": 13029643.0, "step": 63 }, { "epoch": 0.15311004784688995, "grad_norm": 0.9337215647236915, "learning_rate": 9.998783088063864e-06, "loss": 0.6957, "num_tokens": 13155991.0, "step": 64 }, { "epoch": 0.15550239234449761, "grad_norm": 0.8247995409977257, "learning_rate": 9.998571829806301e-06, "loss": 1.0184, "num_tokens": 13310880.0, "step": 65 }, { "epoch": 0.15789473684210525, "grad_norm": 0.8340681998808398, "learning_rate": 9.998343674601845e-06, "loss": 1.0168, "num_tokens": 13489712.0, "step": 66 }, { "epoch": 0.16028708133971292, "grad_norm": 0.8370767723991338, "learning_rate": 9.998098623307461e-06, "loss": 0.9239, "num_tokens": 13684647.0, "step": 67 }, { "epoch": 0.16267942583732056, "grad_norm": 0.7288813191624135, "learning_rate": 9.99783667684358e-06, "loss": 1.1175, "num_tokens": 13907099.0, "step": 68 }, { "epoch": 0.16507177033492823, "grad_norm": 0.8504439815246466, "learning_rate": 9.997557836194091e-06, "loss": 0.9208, "num_tokens": 14130272.0, "step": 69 }, { "epoch": 0.1674641148325359, "grad_norm": 0.8451053143388385, "learning_rate": 9.997262102406337e-06, "loss": 1.0591, "num_tokens": 14346591.0, "step": 70 }, { "epoch": 0.16985645933014354, "grad_norm": 0.7361210479126754, "learning_rate": 9.996949476591117e-06, "loss": 0.8635, "num_tokens": 14560238.0, "step": 71 }, { "epoch": 0.1722488038277512, "grad_norm": 0.8577939566262116, "learning_rate": 9.996619959922675e-06, "loss": 0.8466, "num_tokens": 14755604.0, "step": 72 }, { "epoch": 0.17464114832535885, "grad_norm": 0.8381521666289012, "learning_rate": 9.996273553638695e-06, "loss": 0.7695, "num_tokens": 14933946.0, "step": 73 }, { "epoch": 0.17703349282296652, "grad_norm": 0.7754686982261173, "learning_rate": 9.995910259040306e-06, "loss": 0.9558, "num_tokens": 15113851.0, "step": 74 }, { "epoch": 0.17942583732057416, "grad_norm": 0.7725675853421875, "learning_rate": 9.99553007749207e-06, "loss": 0.9318, "num_tokens": 15317677.0, "step": 75 }, { "epoch": 0.18181818181818182, "grad_norm": 0.7639475931469095, "learning_rate": 9.995133010421968e-06, "loss": 1.0368, "num_tokens": 15509536.0, "step": 76 }, { "epoch": 0.18421052631578946, "grad_norm": 0.8319629992606972, "learning_rate": 9.99471905932142e-06, "loss": 0.9003, "num_tokens": 15715725.0, "step": 77 }, { "epoch": 0.18660287081339713, "grad_norm": 0.7882397528734394, "learning_rate": 9.994288225745249e-06, "loss": 0.9092, "num_tokens": 15952916.0, "step": 78 }, { "epoch": 0.18899521531100477, "grad_norm": 0.770940813293932, "learning_rate": 9.993840511311695e-06, "loss": 0.9546, "num_tokens": 16118037.0, "step": 79 }, { "epoch": 0.19138755980861244, "grad_norm": 0.7714491446159815, "learning_rate": 9.993375917702409e-06, "loss": 0.9709, "num_tokens": 16304381.0, "step": 80 }, { "epoch": 0.1937799043062201, "grad_norm": 0.7786320649310523, "learning_rate": 9.992894446662434e-06, "loss": 1.0758, "num_tokens": 16485104.0, "step": 81 }, { "epoch": 0.19617224880382775, "grad_norm": 0.7798209889697421, "learning_rate": 9.992396100000208e-06, "loss": 0.9808, "num_tokens": 16658375.0, "step": 82 }, { "epoch": 0.19856459330143542, "grad_norm": 0.7546944385505975, "learning_rate": 9.99188087958756e-06, "loss": 1.0092, "num_tokens": 16839821.0, "step": 83 }, { "epoch": 0.20095693779904306, "grad_norm": 0.774709240553806, "learning_rate": 9.991348787359686e-06, "loss": 0.9415, "num_tokens": 17010092.0, "step": 84 }, { "epoch": 0.20334928229665072, "grad_norm": 0.860230274595423, "learning_rate": 9.990799825315166e-06, "loss": 0.8845, "num_tokens": 17171471.0, "step": 85 }, { "epoch": 0.20574162679425836, "grad_norm": 0.821110135970871, "learning_rate": 9.990233995515939e-06, "loss": 0.7484, "num_tokens": 17349168.0, "step": 86 }, { "epoch": 0.20813397129186603, "grad_norm": 0.7685272559332614, "learning_rate": 9.989651300087301e-06, "loss": 1.0519, "num_tokens": 17560749.0, "step": 87 }, { "epoch": 0.21052631578947367, "grad_norm": 0.8151856217138742, "learning_rate": 9.989051741217895e-06, "loss": 1.1785, "num_tokens": 17765678.0, "step": 88 }, { "epoch": 0.21291866028708134, "grad_norm": 0.6934729593230965, "learning_rate": 9.988435321159706e-06, "loss": 0.8942, "num_tokens": 18017748.0, "step": 89 }, { "epoch": 0.215311004784689, "grad_norm": 0.8466118922234218, "learning_rate": 9.987802042228052e-06, "loss": 0.9131, "num_tokens": 18166766.0, "step": 90 }, { "epoch": 0.21770334928229665, "grad_norm": 0.7725523726706883, "learning_rate": 9.98715190680157e-06, "loss": 1.2444, "num_tokens": 18358833.0, "step": 91 }, { "epoch": 0.22009569377990432, "grad_norm": 0.7889212005298246, "learning_rate": 9.986484917322216e-06, "loss": 1.0743, "num_tokens": 18578155.0, "step": 92 }, { "epoch": 0.22248803827751196, "grad_norm": 0.7757930472253841, "learning_rate": 9.985801076295245e-06, "loss": 1.0136, "num_tokens": 18807884.0, "step": 93 }, { "epoch": 0.22488038277511962, "grad_norm": 0.7619186356991171, "learning_rate": 9.985100386289217e-06, "loss": 0.9538, "num_tokens": 18987783.0, "step": 94 }, { "epoch": 0.22727272727272727, "grad_norm": 0.7560512807570609, "learning_rate": 9.984382849935966e-06, "loss": 1.0655, "num_tokens": 19191375.0, "step": 95 }, { "epoch": 0.22966507177033493, "grad_norm": 0.8273044076025551, "learning_rate": 9.983648469930613e-06, "loss": 0.9171, "num_tokens": 19385570.0, "step": 96 }, { "epoch": 0.23205741626794257, "grad_norm": 0.8441992541867371, "learning_rate": 9.982897249031536e-06, "loss": 0.8167, "num_tokens": 19567794.0, "step": 97 }, { "epoch": 0.23444976076555024, "grad_norm": 0.7796712666919974, "learning_rate": 9.982129190060376e-06, "loss": 0.8066, "num_tokens": 19762696.0, "step": 98 }, { "epoch": 0.23684210526315788, "grad_norm": 0.6861892375802735, "learning_rate": 9.981344295902013e-06, "loss": 0.9534, "num_tokens": 20017766.0, "step": 99 }, { "epoch": 0.23923444976076555, "grad_norm": 0.7613445310920244, "learning_rate": 9.980542569504566e-06, "loss": 1.0992, "num_tokens": 20233456.0, "step": 100 }, { "epoch": 0.24162679425837322, "grad_norm": 0.7178060201919961, "learning_rate": 9.979724013879373e-06, "loss": 0.9877, "num_tokens": 20465617.0, "step": 101 }, { "epoch": 0.24401913875598086, "grad_norm": 0.7365437362689727, "learning_rate": 9.978888632100986e-06, "loss": 1.0293, "num_tokens": 20665957.0, "step": 102 }, { "epoch": 0.24641148325358853, "grad_norm": 0.676827848041371, "learning_rate": 9.978036427307158e-06, "loss": 1.1481, "num_tokens": 20888246.0, "step": 103 }, { "epoch": 0.24880382775119617, "grad_norm": 0.7580628326295507, "learning_rate": 9.977167402698828e-06, "loss": 0.987, "num_tokens": 21099480.0, "step": 104 }, { "epoch": 0.2511961722488038, "grad_norm": 0.7221725341556634, "learning_rate": 9.976281561540112e-06, "loss": 1.0031, "num_tokens": 21299349.0, "step": 105 }, { "epoch": 0.2535885167464115, "grad_norm": 0.7374394457768372, "learning_rate": 9.975378907158291e-06, "loss": 0.924, "num_tokens": 21516131.0, "step": 106 }, { "epoch": 0.25598086124401914, "grad_norm": 0.8372803563503946, "learning_rate": 9.9744594429438e-06, "loss": 1.1085, "num_tokens": 21657586.0, "step": 107 }, { "epoch": 0.2583732057416268, "grad_norm": 0.8474797225142054, "learning_rate": 9.973523172350205e-06, "loss": 0.7941, "num_tokens": 21808026.0, "step": 108 }, { "epoch": 0.2607655502392344, "grad_norm": 0.716388521294015, "learning_rate": 9.972570098894208e-06, "loss": 0.8706, "num_tokens": 22050205.0, "step": 109 }, { "epoch": 0.2631578947368421, "grad_norm": 0.7391113763396667, "learning_rate": 9.971600226155614e-06, "loss": 0.84, "num_tokens": 22268351.0, "step": 110 }, { "epoch": 0.26555023923444976, "grad_norm": 0.7735135057092313, "learning_rate": 9.970613557777336e-06, "loss": 0.8856, "num_tokens": 22464789.0, "step": 111 }, { "epoch": 0.2679425837320574, "grad_norm": 0.8223949250475577, "learning_rate": 9.96961009746537e-06, "loss": 0.719, "num_tokens": 22620151.0, "step": 112 }, { "epoch": 0.2703349282296651, "grad_norm": 0.7173732870433329, "learning_rate": 9.968589848988778e-06, "loss": 0.9263, "num_tokens": 22852967.0, "step": 113 }, { "epoch": 0.2727272727272727, "grad_norm": 0.658820584842679, "learning_rate": 9.967552816179684e-06, "loss": 0.8595, "num_tokens": 23082712.0, "step": 114 }, { "epoch": 0.2751196172248804, "grad_norm": 0.8291768755324592, "learning_rate": 9.966499002933257e-06, "loss": 0.7979, "num_tokens": 23236129.0, "step": 115 }, { "epoch": 0.27751196172248804, "grad_norm": 0.6891612480479086, "learning_rate": 9.965428413207696e-06, "loss": 1.116, "num_tokens": 23482677.0, "step": 116 }, { "epoch": 0.2799043062200957, "grad_norm": 0.7316914455895902, "learning_rate": 9.964341051024202e-06, "loss": 0.7551, "num_tokens": 23687754.0, "step": 117 }, { "epoch": 0.2822966507177033, "grad_norm": 0.6565088792008686, "learning_rate": 9.963236920466987e-06, "loss": 0.8169, "num_tokens": 23937964.0, "step": 118 }, { "epoch": 0.284688995215311, "grad_norm": 0.7421437162178479, "learning_rate": 9.962116025683241e-06, "loss": 0.8436, "num_tokens": 24127868.0, "step": 119 }, { "epoch": 0.28708133971291866, "grad_norm": 0.7006473705081104, "learning_rate": 9.960978370883123e-06, "loss": 0.9349, "num_tokens": 24354281.0, "step": 120 }, { "epoch": 0.2894736842105263, "grad_norm": 0.708154430058184, "learning_rate": 9.959823960339744e-06, "loss": 0.9994, "num_tokens": 24591341.0, "step": 121 }, { "epoch": 0.291866028708134, "grad_norm": 0.7523976805120025, "learning_rate": 9.958652798389148e-06, "loss": 0.9759, "num_tokens": 24808421.0, "step": 122 }, { "epoch": 0.2942583732057416, "grad_norm": 0.695503522009102, "learning_rate": 9.957464889430301e-06, "loss": 1.0527, "num_tokens": 25047341.0, "step": 123 }, { "epoch": 0.2966507177033493, "grad_norm": 0.7321236634606988, "learning_rate": 9.956260237925069e-06, "loss": 1.0856, "num_tokens": 25262803.0, "step": 124 }, { "epoch": 0.29904306220095694, "grad_norm": 0.6124241419692257, "learning_rate": 9.955038848398211e-06, "loss": 0.8271, "num_tokens": 25485650.0, "step": 125 }, { "epoch": 0.3014354066985646, "grad_norm": 0.8281763110793705, "learning_rate": 9.953800725437347e-06, "loss": 0.73, "num_tokens": 25664886.0, "step": 126 }, { "epoch": 0.3038277511961722, "grad_norm": 0.8118511520964532, "learning_rate": 9.952545873692953e-06, "loss": 0.9294, "num_tokens": 25847077.0, "step": 127 }, { "epoch": 0.3062200956937799, "grad_norm": 0.6694662698941429, "learning_rate": 9.951274297878341e-06, "loss": 0.9563, "num_tokens": 26088392.0, "step": 128 }, { "epoch": 0.30861244019138756, "grad_norm": 0.6923316125065374, "learning_rate": 9.949986002769633e-06, "loss": 0.9151, "num_tokens": 26307138.0, "step": 129 }, { "epoch": 0.31100478468899523, "grad_norm": 0.6309626818691494, "learning_rate": 9.94868099320576e-06, "loss": 0.7817, "num_tokens": 26592132.0, "step": 130 }, { "epoch": 0.3133971291866029, "grad_norm": 0.7429883494359573, "learning_rate": 9.947359274088427e-06, "loss": 1.0797, "num_tokens": 26763373.0, "step": 131 }, { "epoch": 0.3157894736842105, "grad_norm": 0.6943111995834695, "learning_rate": 9.946020850382098e-06, "loss": 0.951, "num_tokens": 26994962.0, "step": 132 }, { "epoch": 0.3181818181818182, "grad_norm": 0.6817616892645713, "learning_rate": 9.94466572711399e-06, "loss": 0.8817, "num_tokens": 27205603.0, "step": 133 }, { "epoch": 0.32057416267942584, "grad_norm": 0.6955201958805303, "learning_rate": 9.943293909374036e-06, "loss": 1.0069, "num_tokens": 27412792.0, "step": 134 }, { "epoch": 0.3229665071770335, "grad_norm": 0.694019942449106, "learning_rate": 9.94190540231488e-06, "loss": 0.7741, "num_tokens": 27625644.0, "step": 135 }, { "epoch": 0.3253588516746411, "grad_norm": 0.7987750635682109, "learning_rate": 9.940500211151853e-06, "loss": 0.9964, "num_tokens": 27809952.0, "step": 136 }, { "epoch": 0.3277511961722488, "grad_norm": 0.788259855652086, "learning_rate": 9.939078341162947e-06, "loss": 0.9231, "num_tokens": 27971074.0, "step": 137 }, { "epoch": 0.33014354066985646, "grad_norm": 0.6494876305365771, "learning_rate": 9.937639797688807e-06, "loss": 0.9048, "num_tokens": 28168165.0, "step": 138 }, { "epoch": 0.33253588516746413, "grad_norm": 0.7100895039199577, "learning_rate": 9.936184586132696e-06, "loss": 0.9842, "num_tokens": 28361420.0, "step": 139 }, { "epoch": 0.3349282296650718, "grad_norm": 0.6206399951244527, "learning_rate": 9.934712711960496e-06, "loss": 1.0349, "num_tokens": 28679853.0, "step": 140 }, { "epoch": 0.3373205741626794, "grad_norm": 0.7407205278350535, "learning_rate": 9.933224180700661e-06, "loss": 0.8706, "num_tokens": 28855036.0, "step": 141 }, { "epoch": 0.3397129186602871, "grad_norm": 0.6760210344304235, "learning_rate": 9.931718997944226e-06, "loss": 0.7435, "num_tokens": 29055967.0, "step": 142 }, { "epoch": 0.34210526315789475, "grad_norm": 0.730294279383326, "learning_rate": 9.930197169344753e-06, "loss": 0.8819, "num_tokens": 29265413.0, "step": 143 }, { "epoch": 0.3444976076555024, "grad_norm": 0.7982518259307247, "learning_rate": 9.928658700618343e-06, "loss": 1.0037, "num_tokens": 29459699.0, "step": 144 }, { "epoch": 0.34688995215311, "grad_norm": 0.6642614534122017, "learning_rate": 9.927103597543584e-06, "loss": 0.6234, "num_tokens": 29670531.0, "step": 145 }, { "epoch": 0.3492822966507177, "grad_norm": 0.7726991449535662, "learning_rate": 9.925531865961557e-06, "loss": 0.802, "num_tokens": 29841104.0, "step": 146 }, { "epoch": 0.35167464114832536, "grad_norm": 0.7190746964648677, "learning_rate": 9.923943511775788e-06, "loss": 0.8837, "num_tokens": 30083772.0, "step": 147 }, { "epoch": 0.35406698564593303, "grad_norm": 0.6773961708968047, "learning_rate": 9.922338540952251e-06, "loss": 0.9064, "num_tokens": 30318998.0, "step": 148 }, { "epoch": 0.35645933014354064, "grad_norm": 0.7165474452525965, "learning_rate": 9.920716959519326e-06, "loss": 0.8625, "num_tokens": 30511589.0, "step": 149 }, { "epoch": 0.3588516746411483, "grad_norm": 0.6962013623862424, "learning_rate": 9.919078773567784e-06, "loss": 1.0303, "num_tokens": 30704626.0, "step": 150 }, { "epoch": 0.361244019138756, "grad_norm": 0.701112474981306, "learning_rate": 9.917423989250762e-06, "loss": 1.1225, "num_tokens": 30941731.0, "step": 151 }, { "epoch": 0.36363636363636365, "grad_norm": 0.574854019614758, "learning_rate": 9.91575261278375e-06, "loss": 1.0329, "num_tokens": 31203750.0, "step": 152 }, { "epoch": 0.3660287081339713, "grad_norm": 0.7243129631207466, "learning_rate": 9.914064650444549e-06, "loss": 0.9106, "num_tokens": 31415131.0, "step": 153 }, { "epoch": 0.3684210526315789, "grad_norm": 0.7656940105326371, "learning_rate": 9.912360108573265e-06, "loss": 1.0737, "num_tokens": 31621021.0, "step": 154 }, { "epoch": 0.3708133971291866, "grad_norm": 0.7642923662971779, "learning_rate": 9.910638993572275e-06, "loss": 1.1397, "num_tokens": 31825774.0, "step": 155 }, { "epoch": 0.37320574162679426, "grad_norm": 0.8550286241707264, "learning_rate": 9.908901311906202e-06, "loss": 0.993, "num_tokens": 31998156.0, "step": 156 }, { "epoch": 0.37559808612440193, "grad_norm": 0.7186526140526422, "learning_rate": 9.907147070101906e-06, "loss": 1.0599, "num_tokens": 32219085.0, "step": 157 }, { "epoch": 0.37799043062200954, "grad_norm": 0.7167027687359431, "learning_rate": 9.905376274748433e-06, "loss": 0.8874, "num_tokens": 32451875.0, "step": 158 }, { "epoch": 0.3803827751196172, "grad_norm": 0.652018878770897, "learning_rate": 9.90358893249702e-06, "loss": 0.8651, "num_tokens": 32728142.0, "step": 159 }, { "epoch": 0.3827751196172249, "grad_norm": 0.7805318753632378, "learning_rate": 9.901785050061041e-06, "loss": 1.03, "num_tokens": 32954822.0, "step": 160 }, { "epoch": 0.38516746411483255, "grad_norm": 0.7527518503203819, "learning_rate": 9.899964634216006e-06, "loss": 1.0222, "num_tokens": 33154057.0, "step": 161 }, { "epoch": 0.3875598086124402, "grad_norm": 0.7433980271420518, "learning_rate": 9.898127691799521e-06, "loss": 0.8789, "num_tokens": 33317780.0, "step": 162 }, { "epoch": 0.38995215311004783, "grad_norm": 0.7680691692838955, "learning_rate": 9.896274229711276e-06, "loss": 1.0284, "num_tokens": 33552472.0, "step": 163 }, { "epoch": 0.3923444976076555, "grad_norm": 0.7722033859950767, "learning_rate": 9.894404254912994e-06, "loss": 1.037, "num_tokens": 33756438.0, "step": 164 }, { "epoch": 0.39473684210526316, "grad_norm": 0.737004315990203, "learning_rate": 9.892517774428434e-06, "loss": 0.9836, "num_tokens": 33979635.0, "step": 165 }, { "epoch": 0.39712918660287083, "grad_norm": 0.6691182389869809, "learning_rate": 9.89061479534335e-06, "loss": 1.0604, "num_tokens": 34273884.0, "step": 166 }, { "epoch": 0.39952153110047844, "grad_norm": 0.6790421569992825, "learning_rate": 9.888695324805453e-06, "loss": 0.9092, "num_tokens": 34464735.0, "step": 167 }, { "epoch": 0.4019138755980861, "grad_norm": 0.8288654161356374, "learning_rate": 9.88675937002442e-06, "loss": 1.1103, "num_tokens": 34686097.0, "step": 168 }, { "epoch": 0.4043062200956938, "grad_norm": 0.7965114072662832, "learning_rate": 9.88480693827182e-06, "loss": 0.8898, "num_tokens": 34854045.0, "step": 169 }, { "epoch": 0.40669856459330145, "grad_norm": 0.7434096681118705, "learning_rate": 9.882838036881128e-06, "loss": 0.8313, "num_tokens": 35069557.0, "step": 170 }, { "epoch": 0.4090909090909091, "grad_norm": 0.7239308188160857, "learning_rate": 9.880852673247671e-06, "loss": 0.9248, "num_tokens": 35263232.0, "step": 171 }, { "epoch": 0.41148325358851673, "grad_norm": 0.6944813811953502, "learning_rate": 9.878850854828614e-06, "loss": 1.0155, "num_tokens": 35508355.0, "step": 172 }, { "epoch": 0.4138755980861244, "grad_norm": 0.7508945276214297, "learning_rate": 9.876832589142925e-06, "loss": 0.8899, "num_tokens": 35718913.0, "step": 173 }, { "epoch": 0.41626794258373206, "grad_norm": 0.8067676235321118, "learning_rate": 9.87479788377135e-06, "loss": 0.8972, "num_tokens": 35917104.0, "step": 174 }, { "epoch": 0.41866028708133973, "grad_norm": 0.7477163711045044, "learning_rate": 9.87274674635638e-06, "loss": 0.8003, "num_tokens": 36084425.0, "step": 175 }, { "epoch": 0.42105263157894735, "grad_norm": 0.7262231697306871, "learning_rate": 9.870679184602233e-06, "loss": 0.9849, "num_tokens": 36311216.0, "step": 176 }, { "epoch": 0.423444976076555, "grad_norm": 0.8119435832767165, "learning_rate": 9.868595206274812e-06, "loss": 1.0225, "num_tokens": 36478396.0, "step": 177 }, { "epoch": 0.4258373205741627, "grad_norm": 0.767066338561648, "learning_rate": 9.866494819201682e-06, "loss": 0.9216, "num_tokens": 36672268.0, "step": 178 }, { "epoch": 0.42822966507177035, "grad_norm": 0.7011545360538196, "learning_rate": 9.864378031272045e-06, "loss": 0.9498, "num_tokens": 36904693.0, "step": 179 }, { "epoch": 0.430622009569378, "grad_norm": 0.7773922051063414, "learning_rate": 9.862244850436702e-06, "loss": 0.6955, "num_tokens": 37057970.0, "step": 180 }, { "epoch": 0.43301435406698563, "grad_norm": 0.7869339535084815, "learning_rate": 9.860095284708028e-06, "loss": 0.878, "num_tokens": 37226986.0, "step": 181 }, { "epoch": 0.4354066985645933, "grad_norm": 0.6978575252288622, "learning_rate": 9.857929342159938e-06, "loss": 0.8166, "num_tokens": 37454734.0, "step": 182 }, { "epoch": 0.43779904306220097, "grad_norm": 0.7859244195562889, "learning_rate": 9.855747030927866e-06, "loss": 0.6955, "num_tokens": 37598443.0, "step": 183 }, { "epoch": 0.44019138755980863, "grad_norm": 0.7367509293500684, "learning_rate": 9.853548359208722e-06, "loss": 0.827, "num_tokens": 37772689.0, "step": 184 }, { "epoch": 0.44258373205741625, "grad_norm": 0.7415296718711833, "learning_rate": 9.851333335260869e-06, "loss": 0.9512, "num_tokens": 38004549.0, "step": 185 }, { "epoch": 0.4449760765550239, "grad_norm": 0.8674192800405747, "learning_rate": 9.84910196740409e-06, "loss": 0.8545, "num_tokens": 38159721.0, "step": 186 }, { "epoch": 0.4473684210526316, "grad_norm": 0.8172487094325894, "learning_rate": 9.846854264019556e-06, "loss": 0.9048, "num_tokens": 38361874.0, "step": 187 }, { "epoch": 0.44976076555023925, "grad_norm": 0.749271142770708, "learning_rate": 9.844590233549797e-06, "loss": 0.7697, "num_tokens": 38532356.0, "step": 188 }, { "epoch": 0.45215311004784686, "grad_norm": 0.7006067463192553, "learning_rate": 9.842309884498672e-06, "loss": 0.8987, "num_tokens": 38771369.0, "step": 189 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5978765943982213, "learning_rate": 9.840013225431322e-06, "loss": 1.0488, "num_tokens": 39038278.0, "step": 190 }, { "epoch": 0.4569377990430622, "grad_norm": 0.7887229239731655, "learning_rate": 9.837700264974163e-06, "loss": 0.8302, "num_tokens": 39240942.0, "step": 191 }, { "epoch": 0.45933014354066987, "grad_norm": 0.7502651179572393, "learning_rate": 9.835371011814832e-06, "loss": 1.0493, "num_tokens": 39391888.0, "step": 192 }, { "epoch": 0.46172248803827753, "grad_norm": 0.648210560375548, "learning_rate": 9.833025474702169e-06, "loss": 0.8346, "num_tokens": 39645186.0, "step": 193 }, { "epoch": 0.46411483253588515, "grad_norm": 0.6635996305419883, "learning_rate": 9.830663662446166e-06, "loss": 0.9651, "num_tokens": 39872583.0, "step": 194 }, { "epoch": 0.4665071770334928, "grad_norm": 0.7512488474508062, "learning_rate": 9.828285583917962e-06, "loss": 0.724, "num_tokens": 40030081.0, "step": 195 }, { "epoch": 0.4688995215311005, "grad_norm": 0.7620024583744265, "learning_rate": 9.825891248049776e-06, "loss": 0.9252, "num_tokens": 40225498.0, "step": 196 }, { "epoch": 0.47129186602870815, "grad_norm": 0.7381021014120716, "learning_rate": 9.823480663834905e-06, "loss": 0.9246, "num_tokens": 40419013.0, "step": 197 }, { "epoch": 0.47368421052631576, "grad_norm": 0.7310934363198373, "learning_rate": 9.821053840327668e-06, "loss": 0.8483, "num_tokens": 40638483.0, "step": 198 }, { "epoch": 0.47607655502392343, "grad_norm": 0.6369502362340661, "learning_rate": 9.818610786643383e-06, "loss": 1.1045, "num_tokens": 40851402.0, "step": 199 }, { "epoch": 0.4784688995215311, "grad_norm": 0.6662173138687252, "learning_rate": 9.816151511958326e-06, "loss": 0.8642, "num_tokens": 41076074.0, "step": 200 }, { "epoch": 0.48086124401913877, "grad_norm": 0.7069311757824951, "learning_rate": 9.813676025509705e-06, "loss": 0.8348, "num_tokens": 41281733.0, "step": 201 }, { "epoch": 0.48325358851674644, "grad_norm": 0.7715896724886065, "learning_rate": 9.811184336595618e-06, "loss": 0.7638, "num_tokens": 41469321.0, "step": 202 }, { "epoch": 0.48564593301435405, "grad_norm": 0.7300586617529341, "learning_rate": 9.808676454575021e-06, "loss": 0.8712, "num_tokens": 41655589.0, "step": 203 }, { "epoch": 0.4880382775119617, "grad_norm": 0.7433895285131304, "learning_rate": 9.806152388867691e-06, "loss": 0.9764, "num_tokens": 41884531.0, "step": 204 }, { "epoch": 0.4904306220095694, "grad_norm": 0.729313387708453, "learning_rate": 9.803612148954196e-06, "loss": 0.89, "num_tokens": 42067360.0, "step": 205 }, { "epoch": 0.49282296650717705, "grad_norm": 0.7368107493572161, "learning_rate": 9.80105574437585e-06, "loss": 0.6557, "num_tokens": 42220024.0, "step": 206 }, { "epoch": 0.49521531100478466, "grad_norm": 0.7182944346849723, "learning_rate": 9.798483184734685e-06, "loss": 0.999, "num_tokens": 42412973.0, "step": 207 }, { "epoch": 0.49760765550239233, "grad_norm": 0.6838259373624938, "learning_rate": 9.795894479693416e-06, "loss": 0.7809, "num_tokens": 42635600.0, "step": 208 }, { "epoch": 0.5, "grad_norm": 0.6596178695787379, "learning_rate": 9.793289638975396e-06, "loss": 0.8985, "num_tokens": 42829803.0, "step": 209 }, { "epoch": 0.5023923444976076, "grad_norm": 0.7819562505016303, "learning_rate": 9.790668672364588e-06, "loss": 1.0058, "num_tokens": 42987359.0, "step": 210 }, { "epoch": 0.5047846889952153, "grad_norm": 0.6376009810615048, "learning_rate": 9.788031589705522e-06, "loss": 0.825, "num_tokens": 43222234.0, "step": 211 }, { "epoch": 0.507177033492823, "grad_norm": 0.7170548940637805, "learning_rate": 9.785378400903265e-06, "loss": 0.8579, "num_tokens": 43415580.0, "step": 212 }, { "epoch": 0.5095693779904307, "grad_norm": 0.6943966856575938, "learning_rate": 9.782709115923377e-06, "loss": 0.8678, "num_tokens": 43636143.0, "step": 213 }, { "epoch": 0.5119617224880383, "grad_norm": 0.7777442170195412, "learning_rate": 9.780023744791875e-06, "loss": 0.9754, "num_tokens": 43788003.0, "step": 214 }, { "epoch": 0.5143540669856459, "grad_norm": 0.7565902699333324, "learning_rate": 9.777322297595203e-06, "loss": 0.8631, "num_tokens": 43936330.0, "step": 215 }, { "epoch": 0.5167464114832536, "grad_norm": 0.6723572251989621, "learning_rate": 9.77460478448018e-06, "loss": 0.8151, "num_tokens": 44164196.0, "step": 216 }, { "epoch": 0.5191387559808612, "grad_norm": 0.7222983115738448, "learning_rate": 9.771871215653973e-06, "loss": 0.9845, "num_tokens": 44362899.0, "step": 217 }, { "epoch": 0.5215311004784688, "grad_norm": 0.7125065875645107, "learning_rate": 9.769121601384057e-06, "loss": 1.0482, "num_tokens": 44601312.0, "step": 218 }, { "epoch": 0.5239234449760766, "grad_norm": 0.707677277074084, "learning_rate": 9.766355951998172e-06, "loss": 1.0536, "num_tokens": 44795812.0, "step": 219 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6953123625587964, "learning_rate": 9.76357427788429e-06, "loss": 1.0325, "num_tokens": 45011494.0, "step": 220 }, { "epoch": 0.5287081339712919, "grad_norm": 0.6209070758822427, "learning_rate": 9.760776589490571e-06, "loss": 0.8775, "num_tokens": 45271878.0, "step": 221 }, { "epoch": 0.5311004784688995, "grad_norm": 0.8181333695786353, "learning_rate": 9.757962897325327e-06, "loss": 0.7584, "num_tokens": 45395391.0, "step": 222 }, { "epoch": 0.5334928229665071, "grad_norm": 0.7353051768668135, "learning_rate": 9.755133211956976e-06, "loss": 0.997, "num_tokens": 45590083.0, "step": 223 }, { "epoch": 0.5358851674641149, "grad_norm": 0.6917954967989495, "learning_rate": 9.752287544014016e-06, "loss": 0.8478, "num_tokens": 45809050.0, "step": 224 }, { "epoch": 0.5382775119617225, "grad_norm": 0.6310463226895536, "learning_rate": 9.749425904184973e-06, "loss": 0.8349, "num_tokens": 46026582.0, "step": 225 }, { "epoch": 0.5406698564593302, "grad_norm": 0.7496878034969545, "learning_rate": 9.746548303218361e-06, "loss": 0.7081, "num_tokens": 46217058.0, "step": 226 }, { "epoch": 0.5430622009569378, "grad_norm": 0.7613261565472779, "learning_rate": 9.74365475192265e-06, "loss": 0.9549, "num_tokens": 46405604.0, "step": 227 }, { "epoch": 0.5454545454545454, "grad_norm": 0.7118103212558673, "learning_rate": 9.74074526116622e-06, "loss": 0.7397, "num_tokens": 46609383.0, "step": 228 }, { "epoch": 0.5478468899521531, "grad_norm": 0.7886609868852241, "learning_rate": 9.737819841877319e-06, "loss": 0.6614, "num_tokens": 46774027.0, "step": 229 }, { "epoch": 0.5502392344497608, "grad_norm": 0.7512297618411289, "learning_rate": 9.734878505044022e-06, "loss": 0.8708, "num_tokens": 46934136.0, "step": 230 }, { "epoch": 0.5526315789473685, "grad_norm": 0.6467942759269715, "learning_rate": 9.731921261714195e-06, "loss": 0.9094, "num_tokens": 47166689.0, "step": 231 }, { "epoch": 0.5550239234449761, "grad_norm": 0.6186151914065757, "learning_rate": 9.728948122995448e-06, "loss": 0.74, "num_tokens": 47407182.0, "step": 232 }, { "epoch": 0.5574162679425837, "grad_norm": 0.7546119364082358, "learning_rate": 9.725959100055095e-06, "loss": 0.8622, "num_tokens": 47621230.0, "step": 233 }, { "epoch": 0.5598086124401914, "grad_norm": 0.7016789780653074, "learning_rate": 9.722954204120116e-06, "loss": 1.1317, "num_tokens": 47839958.0, "step": 234 }, { "epoch": 0.562200956937799, "grad_norm": 0.6830572301668759, "learning_rate": 9.719933446477104e-06, "loss": 1.0093, "num_tokens": 48044769.0, "step": 235 }, { "epoch": 0.5645933014354066, "grad_norm": 0.6848757588607501, "learning_rate": 9.716896838472237e-06, "loss": 1.0054, "num_tokens": 48254718.0, "step": 236 }, { "epoch": 0.5669856459330144, "grad_norm": 0.7931488360108984, "learning_rate": 9.71384439151122e-06, "loss": 0.5908, "num_tokens": 48403707.0, "step": 237 }, { "epoch": 0.569377990430622, "grad_norm": 0.7468278674122727, "learning_rate": 9.710776117059262e-06, "loss": 0.8507, "num_tokens": 48580654.0, "step": 238 }, { "epoch": 0.5717703349282297, "grad_norm": 0.7928730950381853, "learning_rate": 9.707692026641007e-06, "loss": 1.0811, "num_tokens": 48757864.0, "step": 239 }, { "epoch": 0.5741626794258373, "grad_norm": 0.8167247736000246, "learning_rate": 9.704592131840517e-06, "loss": 0.8996, "num_tokens": 48948795.0, "step": 240 }, { "epoch": 0.5765550239234449, "grad_norm": 0.7576343998163653, "learning_rate": 9.701476444301208e-06, "loss": 0.8986, "num_tokens": 49137680.0, "step": 241 }, { "epoch": 0.5789473684210527, "grad_norm": 0.7136634589206409, "learning_rate": 9.69834497572582e-06, "loss": 0.969, "num_tokens": 49330369.0, "step": 242 }, { "epoch": 0.5813397129186603, "grad_norm": 0.6736332498636337, "learning_rate": 9.695197737876364e-06, "loss": 0.9318, "num_tokens": 49545446.0, "step": 243 }, { "epoch": 0.583732057416268, "grad_norm": 0.6714841653229451, "learning_rate": 9.692034742574082e-06, "loss": 0.8462, "num_tokens": 49779498.0, "step": 244 }, { "epoch": 0.5861244019138756, "grad_norm": 0.8175525625058889, "learning_rate": 9.688856001699407e-06, "loss": 0.9611, "num_tokens": 49952841.0, "step": 245 }, { "epoch": 0.5885167464114832, "grad_norm": 0.7409257480013611, "learning_rate": 9.685661527191905e-06, "loss": 0.9577, "num_tokens": 50151322.0, "step": 246 }, { "epoch": 0.5909090909090909, "grad_norm": 0.7504645386701574, "learning_rate": 9.682451331050246e-06, "loss": 1.0545, "num_tokens": 50349864.0, "step": 247 }, { "epoch": 0.5933014354066986, "grad_norm": 0.693369456244369, "learning_rate": 9.679225425332147e-06, "loss": 0.9629, "num_tokens": 50584303.0, "step": 248 }, { "epoch": 0.5956937799043063, "grad_norm": 0.646410500745444, "learning_rate": 9.675983822154336e-06, "loss": 0.8581, "num_tokens": 50818111.0, "step": 249 }, { "epoch": 0.5980861244019139, "grad_norm": 0.7492761839244249, "learning_rate": 9.672726533692496e-06, "loss": 0.9365, "num_tokens": 51034934.0, "step": 250 }, { "epoch": 0.6004784688995215, "grad_norm": 0.7875923284327673, "learning_rate": 9.66945357218123e-06, "loss": 0.8799, "num_tokens": 51196696.0, "step": 251 }, { "epoch": 0.6028708133971292, "grad_norm": 0.6890802350922746, "learning_rate": 9.666164949914003e-06, "loss": 0.8263, "num_tokens": 51394657.0, "step": 252 }, { "epoch": 0.6052631578947368, "grad_norm": 0.6609546018572526, "learning_rate": 9.662860679243112e-06, "loss": 0.7737, "num_tokens": 51629548.0, "step": 253 }, { "epoch": 0.6076555023923444, "grad_norm": 0.6388346290233738, "learning_rate": 9.659540772579626e-06, "loss": 0.7746, "num_tokens": 51859831.0, "step": 254 }, { "epoch": 0.6100478468899522, "grad_norm": 0.7630946831333012, "learning_rate": 9.656205242393341e-06, "loss": 0.8517, "num_tokens": 52014662.0, "step": 255 }, { "epoch": 0.6124401913875598, "grad_norm": 0.7099062445428919, "learning_rate": 9.652854101212744e-06, "loss": 1.1071, "num_tokens": 52240815.0, "step": 256 }, { "epoch": 0.6148325358851675, "grad_norm": 0.6754326767149219, "learning_rate": 9.64948736162495e-06, "loss": 1.019, "num_tokens": 52474992.0, "step": 257 }, { "epoch": 0.6172248803827751, "grad_norm": 0.739542831715403, "learning_rate": 9.646105036275663e-06, "loss": 0.7532, "num_tokens": 52651126.0, "step": 258 }, { "epoch": 0.6196172248803827, "grad_norm": 0.728643865556642, "learning_rate": 9.642707137869136e-06, "loss": 0.8722, "num_tokens": 52820774.0, "step": 259 }, { "epoch": 0.6220095693779905, "grad_norm": 0.7086257685596635, "learning_rate": 9.639293679168105e-06, "loss": 0.6187, "num_tokens": 53018322.0, "step": 260 }, { "epoch": 0.6244019138755981, "grad_norm": 0.7399683012373918, "learning_rate": 9.635864672993764e-06, "loss": 0.872, "num_tokens": 53219194.0, "step": 261 }, { "epoch": 0.6267942583732058, "grad_norm": 0.7661507351405599, "learning_rate": 9.632420132225691e-06, "loss": 0.6345, "num_tokens": 53387444.0, "step": 262 }, { "epoch": 0.6291866028708134, "grad_norm": 0.7069621149226814, "learning_rate": 9.628960069801825e-06, "loss": 0.8452, "num_tokens": 53626979.0, "step": 263 }, { "epoch": 0.631578947368421, "grad_norm": 0.797463821355159, "learning_rate": 9.625484498718392e-06, "loss": 0.7205, "num_tokens": 53774781.0, "step": 264 }, { "epoch": 0.6339712918660287, "grad_norm": 0.7005373002761818, "learning_rate": 9.621993432029884e-06, "loss": 1.0427, "num_tokens": 53957662.0, "step": 265 }, { "epoch": 0.6363636363636364, "grad_norm": 0.7101631369249962, "learning_rate": 9.618486882848988e-06, "loss": 0.7358, "num_tokens": 54144418.0, "step": 266 }, { "epoch": 0.638755980861244, "grad_norm": 0.7439018504530075, "learning_rate": 9.614964864346541e-06, "loss": 0.7923, "num_tokens": 54321948.0, "step": 267 }, { "epoch": 0.6411483253588517, "grad_norm": 0.704791131563084, "learning_rate": 9.611427389751494e-06, "loss": 0.6775, "num_tokens": 54511012.0, "step": 268 }, { "epoch": 0.6435406698564593, "grad_norm": 0.718086612467513, "learning_rate": 9.607874472350842e-06, "loss": 0.6723, "num_tokens": 54715882.0, "step": 269 }, { "epoch": 0.645933014354067, "grad_norm": 0.7143312089646857, "learning_rate": 9.60430612548959e-06, "loss": 1.0111, "num_tokens": 54914319.0, "step": 270 }, { "epoch": 0.6483253588516746, "grad_norm": 0.6755764277181526, "learning_rate": 9.600722362570694e-06, "loss": 1.0113, "num_tokens": 55122172.0, "step": 271 }, { "epoch": 0.6507177033492823, "grad_norm": 0.7761302927431148, "learning_rate": 9.59712319705502e-06, "loss": 0.8245, "num_tokens": 55323698.0, "step": 272 }, { "epoch": 0.65311004784689, "grad_norm": 0.7340335032058501, "learning_rate": 9.593508642461276e-06, "loss": 0.9043, "num_tokens": 55531439.0, "step": 273 }, { "epoch": 0.6555023923444976, "grad_norm": 0.7588348203486196, "learning_rate": 9.589878712365982e-06, "loss": 0.624, "num_tokens": 55692316.0, "step": 274 }, { "epoch": 0.6578947368421053, "grad_norm": 0.7550464009356935, "learning_rate": 9.586233420403407e-06, "loss": 1.1432, "num_tokens": 55928859.0, "step": 275 }, { "epoch": 0.6602870813397129, "grad_norm": 0.7135616341229885, "learning_rate": 9.582572780265521e-06, "loss": 0.7436, "num_tokens": 56121111.0, "step": 276 }, { "epoch": 0.6626794258373205, "grad_norm": 0.6371530652389988, "learning_rate": 9.578896805701939e-06, "loss": 0.9986, "num_tokens": 56359401.0, "step": 277 }, { "epoch": 0.6650717703349283, "grad_norm": 0.683186850513734, "learning_rate": 9.575205510519878e-06, "loss": 0.8857, "num_tokens": 56581344.0, "step": 278 }, { "epoch": 0.6674641148325359, "grad_norm": 0.6810286701061241, "learning_rate": 9.571498908584097e-06, "loss": 1.0477, "num_tokens": 56788478.0, "step": 279 }, { "epoch": 0.6698564593301436, "grad_norm": 0.6142445432056164, "learning_rate": 9.567777013816846e-06, "loss": 1.1127, "num_tokens": 57018493.0, "step": 280 }, { "epoch": 0.6722488038277512, "grad_norm": 0.7105090692349689, "learning_rate": 9.564039840197824e-06, "loss": 0.9211, "num_tokens": 57201946.0, "step": 281 }, { "epoch": 0.6746411483253588, "grad_norm": 0.7441838628071439, "learning_rate": 9.560287401764109e-06, "loss": 0.8588, "num_tokens": 57378278.0, "step": 282 }, { "epoch": 0.6770334928229665, "grad_norm": 0.616622351957538, "learning_rate": 9.556519712610122e-06, "loss": 0.9045, "num_tokens": 57597843.0, "step": 283 }, { "epoch": 0.6794258373205742, "grad_norm": 0.6650978269716845, "learning_rate": 9.552736786887566e-06, "loss": 0.8173, "num_tokens": 57812696.0, "step": 284 }, { "epoch": 0.6818181818181818, "grad_norm": 0.6669101745672817, "learning_rate": 9.548938638805369e-06, "loss": 0.8258, "num_tokens": 58031586.0, "step": 285 }, { "epoch": 0.6842105263157895, "grad_norm": 0.712052565123226, "learning_rate": 9.545125282629639e-06, "loss": 0.6606, "num_tokens": 58219867.0, "step": 286 }, { "epoch": 0.6866028708133971, "grad_norm": 0.6903410307218643, "learning_rate": 9.541296732683601e-06, "loss": 0.9946, "num_tokens": 58455804.0, "step": 287 }, { "epoch": 0.6889952153110048, "grad_norm": 0.7028207533903529, "learning_rate": 9.537453003347562e-06, "loss": 0.7778, "num_tokens": 58647209.0, "step": 288 }, { "epoch": 0.6913875598086124, "grad_norm": 0.5935012950917158, "learning_rate": 9.533594109058827e-06, "loss": 0.6958, "num_tokens": 58876476.0, "step": 289 }, { "epoch": 0.69377990430622, "grad_norm": 0.7147877252504475, "learning_rate": 9.529720064311674e-06, "loss": 0.8571, "num_tokens": 59054211.0, "step": 290 }, { "epoch": 0.6961722488038278, "grad_norm": 0.6165616093448105, "learning_rate": 9.525830883657283e-06, "loss": 0.9089, "num_tokens": 59274415.0, "step": 291 }, { "epoch": 0.6985645933014354, "grad_norm": 0.7262837935686104, "learning_rate": 9.521926581703682e-06, "loss": 0.8738, "num_tokens": 59494463.0, "step": 292 }, { "epoch": 0.7009569377990431, "grad_norm": 0.7227392632343834, "learning_rate": 9.518007173115705e-06, "loss": 0.9422, "num_tokens": 59683825.0, "step": 293 }, { "epoch": 0.7033492822966507, "grad_norm": 0.657716296887696, "learning_rate": 9.514072672614915e-06, "loss": 0.9122, "num_tokens": 59900990.0, "step": 294 }, { "epoch": 0.7057416267942583, "grad_norm": 0.6950627976694991, "learning_rate": 9.510123094979575e-06, "loss": 0.7463, "num_tokens": 60100020.0, "step": 295 }, { "epoch": 0.7081339712918661, "grad_norm": 0.7417107599439859, "learning_rate": 9.506158455044567e-06, "loss": 0.6647, "num_tokens": 60267424.0, "step": 296 }, { "epoch": 0.7105263157894737, "grad_norm": 0.7175495863527874, "learning_rate": 9.502178767701353e-06, "loss": 0.9446, "num_tokens": 60452992.0, "step": 297 }, { "epoch": 0.7129186602870813, "grad_norm": 0.7618820197933083, "learning_rate": 9.498184047897915e-06, "loss": 0.919, "num_tokens": 60618749.0, "step": 298 }, { "epoch": 0.715311004784689, "grad_norm": 0.6170834471553733, "learning_rate": 9.494174310638697e-06, "loss": 0.8629, "num_tokens": 60883079.0, "step": 299 }, { "epoch": 0.7177033492822966, "grad_norm": 0.7814822668163934, "learning_rate": 9.490149570984549e-06, "loss": 0.8538, "num_tokens": 61050703.0, "step": 300 }, { "epoch": 0.7200956937799043, "grad_norm": 0.7973106331385097, "learning_rate": 9.48610984405267e-06, "loss": 0.9464, "num_tokens": 61281941.0, "step": 301 }, { "epoch": 0.722488038277512, "grad_norm": 0.7386439891400786, "learning_rate": 9.482055145016557e-06, "loss": 1.0107, "num_tokens": 61500702.0, "step": 302 }, { "epoch": 0.7248803827751196, "grad_norm": 0.7576189557606913, "learning_rate": 9.47798548910594e-06, "loss": 0.7948, "num_tokens": 61677831.0, "step": 303 }, { "epoch": 0.7272727272727273, "grad_norm": 0.582639751482507, "learning_rate": 9.473900891606724e-06, "loss": 1.0505, "num_tokens": 61943094.0, "step": 304 }, { "epoch": 0.7296650717703349, "grad_norm": 0.6975691338736785, "learning_rate": 9.469801367860944e-06, "loss": 0.8864, "num_tokens": 62162858.0, "step": 305 }, { "epoch": 0.7320574162679426, "grad_norm": 0.6001353096390548, "learning_rate": 9.46568693326669e-06, "loss": 0.8886, "num_tokens": 62440988.0, "step": 306 }, { "epoch": 0.7344497607655502, "grad_norm": 0.6694234470054652, "learning_rate": 9.461557603278066e-06, "loss": 0.8452, "num_tokens": 62675134.0, "step": 307 }, { "epoch": 0.7368421052631579, "grad_norm": 0.6920337040450307, "learning_rate": 9.457413393405122e-06, "loss": 0.8468, "num_tokens": 62839459.0, "step": 308 }, { "epoch": 0.7392344497607656, "grad_norm": 0.6664718554483587, "learning_rate": 9.453254319213794e-06, "loss": 0.9225, "num_tokens": 63032332.0, "step": 309 }, { "epoch": 0.7416267942583732, "grad_norm": 0.7044180002802695, "learning_rate": 9.449080396325855e-06, "loss": 0.6374, "num_tokens": 63244272.0, "step": 310 }, { "epoch": 0.7440191387559809, "grad_norm": 0.7003159969901962, "learning_rate": 9.444891640418847e-06, "loss": 0.9323, "num_tokens": 63466930.0, "step": 311 }, { "epoch": 0.7464114832535885, "grad_norm": 0.6724163707523774, "learning_rate": 9.440688067226025e-06, "loss": 0.957, "num_tokens": 63666983.0, "step": 312 }, { "epoch": 0.7488038277511961, "grad_norm": 0.6990957415189266, "learning_rate": 9.436469692536303e-06, "loss": 1.1225, "num_tokens": 63921481.0, "step": 313 }, { "epoch": 0.7511961722488039, "grad_norm": 0.7267577309004162, "learning_rate": 9.432236532194188e-06, "loss": 0.9527, "num_tokens": 64108035.0, "step": 314 }, { "epoch": 0.7535885167464115, "grad_norm": 0.6565759412319756, "learning_rate": 9.427988602099722e-06, "loss": 0.8312, "num_tokens": 64347582.0, "step": 315 }, { "epoch": 0.7559808612440191, "grad_norm": 0.6989421029736691, "learning_rate": 9.423725918208422e-06, "loss": 1.0384, "num_tokens": 64565695.0, "step": 316 }, { "epoch": 0.7583732057416268, "grad_norm": 0.8075590633967065, "learning_rate": 9.41944849653123e-06, "loss": 0.7874, "num_tokens": 64736228.0, "step": 317 }, { "epoch": 0.7607655502392344, "grad_norm": 0.7420969172241604, "learning_rate": 9.41515635313443e-06, "loss": 0.8776, "num_tokens": 64919765.0, "step": 318 }, { "epoch": 0.7631578947368421, "grad_norm": 0.8074410953534803, "learning_rate": 9.410849504139615e-06, "loss": 0.8007, "num_tokens": 65093331.0, "step": 319 }, { "epoch": 0.7655502392344498, "grad_norm": 0.7359810070319559, "learning_rate": 9.406527965723606e-06, "loss": 0.775, "num_tokens": 65281368.0, "step": 320 }, { "epoch": 0.7679425837320574, "grad_norm": 0.671250608829433, "learning_rate": 9.402191754118398e-06, "loss": 0.8672, "num_tokens": 65474208.0, "step": 321 }, { "epoch": 0.7703349282296651, "grad_norm": 0.70087002173681, "learning_rate": 9.397840885611105e-06, "loss": 0.8645, "num_tokens": 65700409.0, "step": 322 }, { "epoch": 0.7727272727272727, "grad_norm": 0.6634659621815793, "learning_rate": 9.393475376543888e-06, "loss": 0.9032, "num_tokens": 65968264.0, "step": 323 }, { "epoch": 0.7751196172248804, "grad_norm": 0.6855924238506387, "learning_rate": 9.389095243313901e-06, "loss": 0.7714, "num_tokens": 66153787.0, "step": 324 }, { "epoch": 0.777511961722488, "grad_norm": 0.6892245239453134, "learning_rate": 9.384700502373227e-06, "loss": 0.6299, "num_tokens": 66343926.0, "step": 325 }, { "epoch": 0.7799043062200957, "grad_norm": 0.7020557300227046, "learning_rate": 9.380291170228819e-06, "loss": 0.7912, "num_tokens": 66543908.0, "step": 326 }, { "epoch": 0.7822966507177034, "grad_norm": 0.7371763474019104, "learning_rate": 9.375867263442433e-06, "loss": 0.9272, "num_tokens": 66771074.0, "step": 327 }, { "epoch": 0.784688995215311, "grad_norm": 0.7796597325368293, "learning_rate": 9.371428798630567e-06, "loss": 0.8503, "num_tokens": 66937799.0, "step": 328 }, { "epoch": 0.7870813397129187, "grad_norm": 0.750139825760097, "learning_rate": 9.366975792464402e-06, "loss": 0.8718, "num_tokens": 67141282.0, "step": 329 }, { "epoch": 0.7894736842105263, "grad_norm": 0.7573777835614758, "learning_rate": 9.362508261669739e-06, "loss": 0.7777, "num_tokens": 67344223.0, "step": 330 }, { "epoch": 0.7918660287081339, "grad_norm": 0.7196830972315318, "learning_rate": 9.358026223026931e-06, "loss": 0.9235, "num_tokens": 67523494.0, "step": 331 }, { "epoch": 0.7942583732057417, "grad_norm": 0.7148268968783641, "learning_rate": 9.353529693370827e-06, "loss": 0.5793, "num_tokens": 67707250.0, "step": 332 }, { "epoch": 0.7966507177033493, "grad_norm": 0.7183862252835895, "learning_rate": 9.3490186895907e-06, "loss": 0.9153, "num_tokens": 67887746.0, "step": 333 }, { "epoch": 0.7990430622009569, "grad_norm": 0.6783981868743665, "learning_rate": 9.344493228630194e-06, "loss": 1.0557, "num_tokens": 68117792.0, "step": 334 }, { "epoch": 0.8014354066985646, "grad_norm": 0.6366624212064868, "learning_rate": 9.339953327487254e-06, "loss": 0.7942, "num_tokens": 68366492.0, "step": 335 }, { "epoch": 0.8038277511961722, "grad_norm": 0.7078016914757955, "learning_rate": 9.335399003214062e-06, "loss": 0.5402, "num_tokens": 68548241.0, "step": 336 }, { "epoch": 0.80622009569378, "grad_norm": 0.6522725955130363, "learning_rate": 9.330830272916974e-06, "loss": 0.9855, "num_tokens": 68763750.0, "step": 337 }, { "epoch": 0.8086124401913876, "grad_norm": 0.6214168725386654, "learning_rate": 9.326247153756459e-06, "loss": 0.86, "num_tokens": 69001165.0, "step": 338 }, { "epoch": 0.8110047846889952, "grad_norm": 0.7314720468232042, "learning_rate": 9.321649662947027e-06, "loss": 0.5351, "num_tokens": 69178248.0, "step": 339 }, { "epoch": 0.8133971291866029, "grad_norm": 0.65272447368887, "learning_rate": 9.317037817757175e-06, "loss": 0.8351, "num_tokens": 69428803.0, "step": 340 }, { "epoch": 0.8157894736842105, "grad_norm": 0.5839621781348253, "learning_rate": 9.312411635509309e-06, "loss": 1.0737, "num_tokens": 69712302.0, "step": 341 }, { "epoch": 0.8181818181818182, "grad_norm": 0.7016886773959953, "learning_rate": 9.307771133579691e-06, "loss": 0.9811, "num_tokens": 69928563.0, "step": 342 }, { "epoch": 0.8205741626794258, "grad_norm": 0.6965166415390514, "learning_rate": 9.303116329398367e-06, "loss": 0.842, "num_tokens": 70161341.0, "step": 343 }, { "epoch": 0.8229665071770335, "grad_norm": 0.6875678411924595, "learning_rate": 9.298447240449104e-06, "loss": 0.7473, "num_tokens": 70381060.0, "step": 344 }, { "epoch": 0.8253588516746412, "grad_norm": 0.6438807693082683, "learning_rate": 9.293763884269321e-06, "loss": 0.8741, "num_tokens": 70598036.0, "step": 345 }, { "epoch": 0.8277511961722488, "grad_norm": 0.7613247244695108, "learning_rate": 9.28906627845003e-06, "loss": 0.7014, "num_tokens": 70784049.0, "step": 346 }, { "epoch": 0.8301435406698564, "grad_norm": 0.6738748732421641, "learning_rate": 9.284354440635763e-06, "loss": 0.7834, "num_tokens": 70976591.0, "step": 347 }, { "epoch": 0.8325358851674641, "grad_norm": 0.699827251012226, "learning_rate": 9.279628388524506e-06, "loss": 0.5531, "num_tokens": 71141435.0, "step": 348 }, { "epoch": 0.8349282296650717, "grad_norm": 0.6478605379617053, "learning_rate": 9.274888139867642e-06, "loss": 0.6004, "num_tokens": 71392389.0, "step": 349 }, { "epoch": 0.8373205741626795, "grad_norm": 0.6628070766800735, "learning_rate": 9.27013371246987e-06, "loss": 0.6012, "num_tokens": 71589359.0, "step": 350 }, { "epoch": 0.8397129186602871, "grad_norm": 0.6747088798563603, "learning_rate": 9.265365124189147e-06, "loss": 0.793, "num_tokens": 71781509.0, "step": 351 }, { "epoch": 0.8421052631578947, "grad_norm": 0.6633362054226132, "learning_rate": 9.26058239293662e-06, "loss": 0.841, "num_tokens": 72024732.0, "step": 352 }, { "epoch": 0.8444976076555024, "grad_norm": 0.6728638102656455, "learning_rate": 9.255785536676563e-06, "loss": 0.9799, "num_tokens": 72239477.0, "step": 353 }, { "epoch": 0.84688995215311, "grad_norm": 0.7031237520856609, "learning_rate": 9.250974573426298e-06, "loss": 0.6896, "num_tokens": 72394017.0, "step": 354 }, { "epoch": 0.8492822966507177, "grad_norm": 0.6852896746420252, "learning_rate": 9.24614952125613e-06, "loss": 0.8719, "num_tokens": 72580756.0, "step": 355 }, { "epoch": 0.8516746411483254, "grad_norm": 0.6138862436489377, "learning_rate": 9.241310398289298e-06, "loss": 0.5569, "num_tokens": 72818717.0, "step": 356 }, { "epoch": 0.854066985645933, "grad_norm": 0.6957775906573956, "learning_rate": 9.236457222701877e-06, "loss": 0.9885, "num_tokens": 73026311.0, "step": 357 }, { "epoch": 0.8564593301435407, "grad_norm": 0.6881414684962115, "learning_rate": 9.23159001272273e-06, "loss": 0.9276, "num_tokens": 73229574.0, "step": 358 }, { "epoch": 0.8588516746411483, "grad_norm": 0.6506924307578823, "learning_rate": 9.226708786633436e-06, "loss": 0.8401, "num_tokens": 73470348.0, "step": 359 }, { "epoch": 0.861244019138756, "grad_norm": 0.6825146769213832, "learning_rate": 9.221813562768215e-06, "loss": 0.8945, "num_tokens": 73651355.0, "step": 360 }, { "epoch": 0.8636363636363636, "grad_norm": 0.6636653793892663, "learning_rate": 9.21690435951387e-06, "loss": 1.0136, "num_tokens": 73867323.0, "step": 361 }, { "epoch": 0.8660287081339713, "grad_norm": 0.682534402054643, "learning_rate": 9.211981195309706e-06, "loss": 0.745, "num_tokens": 74056459.0, "step": 362 }, { "epoch": 0.868421052631579, "grad_norm": 0.675060886089477, "learning_rate": 9.207044088647469e-06, "loss": 0.9638, "num_tokens": 74297192.0, "step": 363 }, { "epoch": 0.8708133971291866, "grad_norm": 0.6614026961374179, "learning_rate": 9.202093058071271e-06, "loss": 0.7406, "num_tokens": 74493323.0, "step": 364 }, { "epoch": 0.8732057416267942, "grad_norm": 0.7020816511860245, "learning_rate": 9.197128122177529e-06, "loss": 0.7018, "num_tokens": 74657888.0, "step": 365 }, { "epoch": 0.8755980861244019, "grad_norm": 0.7344090939912941, "learning_rate": 9.192149299614881e-06, "loss": 0.9503, "num_tokens": 74843306.0, "step": 366 }, { "epoch": 0.8779904306220095, "grad_norm": 0.7127202448586417, "learning_rate": 9.187156609084132e-06, "loss": 0.8734, "num_tokens": 75041637.0, "step": 367 }, { "epoch": 0.8803827751196173, "grad_norm": 0.7554609285292794, "learning_rate": 9.182150069338173e-06, "loss": 0.8169, "num_tokens": 75216247.0, "step": 368 }, { "epoch": 0.8827751196172249, "grad_norm": 0.6837024175271184, "learning_rate": 9.177129699181911e-06, "loss": 0.8137, "num_tokens": 75397921.0, "step": 369 }, { "epoch": 0.8851674641148325, "grad_norm": 0.6828943305704646, "learning_rate": 9.172095517472207e-06, "loss": 0.7669, "num_tokens": 75582214.0, "step": 370 }, { "epoch": 0.8875598086124402, "grad_norm": 0.7170012830419972, "learning_rate": 9.167047543117792e-06, "loss": 0.7986, "num_tokens": 75749744.0, "step": 371 }, { "epoch": 0.8899521531100478, "grad_norm": 0.6160468357211861, "learning_rate": 9.16198579507921e-06, "loss": 0.7408, "num_tokens": 75977317.0, "step": 372 }, { "epoch": 0.8923444976076556, "grad_norm": 0.5876983959606766, "learning_rate": 9.156910292368735e-06, "loss": 0.7504, "num_tokens": 76181889.0, "step": 373 }, { "epoch": 0.8947368421052632, "grad_norm": 0.5773610843835477, "learning_rate": 9.151821054050304e-06, "loss": 0.8454, "num_tokens": 76438027.0, "step": 374 }, { "epoch": 0.8971291866028708, "grad_norm": 0.6552968917204041, "learning_rate": 9.146718099239453e-06, "loss": 0.8435, "num_tokens": 76623156.0, "step": 375 }, { "epoch": 0.8995215311004785, "grad_norm": 0.5899716503345066, "learning_rate": 9.14160144710323e-06, "loss": 0.8287, "num_tokens": 76850856.0, "step": 376 }, { "epoch": 0.9019138755980861, "grad_norm": 0.6987492577459424, "learning_rate": 9.136471116860135e-06, "loss": 0.9, "num_tokens": 77008225.0, "step": 377 }, { "epoch": 0.9043062200956937, "grad_norm": 0.7290004187885213, "learning_rate": 9.131327127780042e-06, "loss": 0.7641, "num_tokens": 77219739.0, "step": 378 }, { "epoch": 0.9066985645933014, "grad_norm": 0.6480714471754585, "learning_rate": 9.126169499184133e-06, "loss": 0.6829, "num_tokens": 77397964.0, "step": 379 }, { "epoch": 0.9090909090909091, "grad_norm": 0.690068715632962, "learning_rate": 9.120998250444814e-06, "loss": 0.8802, "num_tokens": 77587726.0, "step": 380 }, { "epoch": 0.9114832535885168, "grad_norm": 0.6432928968716475, "learning_rate": 9.115813400985655e-06, "loss": 0.7432, "num_tokens": 77811303.0, "step": 381 }, { "epoch": 0.9138755980861244, "grad_norm": 0.6914007018250378, "learning_rate": 9.110614970281308e-06, "loss": 1.0123, "num_tokens": 78007317.0, "step": 382 }, { "epoch": 0.916267942583732, "grad_norm": 0.613689907821979, "learning_rate": 9.105402977857438e-06, "loss": 0.9744, "num_tokens": 78274135.0, "step": 383 }, { "epoch": 0.9186602870813397, "grad_norm": 0.7804443703479781, "learning_rate": 9.100177443290651e-06, "loss": 1.0173, "num_tokens": 78412125.0, "step": 384 }, { "epoch": 0.9210526315789473, "grad_norm": 0.6498222123812953, "learning_rate": 9.094938386208413e-06, "loss": 0.8756, "num_tokens": 78639268.0, "step": 385 }, { "epoch": 0.9234449760765551, "grad_norm": 0.7035590974736037, "learning_rate": 9.089685826288988e-06, "loss": 1.0546, "num_tokens": 78874435.0, "step": 386 }, { "epoch": 0.9258373205741627, "grad_norm": 0.6759814498789215, "learning_rate": 9.084419783261352e-06, "loss": 0.739, "num_tokens": 79069513.0, "step": 387 }, { "epoch": 0.9282296650717703, "grad_norm": 0.7725332697456008, "learning_rate": 9.07914027690513e-06, "loss": 0.8059, "num_tokens": 79220713.0, "step": 388 }, { "epoch": 0.930622009569378, "grad_norm": 0.6396922845104327, "learning_rate": 9.073847327050508e-06, "loss": 0.8805, "num_tokens": 79421216.0, "step": 389 }, { "epoch": 0.9330143540669856, "grad_norm": 0.6816021616837575, "learning_rate": 9.068540953578177e-06, "loss": 0.889, "num_tokens": 79617679.0, "step": 390 }, { "epoch": 0.9354066985645934, "grad_norm": 0.6625189742716404, "learning_rate": 9.063221176419239e-06, "loss": 0.7465, "num_tokens": 79815971.0, "step": 391 }, { "epoch": 0.937799043062201, "grad_norm": 0.7346141410671172, "learning_rate": 9.057888015555146e-06, "loss": 0.9849, "num_tokens": 79998530.0, "step": 392 }, { "epoch": 0.9401913875598086, "grad_norm": 0.7256211133967554, "learning_rate": 9.052541491017617e-06, "loss": 0.9619, "num_tokens": 80177175.0, "step": 393 }, { "epoch": 0.9425837320574163, "grad_norm": 0.6160901991706935, "learning_rate": 9.047181622888573e-06, "loss": 0.9088, "num_tokens": 80414212.0, "step": 394 }, { "epoch": 0.9449760765550239, "grad_norm": 0.6347176991976088, "learning_rate": 9.041808431300045e-06, "loss": 0.6943, "num_tokens": 80641688.0, "step": 395 }, { "epoch": 0.9473684210526315, "grad_norm": 0.7262184312989681, "learning_rate": 9.036421936434113e-06, "loss": 0.6596, "num_tokens": 80799917.0, "step": 396 }, { "epoch": 0.9497607655502392, "grad_norm": 0.7600322039283162, "learning_rate": 9.031022158522825e-06, "loss": 0.7362, "num_tokens": 80966302.0, "step": 397 }, { "epoch": 0.9521531100478469, "grad_norm": 0.6116472259730488, "learning_rate": 9.025609117848121e-06, "loss": 0.8165, "num_tokens": 81200995.0, "step": 398 }, { "epoch": 0.9545454545454546, "grad_norm": 0.6381990133240583, "learning_rate": 9.020182834741753e-06, "loss": 0.7013, "num_tokens": 81402899.0, "step": 399 }, { "epoch": 0.9569377990430622, "grad_norm": 0.6650882585682993, "learning_rate": 9.014743329585219e-06, "loss": 0.7528, "num_tokens": 81596933.0, "step": 400 }, { "epoch": 0.9593301435406698, "grad_norm": 0.7178489094483574, "learning_rate": 9.009290622809675e-06, "loss": 0.8122, "num_tokens": 81787868.0, "step": 401 }, { "epoch": 0.9617224880382775, "grad_norm": 0.678302286870584, "learning_rate": 9.003824734895865e-06, "loss": 0.8968, "num_tokens": 81985553.0, "step": 402 }, { "epoch": 0.9641148325358851, "grad_norm": 0.5812234524449984, "learning_rate": 8.998345686374041e-06, "loss": 0.4995, "num_tokens": 82196566.0, "step": 403 }, { "epoch": 0.9665071770334929, "grad_norm": 0.6680187444668858, "learning_rate": 8.992853497823892e-06, "loss": 0.9803, "num_tokens": 82416154.0, "step": 404 }, { "epoch": 0.9688995215311005, "grad_norm": 0.5808799825851646, "learning_rate": 8.987348189874452e-06, "loss": 0.8295, "num_tokens": 82679639.0, "step": 405 }, { "epoch": 0.9712918660287081, "grad_norm": 0.7747877111516284, "learning_rate": 8.981829783204043e-06, "loss": 0.8987, "num_tokens": 82865891.0, "step": 406 }, { "epoch": 0.9736842105263158, "grad_norm": 0.6578557909185174, "learning_rate": 8.97629829854018e-06, "loss": 0.7227, "num_tokens": 83036757.0, "step": 407 }, { "epoch": 0.9760765550239234, "grad_norm": 0.7083961242499112, "learning_rate": 8.970753756659503e-06, "loss": 0.823, "num_tokens": 83209010.0, "step": 408 }, { "epoch": 0.9784688995215312, "grad_norm": 0.679889621867238, "learning_rate": 8.965196178387694e-06, "loss": 1.0539, "num_tokens": 83414309.0, "step": 409 }, { "epoch": 0.9808612440191388, "grad_norm": 0.7007964954333357, "learning_rate": 8.959625584599402e-06, "loss": 0.85, "num_tokens": 83592167.0, "step": 410 }, { "epoch": 0.9832535885167464, "grad_norm": 0.6438718189253033, "learning_rate": 8.954041996218161e-06, "loss": 0.7298, "num_tokens": 83805708.0, "step": 411 }, { "epoch": 0.9856459330143541, "grad_norm": 0.7316281539767336, "learning_rate": 8.948445434216318e-06, "loss": 0.7838, "num_tokens": 83949063.0, "step": 412 }, { "epoch": 0.9880382775119617, "grad_norm": 0.6926173532035703, "learning_rate": 8.942835919614944e-06, "loss": 0.6193, "num_tokens": 84162009.0, "step": 413 }, { "epoch": 0.9904306220095693, "grad_norm": 0.7817319996989844, "learning_rate": 8.937213473483764e-06, "loss": 0.8445, "num_tokens": 84364009.0, "step": 414 }, { "epoch": 0.992822966507177, "grad_norm": 0.7552374300980621, "learning_rate": 8.931578116941076e-06, "loss": 0.7978, "num_tokens": 84546258.0, "step": 415 }, { "epoch": 0.9952153110047847, "grad_norm": 0.6591622274548676, "learning_rate": 8.925929871153669e-06, "loss": 0.8502, "num_tokens": 84772093.0, "step": 416 }, { "epoch": 0.9976076555023924, "grad_norm": 0.6609823889041686, "learning_rate": 8.920268757336743e-06, "loss": 0.9196, "num_tokens": 85012261.0, "step": 417 }, { "epoch": 1.0, "grad_norm": 0.6485021614358057, "learning_rate": 8.914594796753835e-06, "loss": 0.7938, "num_tokens": 85251468.0, "step": 418 }, { "epoch": 1.0, "eval_loss": 0.5268629193305969, "eval_num_tokens": 85251468.0, "eval_runtime": 110.895, "eval_samples_per_second": 26.8, "eval_steps_per_second": 3.355, "step": 418 }, { "epoch": 1.0023923444976077, "grad_norm": 0.8833626539066562, "learning_rate": 8.908908010716731e-06, "loss": 0.7925, "num_tokens": 85384390.0, "step": 419 }, { "epoch": 1.0047846889952152, "grad_norm": 0.6501794385479605, "learning_rate": 8.903208420585396e-06, "loss": 0.7715, "num_tokens": 85609801.0, "step": 420 }, { "epoch": 1.007177033492823, "grad_norm": 0.6943673036447846, "learning_rate": 8.89749604776788e-06, "loss": 0.7712, "num_tokens": 85790579.0, "step": 421 }, { "epoch": 1.0095693779904307, "grad_norm": 0.7449979227556588, "learning_rate": 8.891770913720254e-06, "loss": 0.5588, "num_tokens": 85982217.0, "step": 422 }, { "epoch": 1.0119617224880382, "grad_norm": 0.7280868590547691, "learning_rate": 8.886033039946517e-06, "loss": 0.675, "num_tokens": 86236063.0, "step": 423 }, { "epoch": 1.014354066985646, "grad_norm": 0.6303280123936105, "learning_rate": 8.88028244799852e-06, "loss": 0.6639, "num_tokens": 86421835.0, "step": 424 }, { "epoch": 1.0167464114832536, "grad_norm": 0.6853812289146318, "learning_rate": 8.874519159475883e-06, "loss": 0.7649, "num_tokens": 86622859.0, "step": 425 }, { "epoch": 1.0191387559808613, "grad_norm": 0.59208146860945, "learning_rate": 8.868743196025916e-06, "loss": 0.7123, "num_tokens": 86881615.0, "step": 426 }, { "epoch": 1.0215311004784688, "grad_norm": 0.7469783342201638, "learning_rate": 8.862954579343536e-06, "loss": 0.7227, "num_tokens": 87041292.0, "step": 427 }, { "epoch": 1.0239234449760766, "grad_norm": 0.5881653131003812, "learning_rate": 8.85715333117119e-06, "loss": 0.6107, "num_tokens": 87285738.0, "step": 428 }, { "epoch": 1.0263157894736843, "grad_norm": 0.5975544652225052, "learning_rate": 8.851339473298768e-06, "loss": 0.7007, "num_tokens": 87490754.0, "step": 429 }, { "epoch": 1.0287081339712918, "grad_norm": 0.6413104534073729, "learning_rate": 8.845513027563519e-06, "loss": 0.6546, "num_tokens": 87651942.0, "step": 430 }, { "epoch": 1.0311004784688995, "grad_norm": 0.6314558446671377, "learning_rate": 8.839674015849979e-06, "loss": 0.7543, "num_tokens": 87890353.0, "step": 431 }, { "epoch": 1.0334928229665072, "grad_norm": 0.7629826412960697, "learning_rate": 8.833822460089878e-06, "loss": 0.824, "num_tokens": 88067568.0, "step": 432 }, { "epoch": 1.0358851674641147, "grad_norm": 0.6075050455387743, "learning_rate": 8.827958382262065e-06, "loss": 0.6635, "num_tokens": 88287133.0, "step": 433 }, { "epoch": 1.0382775119617225, "grad_norm": 0.6196621512852998, "learning_rate": 8.822081804392422e-06, "loss": 0.5884, "num_tokens": 88486840.0, "step": 434 }, { "epoch": 1.0406698564593302, "grad_norm": 0.6961183655503131, "learning_rate": 8.81619274855378e-06, "loss": 0.7386, "num_tokens": 88673475.0, "step": 435 }, { "epoch": 1.0430622009569377, "grad_norm": 0.6729951014404428, "learning_rate": 8.810291236865841e-06, "loss": 0.6455, "num_tokens": 88845660.0, "step": 436 }, { "epoch": 1.0454545454545454, "grad_norm": 0.6342083555311104, "learning_rate": 8.804377291495093e-06, "loss": 0.8274, "num_tokens": 89088548.0, "step": 437 }, { "epoch": 1.0478468899521531, "grad_norm": 0.6701006397404995, "learning_rate": 8.798450934654722e-06, "loss": 0.6701, "num_tokens": 89260368.0, "step": 438 }, { "epoch": 1.0502392344497609, "grad_norm": 0.7404737999856115, "learning_rate": 8.792512188604536e-06, "loss": 0.7418, "num_tokens": 89419609.0, "step": 439 }, { "epoch": 1.0526315789473684, "grad_norm": 0.6448113889129491, "learning_rate": 8.78656107565087e-06, "loss": 0.655, "num_tokens": 89596692.0, "step": 440 }, { "epoch": 1.055023923444976, "grad_norm": 0.558939448736452, "learning_rate": 8.780597618146524e-06, "loss": 0.7539, "num_tokens": 89894246.0, "step": 441 }, { "epoch": 1.0574162679425838, "grad_norm": 0.7842683298759312, "learning_rate": 8.774621838490652e-06, "loss": 0.5933, "num_tokens": 90034436.0, "step": 442 }, { "epoch": 1.0598086124401913, "grad_norm": 0.7153205331680785, "learning_rate": 8.768633759128696e-06, "loss": 0.4877, "num_tokens": 90204583.0, "step": 443 }, { "epoch": 1.062200956937799, "grad_norm": 0.6230069749775586, "learning_rate": 8.762633402552296e-06, "loss": 0.5801, "num_tokens": 90371035.0, "step": 444 }, { "epoch": 1.0645933014354068, "grad_norm": 0.5450852716876841, "learning_rate": 8.75662079129921e-06, "loss": 0.6347, "num_tokens": 90592327.0, "step": 445 }, { "epoch": 1.0669856459330143, "grad_norm": 0.6532394649960671, "learning_rate": 8.750595947953215e-06, "loss": 0.5904, "num_tokens": 90824557.0, "step": 446 }, { "epoch": 1.069377990430622, "grad_norm": 0.6187711188149183, "learning_rate": 8.744558895144044e-06, "loss": 0.7607, "num_tokens": 91073154.0, "step": 447 }, { "epoch": 1.0717703349282297, "grad_norm": 0.6980624446279815, "learning_rate": 8.738509655547284e-06, "loss": 0.7618, "num_tokens": 91258963.0, "step": 448 }, { "epoch": 1.0741626794258372, "grad_norm": 0.7024003253964036, "learning_rate": 8.732448251884296e-06, "loss": 0.7303, "num_tokens": 91423256.0, "step": 449 }, { "epoch": 1.076555023923445, "grad_norm": 0.6634897373813881, "learning_rate": 8.726374706922133e-06, "loss": 0.7738, "num_tokens": 91637291.0, "step": 450 }, { "epoch": 1.0789473684210527, "grad_norm": 0.6804572080651244, "learning_rate": 8.72028904347345e-06, "loss": 0.7228, "num_tokens": 91841448.0, "step": 451 }, { "epoch": 1.0813397129186604, "grad_norm": 0.6518235040560235, "learning_rate": 8.714191284396417e-06, "loss": 0.8062, "num_tokens": 92053059.0, "step": 452 }, { "epoch": 1.0837320574162679, "grad_norm": 0.6080056046652248, "learning_rate": 8.708081452594644e-06, "loss": 0.6634, "num_tokens": 92261100.0, "step": 453 }, { "epoch": 1.0861244019138756, "grad_norm": 0.6544718325563127, "learning_rate": 8.701959571017075e-06, "loss": 0.6247, "num_tokens": 92464515.0, "step": 454 }, { "epoch": 1.0885167464114833, "grad_norm": 0.7381644122745519, "learning_rate": 8.695825662657926e-06, "loss": 0.8531, "num_tokens": 92644031.0, "step": 455 }, { "epoch": 1.0909090909090908, "grad_norm": 0.6061504580235114, "learning_rate": 8.68967975055658e-06, "loss": 0.687, "num_tokens": 92853081.0, "step": 456 }, { "epoch": 1.0933014354066986, "grad_norm": 0.657190432984543, "learning_rate": 8.683521857797506e-06, "loss": 0.8088, "num_tokens": 93084673.0, "step": 457 }, { "epoch": 1.0956937799043063, "grad_norm": 0.6531620668793157, "learning_rate": 8.677352007510175e-06, "loss": 0.6926, "num_tokens": 93272082.0, "step": 458 }, { "epoch": 1.0980861244019138, "grad_norm": 0.5469254561243377, "learning_rate": 8.671170222868973e-06, "loss": 0.8201, "num_tokens": 93532996.0, "step": 459 }, { "epoch": 1.1004784688995215, "grad_norm": 0.7224234205027193, "learning_rate": 8.664976527093111e-06, "loss": 0.6561, "num_tokens": 93700551.0, "step": 460 }, { "epoch": 1.1028708133971292, "grad_norm": 0.6103043834106922, "learning_rate": 8.658770943446537e-06, "loss": 0.6106, "num_tokens": 93897552.0, "step": 461 }, { "epoch": 1.1052631578947367, "grad_norm": 0.6388200976277861, "learning_rate": 8.652553495237854e-06, "loss": 0.6085, "num_tokens": 94091727.0, "step": 462 }, { "epoch": 1.1076555023923444, "grad_norm": 0.6688215830036501, "learning_rate": 8.646324205820225e-06, "loss": 0.7937, "num_tokens": 94323488.0, "step": 463 }, { "epoch": 1.1100478468899522, "grad_norm": 0.6499235099896005, "learning_rate": 8.640083098591295e-06, "loss": 0.6834, "num_tokens": 94504378.0, "step": 464 }, { "epoch": 1.11244019138756, "grad_norm": 0.6141690793822103, "learning_rate": 8.633830196993091e-06, "loss": 0.6812, "num_tokens": 94704650.0, "step": 465 }, { "epoch": 1.1148325358851674, "grad_norm": 0.711438564340125, "learning_rate": 8.627565524511947e-06, "loss": 0.5216, "num_tokens": 94940873.0, "step": 466 }, { "epoch": 1.1172248803827751, "grad_norm": 0.6167376352678712, "learning_rate": 8.621289104678402e-06, "loss": 0.7062, "num_tokens": 95151213.0, "step": 467 }, { "epoch": 1.1196172248803828, "grad_norm": 0.7384760378511186, "learning_rate": 8.615000961067127e-06, "loss": 0.5648, "num_tokens": 95285914.0, "step": 468 }, { "epoch": 1.1220095693779903, "grad_norm": 0.700556827023194, "learning_rate": 8.608701117296823e-06, "loss": 0.5138, "num_tokens": 95444044.0, "step": 469 }, { "epoch": 1.124401913875598, "grad_norm": 0.671132609284084, "learning_rate": 8.602389597030137e-06, "loss": 0.7428, "num_tokens": 95680518.0, "step": 470 }, { "epoch": 1.1267942583732058, "grad_norm": 0.5442552642709221, "learning_rate": 8.596066423973581e-06, "loss": 0.595, "num_tokens": 95923658.0, "step": 471 }, { "epoch": 1.1291866028708135, "grad_norm": 0.6531706451873329, "learning_rate": 8.589731621877422e-06, "loss": 0.6666, "num_tokens": 96114735.0, "step": 472 }, { "epoch": 1.131578947368421, "grad_norm": 0.6088153066934165, "learning_rate": 8.583385214535623e-06, "loss": 0.6571, "num_tokens": 96344220.0, "step": 473 }, { "epoch": 1.1339712918660287, "grad_norm": 0.6312706973960832, "learning_rate": 8.577027225785722e-06, "loss": 0.707, "num_tokens": 96532999.0, "step": 474 }, { "epoch": 1.1363636363636362, "grad_norm": 0.7723559287093527, "learning_rate": 8.57065767950877e-06, "loss": 0.8366, "num_tokens": 96724160.0, "step": 475 }, { "epoch": 1.138755980861244, "grad_norm": 0.6292499138689865, "learning_rate": 8.56427659962922e-06, "loss": 0.6214, "num_tokens": 96921531.0, "step": 476 }, { "epoch": 1.1411483253588517, "grad_norm": 0.6214894130873831, "learning_rate": 8.55788401011485e-06, "loss": 0.8551, "num_tokens": 97135552.0, "step": 477 }, { "epoch": 1.1435406698564594, "grad_norm": 0.6259093670614493, "learning_rate": 8.551479934976668e-06, "loss": 0.6917, "num_tokens": 97345841.0, "step": 478 }, { "epoch": 1.145933014354067, "grad_norm": 0.7168111716307588, "learning_rate": 8.545064398268826e-06, "loss": 0.8033, "num_tokens": 97509658.0, "step": 479 }, { "epoch": 1.1483253588516746, "grad_norm": 0.5987584068142854, "learning_rate": 8.53863742408852e-06, "loss": 0.5887, "num_tokens": 97731018.0, "step": 480 }, { "epoch": 1.1507177033492824, "grad_norm": 0.6900729370961642, "learning_rate": 8.53219903657591e-06, "loss": 0.8195, "num_tokens": 97933426.0, "step": 481 }, { "epoch": 1.1531100478468899, "grad_norm": 0.6355834178889574, "learning_rate": 8.525749259914024e-06, "loss": 0.7962, "num_tokens": 98169526.0, "step": 482 }, { "epoch": 1.1555023923444976, "grad_norm": 0.6731062920620434, "learning_rate": 8.519288118328672e-06, "loss": 0.7054, "num_tokens": 98324262.0, "step": 483 }, { "epoch": 1.1578947368421053, "grad_norm": 0.6674576791981732, "learning_rate": 8.512815636088347e-06, "loss": 0.669, "num_tokens": 98515046.0, "step": 484 }, { "epoch": 1.160287081339713, "grad_norm": 0.6869492423073407, "learning_rate": 8.50633183750414e-06, "loss": 0.7411, "num_tokens": 98716661.0, "step": 485 }, { "epoch": 1.1626794258373205, "grad_norm": 0.6834157769460931, "learning_rate": 8.499836746929648e-06, "loss": 0.6594, "num_tokens": 98880085.0, "step": 486 }, { "epoch": 1.1650717703349283, "grad_norm": 0.6164602298440709, "learning_rate": 8.493330388760878e-06, "loss": 0.7071, "num_tokens": 99126250.0, "step": 487 }, { "epoch": 1.167464114832536, "grad_norm": 0.5616797077224229, "learning_rate": 8.486812787436162e-06, "loss": 0.7958, "num_tokens": 99369635.0, "step": 488 }, { "epoch": 1.1698564593301435, "grad_norm": 0.715505280608796, "learning_rate": 8.480283967436063e-06, "loss": 0.7958, "num_tokens": 99542434.0, "step": 489 }, { "epoch": 1.1722488038277512, "grad_norm": 0.6040191308287042, "learning_rate": 8.473743953283279e-06, "loss": 0.5818, "num_tokens": 99728793.0, "step": 490 }, { "epoch": 1.174641148325359, "grad_norm": 0.633301702571846, "learning_rate": 8.467192769542555e-06, "loss": 0.703, "num_tokens": 99929158.0, "step": 491 }, { "epoch": 1.1770334928229664, "grad_norm": 0.670950489427574, "learning_rate": 8.460630440820593e-06, "loss": 0.7055, "num_tokens": 100157655.0, "step": 492 }, { "epoch": 1.1794258373205742, "grad_norm": 0.616738962710494, "learning_rate": 8.454056991765949e-06, "loss": 0.5882, "num_tokens": 100334982.0, "step": 493 }, { "epoch": 1.1818181818181819, "grad_norm": 0.701693251512557, "learning_rate": 8.447472447068957e-06, "loss": 0.7114, "num_tokens": 100520642.0, "step": 494 }, { "epoch": 1.1842105263157894, "grad_norm": 0.6910368950769709, "learning_rate": 8.440876831461615e-06, "loss": 0.817, "num_tokens": 100729788.0, "step": 495 }, { "epoch": 1.186602870813397, "grad_norm": 0.6007792355517747, "learning_rate": 8.434270169717521e-06, "loss": 0.6787, "num_tokens": 100979908.0, "step": 496 }, { "epoch": 1.1889952153110048, "grad_norm": 0.7122991248891785, "learning_rate": 8.427652486651749e-06, "loss": 0.7728, "num_tokens": 101151147.0, "step": 497 }, { "epoch": 1.1913875598086126, "grad_norm": 0.6917781088203647, "learning_rate": 8.42102380712077e-06, "loss": 0.7039, "num_tokens": 101312071.0, "step": 498 }, { "epoch": 1.19377990430622, "grad_norm": 0.58665825021754, "learning_rate": 8.41438415602237e-06, "loss": 0.5133, "num_tokens": 101493495.0, "step": 499 }, { "epoch": 1.1961722488038278, "grad_norm": 0.6161205811479572, "learning_rate": 8.407733558295534e-06, "loss": 0.6248, "num_tokens": 101690177.0, "step": 500 }, { "epoch": 1.1985645933014355, "grad_norm": 0.6348422730130996, "learning_rate": 8.401072038920364e-06, "loss": 0.9154, "num_tokens": 101940187.0, "step": 501 }, { "epoch": 1.200956937799043, "grad_norm": 0.5887797500918408, "learning_rate": 8.394399622917993e-06, "loss": 0.8369, "num_tokens": 102137576.0, "step": 502 }, { "epoch": 1.2033492822966507, "grad_norm": 0.622168916271061, "learning_rate": 8.387716335350476e-06, "loss": 0.7483, "num_tokens": 102334478.0, "step": 503 }, { "epoch": 1.2057416267942584, "grad_norm": 0.5362188064080355, "learning_rate": 8.3810222013207e-06, "loss": 0.6779, "num_tokens": 102580656.0, "step": 504 }, { "epoch": 1.208133971291866, "grad_norm": 0.6236816878968677, "learning_rate": 8.374317245972297e-06, "loss": 0.5656, "num_tokens": 102754539.0, "step": 505 }, { "epoch": 1.2105263157894737, "grad_norm": 0.6710884082075333, "learning_rate": 8.367601494489544e-06, "loss": 0.7807, "num_tokens": 102956836.0, "step": 506 }, { "epoch": 1.2129186602870814, "grad_norm": 0.5160236694619329, "learning_rate": 8.360874972097267e-06, "loss": 0.627, "num_tokens": 103222597.0, "step": 507 }, { "epoch": 1.215311004784689, "grad_norm": 0.6392481521941529, "learning_rate": 8.354137704060751e-06, "loss": 0.7102, "num_tokens": 103432011.0, "step": 508 }, { "epoch": 1.2177033492822966, "grad_norm": 0.5609407707750648, "learning_rate": 8.347389715685638e-06, "loss": 0.5622, "num_tokens": 103662118.0, "step": 509 }, { "epoch": 1.2200956937799043, "grad_norm": 0.5857654279746968, "learning_rate": 8.34063103231784e-06, "loss": 0.7978, "num_tokens": 103935876.0, "step": 510 }, { "epoch": 1.222488038277512, "grad_norm": 0.6427090354154541, "learning_rate": 8.33386167934344e-06, "loss": 0.8944, "num_tokens": 104179417.0, "step": 511 }, { "epoch": 1.2248803827751196, "grad_norm": 0.5388067729124021, "learning_rate": 8.327081682188594e-06, "loss": 0.6181, "num_tokens": 104404008.0, "step": 512 }, { "epoch": 1.2272727272727273, "grad_norm": 0.5968370274704392, "learning_rate": 8.320291066319442e-06, "loss": 0.8699, "num_tokens": 104627408.0, "step": 513 }, { "epoch": 1.229665071770335, "grad_norm": 0.5878293652309775, "learning_rate": 8.313489857242006e-06, "loss": 0.792, "num_tokens": 104875829.0, "step": 514 }, { "epoch": 1.2320574162679425, "grad_norm": 0.6157999004770994, "learning_rate": 8.306678080502099e-06, "loss": 0.679, "num_tokens": 105046140.0, "step": 515 }, { "epoch": 1.2344497607655502, "grad_norm": 0.5526340028957575, "learning_rate": 8.299855761685223e-06, "loss": 0.6366, "num_tokens": 105270581.0, "step": 516 }, { "epoch": 1.236842105263158, "grad_norm": 0.5659621965437514, "learning_rate": 8.293022926416482e-06, "loss": 0.6995, "num_tokens": 105485976.0, "step": 517 }, { "epoch": 1.2392344497607655, "grad_norm": 0.611032283336834, "learning_rate": 8.286179600360476e-06, "loss": 0.6958, "num_tokens": 105700505.0, "step": 518 }, { "epoch": 1.2416267942583732, "grad_norm": 0.6139639547074152, "learning_rate": 8.27932580922121e-06, "loss": 0.7304, "num_tokens": 105903407.0, "step": 519 }, { "epoch": 1.244019138755981, "grad_norm": 0.7850060555839244, "learning_rate": 8.272461578742001e-06, "loss": 0.6765, "num_tokens": 106052358.0, "step": 520 }, { "epoch": 1.2464114832535884, "grad_norm": 0.6629378613315833, "learning_rate": 8.265586934705371e-06, "loss": 0.5863, "num_tokens": 106237968.0, "step": 521 }, { "epoch": 1.2488038277511961, "grad_norm": 0.628395091568339, "learning_rate": 8.258701902932957e-06, "loss": 0.9289, "num_tokens": 106450419.0, "step": 522 }, { "epoch": 1.2511961722488039, "grad_norm": 0.633356030726603, "learning_rate": 8.251806509285418e-06, "loss": 0.9009, "num_tokens": 106702311.0, "step": 523 }, { "epoch": 1.2535885167464116, "grad_norm": 0.6917776003797732, "learning_rate": 8.244900779662326e-06, "loss": 0.8736, "num_tokens": 106917705.0, "step": 524 }, { "epoch": 1.255980861244019, "grad_norm": 0.5759491864770921, "learning_rate": 8.237984740002084e-06, "loss": 0.6075, "num_tokens": 107147203.0, "step": 525 }, { "epoch": 1.2583732057416268, "grad_norm": 0.6458032921809065, "learning_rate": 8.23105841628181e-06, "loss": 0.6169, "num_tokens": 107333669.0, "step": 526 }, { "epoch": 1.2607655502392343, "grad_norm": 0.6915332882979576, "learning_rate": 8.224121834517253e-06, "loss": 0.6451, "num_tokens": 107483189.0, "step": 527 }, { "epoch": 1.263157894736842, "grad_norm": 0.761378206128293, "learning_rate": 8.217175020762702e-06, "loss": 0.725, "num_tokens": 107631808.0, "step": 528 }, { "epoch": 1.2655502392344498, "grad_norm": 0.7381435867769686, "learning_rate": 8.210218001110862e-06, "loss": 0.6566, "num_tokens": 107795891.0, "step": 529 }, { "epoch": 1.2679425837320575, "grad_norm": 0.6783225501913807, "learning_rate": 8.203250801692786e-06, "loss": 0.7918, "num_tokens": 107979143.0, "step": 530 }, { "epoch": 1.2703349282296652, "grad_norm": 0.6484130153813725, "learning_rate": 8.196273448677753e-06, "loss": 0.669, "num_tokens": 108196590.0, "step": 531 }, { "epoch": 1.2727272727272727, "grad_norm": 0.6554392135157858, "learning_rate": 8.189285968273185e-06, "loss": 0.9, "num_tokens": 108381814.0, "step": 532 }, { "epoch": 1.2751196172248804, "grad_norm": 0.5849041476113271, "learning_rate": 8.18228838672454e-06, "loss": 0.7717, "num_tokens": 108619346.0, "step": 533 }, { "epoch": 1.277511961722488, "grad_norm": 0.5350083618643547, "learning_rate": 8.175280730315222e-06, "loss": 0.595, "num_tokens": 108875870.0, "step": 534 }, { "epoch": 1.2799043062200957, "grad_norm": 0.6159114405886904, "learning_rate": 8.168263025366468e-06, "loss": 0.8042, "num_tokens": 109079053.0, "step": 535 }, { "epoch": 1.2822966507177034, "grad_norm": 0.6158840971619612, "learning_rate": 8.161235298237268e-06, "loss": 0.6988, "num_tokens": 109262309.0, "step": 536 }, { "epoch": 1.284688995215311, "grad_norm": 0.6938016995261597, "learning_rate": 8.154197575324247e-06, "loss": 0.7512, "num_tokens": 109483891.0, "step": 537 }, { "epoch": 1.2870813397129186, "grad_norm": 0.7241914849049002, "learning_rate": 8.147149883061581e-06, "loss": 0.6713, "num_tokens": 109636457.0, "step": 538 }, { "epoch": 1.2894736842105263, "grad_norm": 0.5664246048988882, "learning_rate": 8.14009224792089e-06, "loss": 0.7083, "num_tokens": 109869798.0, "step": 539 }, { "epoch": 1.291866028708134, "grad_norm": 0.6406286160132263, "learning_rate": 8.133024696411138e-06, "loss": 0.6811, "num_tokens": 110055693.0, "step": 540 }, { "epoch": 1.2942583732057416, "grad_norm": 0.6123102900846681, "learning_rate": 8.125947255078537e-06, "loss": 0.6826, "num_tokens": 110294193.0, "step": 541 }, { "epoch": 1.2966507177033493, "grad_norm": 0.7129531959007127, "learning_rate": 8.11885995050645e-06, "loss": 0.4591, "num_tokens": 110438211.0, "step": 542 }, { "epoch": 1.299043062200957, "grad_norm": 0.6384794663335771, "learning_rate": 8.111762809315278e-06, "loss": 1.0005, "num_tokens": 110698383.0, "step": 543 }, { "epoch": 1.3014354066985647, "grad_norm": 0.7042025683566201, "learning_rate": 8.10465585816238e-06, "loss": 0.7055, "num_tokens": 110867049.0, "step": 544 }, { "epoch": 1.3038277511961722, "grad_norm": 0.5768164534769485, "learning_rate": 8.097539123741949e-06, "loss": 0.6007, "num_tokens": 111108415.0, "step": 545 }, { "epoch": 1.30622009569378, "grad_norm": 0.6313351384712172, "learning_rate": 8.090412632784934e-06, "loss": 0.7005, "num_tokens": 111323074.0, "step": 546 }, { "epoch": 1.3086124401913874, "grad_norm": 0.6206830248756131, "learning_rate": 8.083276412058932e-06, "loss": 0.6337, "num_tokens": 111537304.0, "step": 547 }, { "epoch": 1.3110047846889952, "grad_norm": 0.6758816496685596, "learning_rate": 8.076130488368078e-06, "loss": 0.7958, "num_tokens": 111747591.0, "step": 548 }, { "epoch": 1.313397129186603, "grad_norm": 0.6241461189184837, "learning_rate": 8.068974888552955e-06, "loss": 0.7754, "num_tokens": 112006485.0, "step": 549 }, { "epoch": 1.3157894736842106, "grad_norm": 0.6991048663837968, "learning_rate": 8.061809639490493e-06, "loss": 0.6889, "num_tokens": 112182235.0, "step": 550 }, { "epoch": 1.3181818181818181, "grad_norm": 0.6561488497102201, "learning_rate": 8.054634768093861e-06, "loss": 0.8882, "num_tokens": 112411061.0, "step": 551 }, { "epoch": 1.3205741626794258, "grad_norm": 0.6114674626884281, "learning_rate": 8.047450301312373e-06, "loss": 0.6171, "num_tokens": 112594695.0, "step": 552 }, { "epoch": 1.3229665071770336, "grad_norm": 0.6947222688278742, "learning_rate": 8.040256266131384e-06, "loss": 0.7327, "num_tokens": 112796582.0, "step": 553 }, { "epoch": 1.325358851674641, "grad_norm": 0.7174977428549918, "learning_rate": 8.033052689572185e-06, "loss": 0.9901, "num_tokens": 113018372.0, "step": 554 }, { "epoch": 1.3277511961722488, "grad_norm": 0.5459671536037778, "learning_rate": 8.02583959869191e-06, "loss": 0.4489, "num_tokens": 113224353.0, "step": 555 }, { "epoch": 1.3301435406698565, "grad_norm": 0.6098137823161369, "learning_rate": 8.018617020583429e-06, "loss": 0.829, "num_tokens": 113427127.0, "step": 556 }, { "epoch": 1.3325358851674642, "grad_norm": 0.6568359830867574, "learning_rate": 8.011384982375238e-06, "loss": 0.5615, "num_tokens": 113626089.0, "step": 557 }, { "epoch": 1.3349282296650717, "grad_norm": 0.6281135823952779, "learning_rate": 8.00414351123138e-06, "loss": 0.7058, "num_tokens": 113865385.0, "step": 558 }, { "epoch": 1.3373205741626795, "grad_norm": 0.6010690345416376, "learning_rate": 7.996892634351319e-06, "loss": 0.6422, "num_tokens": 114094828.0, "step": 559 }, { "epoch": 1.339712918660287, "grad_norm": 0.638515263025877, "learning_rate": 7.989632378969849e-06, "loss": 0.6424, "num_tokens": 114285775.0, "step": 560 }, { "epoch": 1.3421052631578947, "grad_norm": 0.6605018239443282, "learning_rate": 7.982362772356989e-06, "loss": 0.709, "num_tokens": 114464592.0, "step": 561 }, { "epoch": 1.3444976076555024, "grad_norm": 0.633121921756574, "learning_rate": 7.97508384181789e-06, "loss": 0.6402, "num_tokens": 114661703.0, "step": 562 }, { "epoch": 1.3468899521531101, "grad_norm": 0.6930520796230434, "learning_rate": 7.967795614692714e-06, "loss": 0.7155, "num_tokens": 114854790.0, "step": 563 }, { "epoch": 1.3492822966507176, "grad_norm": 0.5462776230436035, "learning_rate": 7.96049811835655e-06, "loss": 0.4381, "num_tokens": 115063770.0, "step": 564 }, { "epoch": 1.3516746411483254, "grad_norm": 0.7349664493374579, "learning_rate": 7.953191380219294e-06, "loss": 0.6836, "num_tokens": 115230976.0, "step": 565 }, { "epoch": 1.354066985645933, "grad_norm": 0.6404975561020254, "learning_rate": 7.945875427725563e-06, "loss": 0.8089, "num_tokens": 115449531.0, "step": 566 }, { "epoch": 1.3564593301435406, "grad_norm": 0.598351041926338, "learning_rate": 7.93855028835458e-06, "loss": 0.8505, "num_tokens": 115683622.0, "step": 567 }, { "epoch": 1.3588516746411483, "grad_norm": 0.5816191942301666, "learning_rate": 7.931215989620075e-06, "loss": 0.71, "num_tokens": 115850770.0, "step": 568 }, { "epoch": 1.361244019138756, "grad_norm": 0.5989480503872827, "learning_rate": 7.923872559070179e-06, "loss": 0.5141, "num_tokens": 116051148.0, "step": 569 }, { "epoch": 1.3636363636363638, "grad_norm": 0.7146437669553671, "learning_rate": 7.916520024287329e-06, "loss": 0.7548, "num_tokens": 116247421.0, "step": 570 }, { "epoch": 1.3660287081339713, "grad_norm": 0.6512219935518021, "learning_rate": 7.909158412888152e-06, "loss": 0.7714, "num_tokens": 116456221.0, "step": 571 }, { "epoch": 1.368421052631579, "grad_norm": 0.6640687941436083, "learning_rate": 7.901787752523368e-06, "loss": 0.7303, "num_tokens": 116616642.0, "step": 572 }, { "epoch": 1.3708133971291865, "grad_norm": 0.685283033422343, "learning_rate": 7.89440807087769e-06, "loss": 0.6986, "num_tokens": 116828595.0, "step": 573 }, { "epoch": 1.3732057416267942, "grad_norm": 0.7264640201047767, "learning_rate": 7.887019395669711e-06, "loss": 0.8794, "num_tokens": 117000928.0, "step": 574 }, { "epoch": 1.375598086124402, "grad_norm": 0.6035933154970914, "learning_rate": 7.879621754651806e-06, "loss": 0.6661, "num_tokens": 117224707.0, "step": 575 }, { "epoch": 1.3779904306220097, "grad_norm": 0.7356154161250829, "learning_rate": 7.872215175610027e-06, "loss": 0.7089, "num_tokens": 117406833.0, "step": 576 }, { "epoch": 1.3803827751196172, "grad_norm": 0.6784361033034778, "learning_rate": 7.864799686363998e-06, "loss": 0.9435, "num_tokens": 117608333.0, "step": 577 }, { "epoch": 1.3827751196172249, "grad_norm": 0.601646584885362, "learning_rate": 7.857375314766805e-06, "loss": 0.6242, "num_tokens": 117824859.0, "step": 578 }, { "epoch": 1.3851674641148326, "grad_norm": 0.6286606987208951, "learning_rate": 7.849942088704908e-06, "loss": 0.7129, "num_tokens": 118009711.0, "step": 579 }, { "epoch": 1.38755980861244, "grad_norm": 0.6712984784969586, "learning_rate": 7.842500036098011e-06, "loss": 0.7308, "num_tokens": 118195660.0, "step": 580 }, { "epoch": 1.3899521531100478, "grad_norm": 0.6986289059083648, "learning_rate": 7.835049184898984e-06, "loss": 0.7085, "num_tokens": 118374150.0, "step": 581 }, { "epoch": 1.3923444976076556, "grad_norm": 0.6365250045967269, "learning_rate": 7.827589563093737e-06, "loss": 0.7445, "num_tokens": 118578238.0, "step": 582 }, { "epoch": 1.3947368421052633, "grad_norm": 0.5980300561132873, "learning_rate": 7.820121198701123e-06, "loss": 0.732, "num_tokens": 118786622.0, "step": 583 }, { "epoch": 1.3971291866028708, "grad_norm": 0.5947783020696292, "learning_rate": 7.812644119772838e-06, "loss": 0.7338, "num_tokens": 119008588.0, "step": 584 }, { "epoch": 1.3995215311004785, "grad_norm": 0.5903656266151648, "learning_rate": 7.80515835439331e-06, "loss": 0.7805, "num_tokens": 119262356.0, "step": 585 }, { "epoch": 1.401913875598086, "grad_norm": 0.6206154261533207, "learning_rate": 7.797663930679586e-06, "loss": 0.836, "num_tokens": 119472323.0, "step": 586 }, { "epoch": 1.4043062200956937, "grad_norm": 0.6626657461296346, "learning_rate": 7.790160876781242e-06, "loss": 0.8119, "num_tokens": 119652941.0, "step": 587 }, { "epoch": 1.4066985645933014, "grad_norm": 0.6868149066567649, "learning_rate": 7.782649220880268e-06, "loss": 0.6947, "num_tokens": 119833083.0, "step": 588 }, { "epoch": 1.4090909090909092, "grad_norm": 0.6204817485919448, "learning_rate": 7.775128991190963e-06, "loss": 0.7922, "num_tokens": 120030493.0, "step": 589 }, { "epoch": 1.4114832535885167, "grad_norm": 0.6724390520172234, "learning_rate": 7.76760021595983e-06, "loss": 0.7076, "num_tokens": 120259396.0, "step": 590 }, { "epoch": 1.4138755980861244, "grad_norm": 0.6440599092231826, "learning_rate": 7.760062923465467e-06, "loss": 0.4596, "num_tokens": 120416796.0, "step": 591 }, { "epoch": 1.4162679425837321, "grad_norm": 0.6218562163131692, "learning_rate": 7.75251714201847e-06, "loss": 0.6655, "num_tokens": 120616896.0, "step": 592 }, { "epoch": 1.4186602870813396, "grad_norm": 0.6849812328433857, "learning_rate": 7.744962899961311e-06, "loss": 0.7767, "num_tokens": 120787459.0, "step": 593 }, { "epoch": 1.4210526315789473, "grad_norm": 0.6170197137081643, "learning_rate": 7.73740022566825e-06, "loss": 0.7519, "num_tokens": 120998500.0, "step": 594 }, { "epoch": 1.423444976076555, "grad_norm": 0.6454119314614041, "learning_rate": 7.729829147545214e-06, "loss": 0.6476, "num_tokens": 121209444.0, "step": 595 }, { "epoch": 1.4258373205741628, "grad_norm": 0.6253962654706593, "learning_rate": 7.722249694029692e-06, "loss": 0.4759, "num_tokens": 121377586.0, "step": 596 }, { "epoch": 1.4282296650717703, "grad_norm": 0.6096531645355685, "learning_rate": 7.714661893590641e-06, "loss": 0.7043, "num_tokens": 121605803.0, "step": 597 }, { "epoch": 1.430622009569378, "grad_norm": 0.6005996949073303, "learning_rate": 7.707065774728359e-06, "loss": 0.6528, "num_tokens": 121832216.0, "step": 598 }, { "epoch": 1.4330143540669855, "grad_norm": 0.6389038973889795, "learning_rate": 7.699461365974398e-06, "loss": 0.551, "num_tokens": 122025954.0, "step": 599 }, { "epoch": 1.4354066985645932, "grad_norm": 0.6974149384728023, "learning_rate": 7.69184869589144e-06, "loss": 0.738, "num_tokens": 122208176.0, "step": 600 }, { "epoch": 1.437799043062201, "grad_norm": 0.6150640365254872, "learning_rate": 7.684227793073205e-06, "loss": 0.7368, "num_tokens": 122410734.0, "step": 601 }, { "epoch": 1.4401913875598087, "grad_norm": 0.696837292155682, "learning_rate": 7.676598686144326e-06, "loss": 0.8521, "num_tokens": 122611097.0, "step": 602 }, { "epoch": 1.4425837320574162, "grad_norm": 0.6565300019075943, "learning_rate": 7.668961403760257e-06, "loss": 0.6649, "num_tokens": 122803282.0, "step": 603 }, { "epoch": 1.444976076555024, "grad_norm": 0.6692653954984873, "learning_rate": 7.661315974607161e-06, "loss": 0.6806, "num_tokens": 123012117.0, "step": 604 }, { "epoch": 1.4473684210526316, "grad_norm": 0.6617024095318141, "learning_rate": 7.653662427401797e-06, "loss": 0.8588, "num_tokens": 123255081.0, "step": 605 }, { "epoch": 1.4497607655502391, "grad_norm": 0.5964810181267309, "learning_rate": 7.646000790891418e-06, "loss": 0.5307, "num_tokens": 123453656.0, "step": 606 }, { "epoch": 1.4521531100478469, "grad_norm": 0.7899864065988431, "learning_rate": 7.638331093853662e-06, "loss": 0.5959, "num_tokens": 123633504.0, "step": 607 }, { "epoch": 1.4545454545454546, "grad_norm": 0.6887408990548677, "learning_rate": 7.63065336509644e-06, "loss": 0.7329, "num_tokens": 123801896.0, "step": 608 }, { "epoch": 1.4569377990430623, "grad_norm": 0.633552365230913, "learning_rate": 7.622967633457831e-06, "loss": 0.6681, "num_tokens": 123974366.0, "step": 609 }, { "epoch": 1.4593301435406698, "grad_norm": 0.6343850458718804, "learning_rate": 7.615273927805976e-06, "loss": 0.9128, "num_tokens": 124190706.0, "step": 610 }, { "epoch": 1.4617224880382775, "grad_norm": 0.5748629243715516, "learning_rate": 7.6075722770389634e-06, "loss": 0.6477, "num_tokens": 124403746.0, "step": 611 }, { "epoch": 1.464114832535885, "grad_norm": 0.5661320410441629, "learning_rate": 7.599862710084731e-06, "loss": 0.5624, "num_tokens": 124578707.0, "step": 612 }, { "epoch": 1.4665071770334928, "grad_norm": 0.5900248167722216, "learning_rate": 7.5921452559009424e-06, "loss": 0.8237, "num_tokens": 124820690.0, "step": 613 }, { "epoch": 1.4688995215311005, "grad_norm": 0.6386434549167093, "learning_rate": 7.584419943474888e-06, "loss": 0.716, "num_tokens": 125023721.0, "step": 614 }, { "epoch": 1.4712918660287082, "grad_norm": 0.669335586967033, "learning_rate": 7.576686801823377e-06, "loss": 0.7834, "num_tokens": 125226122.0, "step": 615 }, { "epoch": 1.4736842105263157, "grad_norm": 0.6165452625138145, "learning_rate": 7.5689458599926244e-06, "loss": 0.8478, "num_tokens": 125465984.0, "step": 616 }, { "epoch": 1.4760765550239234, "grad_norm": 0.6508737359095208, "learning_rate": 7.5611971470581425e-06, "loss": 0.7474, "num_tokens": 125660386.0, "step": 617 }, { "epoch": 1.4784688995215312, "grad_norm": 0.5374864935102592, "learning_rate": 7.553440692124634e-06, "loss": 0.4226, "num_tokens": 125893050.0, "step": 618 }, { "epoch": 1.4808612440191387, "grad_norm": 0.6796724151735416, "learning_rate": 7.545676524325878e-06, "loss": 0.6615, "num_tokens": 126094266.0, "step": 619 }, { "epoch": 1.4832535885167464, "grad_norm": 0.5751206841528997, "learning_rate": 7.537904672824627e-06, "loss": 0.8344, "num_tokens": 126333416.0, "step": 620 }, { "epoch": 1.485645933014354, "grad_norm": 0.6976735779434823, "learning_rate": 7.530125166812493e-06, "loss": 0.8913, "num_tokens": 126513194.0, "step": 621 }, { "epoch": 1.4880382775119618, "grad_norm": 0.7134664678115061, "learning_rate": 7.5223380355098376e-06, "loss": 0.5069, "num_tokens": 126659843.0, "step": 622 }, { "epoch": 1.4904306220095693, "grad_norm": 0.6281529318513822, "learning_rate": 7.514543308165665e-06, "loss": 0.5912, "num_tokens": 126854255.0, "step": 623 }, { "epoch": 1.492822966507177, "grad_norm": 0.6753085882032426, "learning_rate": 7.5067410140575105e-06, "loss": 0.644, "num_tokens": 127064662.0, "step": 624 }, { "epoch": 1.4952153110047846, "grad_norm": 0.6543948938709288, "learning_rate": 7.498931182491329e-06, "loss": 0.8564, "num_tokens": 127279305.0, "step": 625 }, { "epoch": 1.4976076555023923, "grad_norm": 0.5694667262337427, "learning_rate": 7.491113842801391e-06, "loss": 0.6843, "num_tokens": 127503404.0, "step": 626 }, { "epoch": 1.5, "grad_norm": 0.6154345937543192, "learning_rate": 7.483289024350161e-06, "loss": 0.7261, "num_tokens": 127707884.0, "step": 627 }, { "epoch": 1.5023923444976077, "grad_norm": 0.5870500440697171, "learning_rate": 7.4754567565282e-06, "loss": 0.7015, "num_tokens": 127928802.0, "step": 628 }, { "epoch": 1.5047846889952154, "grad_norm": 0.6189074200886605, "learning_rate": 7.46761706875405e-06, "loss": 0.6552, "num_tokens": 128132676.0, "step": 629 }, { "epoch": 1.507177033492823, "grad_norm": 0.678134402555645, "learning_rate": 7.4597699904741164e-06, "loss": 0.6806, "num_tokens": 128294864.0, "step": 630 }, { "epoch": 1.5095693779904307, "grad_norm": 0.6742324121181981, "learning_rate": 7.451915551162571e-06, "loss": 0.6127, "num_tokens": 128467254.0, "step": 631 }, { "epoch": 1.5119617224880382, "grad_norm": 0.6561738843878836, "learning_rate": 7.444053780321229e-06, "loss": 0.669, "num_tokens": 128683604.0, "step": 632 }, { "epoch": 1.514354066985646, "grad_norm": 0.6712644537160007, "learning_rate": 7.436184707479446e-06, "loss": 0.5979, "num_tokens": 128886445.0, "step": 633 }, { "epoch": 1.5167464114832536, "grad_norm": 0.5858800119893577, "learning_rate": 7.4283083621940046e-06, "loss": 0.9023, "num_tokens": 129172873.0, "step": 634 }, { "epoch": 1.5191387559808613, "grad_norm": 0.6693350217386079, "learning_rate": 7.420424774049001e-06, "loss": 0.5293, "num_tokens": 129335516.0, "step": 635 }, { "epoch": 1.5215311004784688, "grad_norm": 0.5630177602648866, "learning_rate": 7.412533972655738e-06, "loss": 0.9111, "num_tokens": 129643366.0, "step": 636 }, { "epoch": 1.5239234449760766, "grad_norm": 0.5465767654675465, "learning_rate": 7.4046359876526105e-06, "loss": 0.6228, "num_tokens": 129873409.0, "step": 637 }, { "epoch": 1.526315789473684, "grad_norm": 0.6278118175282995, "learning_rate": 7.396730848704998e-06, "loss": 0.7113, "num_tokens": 130084999.0, "step": 638 }, { "epoch": 1.5287081339712918, "grad_norm": 0.5683146907948892, "learning_rate": 7.3888185855051496e-06, "loss": 0.8121, "num_tokens": 130319504.0, "step": 639 }, { "epoch": 1.5311004784688995, "grad_norm": 0.5769236478312393, "learning_rate": 7.380899227772071e-06, "loss": 0.6355, "num_tokens": 130571172.0, "step": 640 }, { "epoch": 1.5334928229665072, "grad_norm": 0.6040589218978154, "learning_rate": 7.372972805251418e-06, "loss": 0.7999, "num_tokens": 130790304.0, "step": 641 }, { "epoch": 1.535885167464115, "grad_norm": 0.6889509880785355, "learning_rate": 7.365039347715382e-06, "loss": 0.9458, "num_tokens": 130983743.0, "step": 642 }, { "epoch": 1.5382775119617225, "grad_norm": 0.5518524219982203, "learning_rate": 7.3570988849625776e-06, "loss": 0.6396, "num_tokens": 131223506.0, "step": 643 }, { "epoch": 1.5406698564593302, "grad_norm": 0.6263391549174677, "learning_rate": 7.349151446817933e-06, "loss": 0.7415, "num_tokens": 131422092.0, "step": 644 }, { "epoch": 1.5430622009569377, "grad_norm": 0.6254985326192385, "learning_rate": 7.341197063132578e-06, "loss": 0.5479, "num_tokens": 131611001.0, "step": 645 }, { "epoch": 1.5454545454545454, "grad_norm": 0.7025669116044877, "learning_rate": 7.333235763783723e-06, "loss": 0.8543, "num_tokens": 131798367.0, "step": 646 }, { "epoch": 1.5478468899521531, "grad_norm": 0.6769673977558561, "learning_rate": 7.325267578674562e-06, "loss": 0.7417, "num_tokens": 131982165.0, "step": 647 }, { "epoch": 1.5502392344497609, "grad_norm": 0.6594417737311544, "learning_rate": 7.317292537734146e-06, "loss": 0.7702, "num_tokens": 132179543.0, "step": 648 }, { "epoch": 1.5526315789473686, "grad_norm": 0.5825105050658751, "learning_rate": 7.309310670917284e-06, "loss": 0.869, "num_tokens": 132418034.0, "step": 649 }, { "epoch": 1.555023923444976, "grad_norm": 0.6076102425806628, "learning_rate": 7.301322008204417e-06, "loss": 0.6586, "num_tokens": 132649014.0, "step": 650 }, { "epoch": 1.5574162679425836, "grad_norm": 0.711423037588177, "learning_rate": 7.293326579601516e-06, "loss": 0.7468, "num_tokens": 132803863.0, "step": 651 }, { "epoch": 1.5598086124401913, "grad_norm": 0.7368594597294233, "learning_rate": 7.285324415139963e-06, "loss": 0.8176, "num_tokens": 133026746.0, "step": 652 }, { "epoch": 1.562200956937799, "grad_norm": 0.6550397000696636, "learning_rate": 7.277315544876443e-06, "loss": 0.4705, "num_tokens": 133160363.0, "step": 653 }, { "epoch": 1.5645933014354068, "grad_norm": 0.6169864021795348, "learning_rate": 7.269299998892826e-06, "loss": 0.7017, "num_tokens": 133372416.0, "step": 654 }, { "epoch": 1.5669856459330145, "grad_norm": 0.6851837929301584, "learning_rate": 7.261277807296054e-06, "loss": 0.6455, "num_tokens": 133547830.0, "step": 655 }, { "epoch": 1.569377990430622, "grad_norm": 0.5247519311165174, "learning_rate": 7.253249000218039e-06, "loss": 0.5979, "num_tokens": 133786320.0, "step": 656 }, { "epoch": 1.5717703349282297, "grad_norm": 0.6589549645314666, "learning_rate": 7.245213607815529e-06, "loss": 0.5587, "num_tokens": 133976402.0, "step": 657 }, { "epoch": 1.5741626794258372, "grad_norm": 0.6447471357403248, "learning_rate": 7.237171660270019e-06, "loss": 0.7355, "num_tokens": 134193941.0, "step": 658 }, { "epoch": 1.576555023923445, "grad_norm": 0.5905891091552951, "learning_rate": 7.229123187787618e-06, "loss": 0.7559, "num_tokens": 134406719.0, "step": 659 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5849610167995951, "learning_rate": 7.221068220598944e-06, "loss": 0.5986, "num_tokens": 134606887.0, "step": 660 }, { "epoch": 1.5813397129186604, "grad_norm": 0.6367055646406621, "learning_rate": 7.213006788959013e-06, "loss": 0.7577, "num_tokens": 134802016.0, "step": 661 }, { "epoch": 1.583732057416268, "grad_norm": 0.5923713608410193, "learning_rate": 7.20493892314712e-06, "loss": 0.7655, "num_tokens": 135021088.0, "step": 662 }, { "epoch": 1.5861244019138756, "grad_norm": 0.6459428419359479, "learning_rate": 7.1968646534667245e-06, "loss": 0.9141, "num_tokens": 135244745.0, "step": 663 }, { "epoch": 1.588516746411483, "grad_norm": 0.6394239767985107, "learning_rate": 7.188784010245342e-06, "loss": 0.7753, "num_tokens": 135464277.0, "step": 664 }, { "epoch": 1.5909090909090908, "grad_norm": 0.5549326186185076, "learning_rate": 7.180697023834429e-06, "loss": 0.6716, "num_tokens": 135684135.0, "step": 665 }, { "epoch": 1.5933014354066986, "grad_norm": 0.6647510124923258, "learning_rate": 7.172603724609265e-06, "loss": 0.881, "num_tokens": 135891765.0, "step": 666 }, { "epoch": 1.5956937799043063, "grad_norm": 0.5867761913813149, "learning_rate": 7.164504142968842e-06, "loss": 0.6829, "num_tokens": 136089551.0, "step": 667 }, { "epoch": 1.598086124401914, "grad_norm": 0.7373727808155365, "learning_rate": 7.156398309335748e-06, "loss": 0.656, "num_tokens": 136218480.0, "step": 668 }, { "epoch": 1.6004784688995215, "grad_norm": 0.6654353752622224, "learning_rate": 7.148286254156055e-06, "loss": 0.6759, "num_tokens": 136394386.0, "step": 669 }, { "epoch": 1.6028708133971292, "grad_norm": 0.5865986644953781, "learning_rate": 7.140168007899204e-06, "loss": 0.6801, "num_tokens": 136626576.0, "step": 670 }, { "epoch": 1.6052631578947367, "grad_norm": 0.6059106005903319, "learning_rate": 7.1320436010578885e-06, "loss": 0.748, "num_tokens": 136876727.0, "step": 671 }, { "epoch": 1.6076555023923444, "grad_norm": 0.6863180911732465, "learning_rate": 7.123913064147942e-06, "loss": 0.7255, "num_tokens": 137073722.0, "step": 672 }, { "epoch": 1.6100478468899522, "grad_norm": 0.677113903994993, "learning_rate": 7.115776427708225e-06, "loss": 0.7247, "num_tokens": 137277798.0, "step": 673 }, { "epoch": 1.61244019138756, "grad_norm": 0.6030904942096611, "learning_rate": 7.107633722300504e-06, "loss": 0.8239, "num_tokens": 137541312.0, "step": 674 }, { "epoch": 1.6148325358851676, "grad_norm": 0.658973988174715, "learning_rate": 7.099484978509347e-06, "loss": 0.7388, "num_tokens": 137732486.0, "step": 675 }, { "epoch": 1.6172248803827751, "grad_norm": 0.5504898493296739, "learning_rate": 7.091330226941997e-06, "loss": 0.6264, "num_tokens": 137966847.0, "step": 676 }, { "epoch": 1.6196172248803826, "grad_norm": 0.5982217124976851, "learning_rate": 7.083169498228265e-06, "loss": 0.7082, "num_tokens": 138167570.0, "step": 677 }, { "epoch": 1.6220095693779903, "grad_norm": 0.6922794170445625, "learning_rate": 7.075002823020415e-06, "loss": 0.8387, "num_tokens": 138382643.0, "step": 678 }, { "epoch": 1.624401913875598, "grad_norm": 0.572655561281994, "learning_rate": 7.0668302319930385e-06, "loss": 0.6965, "num_tokens": 138585117.0, "step": 679 }, { "epoch": 1.6267942583732058, "grad_norm": 0.6552294365015775, "learning_rate": 7.0586517558429565e-06, "loss": 0.8051, "num_tokens": 138793586.0, "step": 680 }, { "epoch": 1.6291866028708135, "grad_norm": 0.681546332536658, "learning_rate": 7.05046742528909e-06, "loss": 0.5143, "num_tokens": 138946513.0, "step": 681 }, { "epoch": 1.631578947368421, "grad_norm": 0.663097578505039, "learning_rate": 7.042277271072353e-06, "loss": 0.7839, "num_tokens": 139128215.0, "step": 682 }, { "epoch": 1.6339712918660287, "grad_norm": 0.6866957270444947, "learning_rate": 7.034081323955526e-06, "loss": 0.6465, "num_tokens": 139352216.0, "step": 683 }, { "epoch": 1.6363636363636362, "grad_norm": 0.6068063835514473, "learning_rate": 7.025879614723159e-06, "loss": 0.7776, "num_tokens": 139582137.0, "step": 684 }, { "epoch": 1.638755980861244, "grad_norm": 0.6668218142807971, "learning_rate": 7.017672174181433e-06, "loss": 0.5697, "num_tokens": 139767821.0, "step": 685 }, { "epoch": 1.6411483253588517, "grad_norm": 0.5761416700134772, "learning_rate": 7.009459033158067e-06, "loss": 0.7146, "num_tokens": 140016403.0, "step": 686 }, { "epoch": 1.6435406698564594, "grad_norm": 0.6212350178196538, "learning_rate": 7.001240222502185e-06, "loss": 0.5901, "num_tokens": 140215130.0, "step": 687 }, { "epoch": 1.6459330143540671, "grad_norm": 0.6932671836785006, "learning_rate": 6.993015773084209e-06, "loss": 0.8066, "num_tokens": 140435213.0, "step": 688 }, { "epoch": 1.6483253588516746, "grad_norm": 0.6152901109158736, "learning_rate": 6.984785715795736e-06, "loss": 0.7012, "num_tokens": 140638974.0, "step": 689 }, { "epoch": 1.6507177033492821, "grad_norm": 0.6046502480861653, "learning_rate": 6.976550081549437e-06, "loss": 0.6277, "num_tokens": 140828485.0, "step": 690 }, { "epoch": 1.6531100478468899, "grad_norm": 0.624543382041799, "learning_rate": 6.968308901278918e-06, "loss": 0.7345, "num_tokens": 141024806.0, "step": 691 }, { "epoch": 1.6555023923444976, "grad_norm": 0.5853674718023154, "learning_rate": 6.960062205938625e-06, "loss": 0.5641, "num_tokens": 141226224.0, "step": 692 }, { "epoch": 1.6578947368421053, "grad_norm": 0.5897568318638701, "learning_rate": 6.951810026503716e-06, "loss": 0.6473, "num_tokens": 141456777.0, "step": 693 }, { "epoch": 1.660287081339713, "grad_norm": 0.5965536569799941, "learning_rate": 6.943552393969945e-06, "loss": 0.6568, "num_tokens": 141698392.0, "step": 694 }, { "epoch": 1.6626794258373205, "grad_norm": 0.6216221057717923, "learning_rate": 6.935289339353554e-06, "loss": 0.7735, "num_tokens": 141952515.0, "step": 695 }, { "epoch": 1.6650717703349283, "grad_norm": 0.5628621673381613, "learning_rate": 6.927020893691144e-06, "loss": 0.701, "num_tokens": 142193916.0, "step": 696 }, { "epoch": 1.6674641148325358, "grad_norm": 0.721909783452347, "learning_rate": 6.918747088039573e-06, "loss": 0.7463, "num_tokens": 142382031.0, "step": 697 }, { "epoch": 1.6698564593301435, "grad_norm": 0.6160712988562029, "learning_rate": 6.910467953475825e-06, "loss": 0.6198, "num_tokens": 142589032.0, "step": 698 }, { "epoch": 1.6722488038277512, "grad_norm": 0.5777273958814299, "learning_rate": 6.902183521096903e-06, "loss": 0.6441, "num_tokens": 142816199.0, "step": 699 }, { "epoch": 1.674641148325359, "grad_norm": 0.66250696939839, "learning_rate": 6.8938938220197045e-06, "loss": 0.6611, "num_tokens": 142981069.0, "step": 700 }, { "epoch": 1.6770334928229667, "grad_norm": 0.613928495672255, "learning_rate": 6.885598887380916e-06, "loss": 0.5948, "num_tokens": 143161674.0, "step": 701 }, { "epoch": 1.6794258373205742, "grad_norm": 0.5576396758289864, "learning_rate": 6.877298748336886e-06, "loss": 0.6836, "num_tokens": 143401819.0, "step": 702 }, { "epoch": 1.6818181818181817, "grad_norm": 0.6883792449494041, "learning_rate": 6.868993436063508e-06, "loss": 0.4856, "num_tokens": 143603641.0, "step": 703 }, { "epoch": 1.6842105263157894, "grad_norm": 0.6392186575702383, "learning_rate": 6.860682981756112e-06, "loss": 0.7784, "num_tokens": 143827071.0, "step": 704 }, { "epoch": 1.686602870813397, "grad_norm": 0.6598165642309378, "learning_rate": 6.852367416629336e-06, "loss": 0.8908, "num_tokens": 144047846.0, "step": 705 }, { "epoch": 1.6889952153110048, "grad_norm": 0.6584885917359966, "learning_rate": 6.844046771917021e-06, "loss": 0.4543, "num_tokens": 144225181.0, "step": 706 }, { "epoch": 1.6913875598086126, "grad_norm": 0.5207686938231294, "learning_rate": 6.8357210788720815e-06, "loss": 0.6706, "num_tokens": 144457553.0, "step": 707 }, { "epoch": 1.69377990430622, "grad_norm": 0.5857688524496493, "learning_rate": 6.827390368766399e-06, "loss": 0.5963, "num_tokens": 144700567.0, "step": 708 }, { "epoch": 1.6961722488038278, "grad_norm": 0.6269445216008943, "learning_rate": 6.819054672890694e-06, "loss": 0.6396, "num_tokens": 144898391.0, "step": 709 }, { "epoch": 1.6985645933014353, "grad_norm": 0.681265371552301, "learning_rate": 6.810714022554416e-06, "loss": 0.6671, "num_tokens": 145085594.0, "step": 710 }, { "epoch": 1.700956937799043, "grad_norm": 0.6385132137297136, "learning_rate": 6.802368449085629e-06, "loss": 0.7816, "num_tokens": 145313346.0, "step": 711 }, { "epoch": 1.7033492822966507, "grad_norm": 0.5699021834675809, "learning_rate": 6.79401798383088e-06, "loss": 0.5681, "num_tokens": 145537646.0, "step": 712 }, { "epoch": 1.7057416267942584, "grad_norm": 0.6766184855175188, "learning_rate": 6.785662658155096e-06, "loss": 0.7845, "num_tokens": 145746945.0, "step": 713 }, { "epoch": 1.7081339712918662, "grad_norm": 0.6563572860619684, "learning_rate": 6.777302503441456e-06, "loss": 0.7179, "num_tokens": 145958251.0, "step": 714 }, { "epoch": 1.7105263157894737, "grad_norm": 0.6546511561234387, "learning_rate": 6.768937551091281e-06, "loss": 0.6195, "num_tokens": 146148849.0, "step": 715 }, { "epoch": 1.7129186602870812, "grad_norm": 0.6292286462040894, "learning_rate": 6.760567832523911e-06, "loss": 0.5005, "num_tokens": 146356588.0, "step": 716 }, { "epoch": 1.715311004784689, "grad_norm": 0.6810222536271355, "learning_rate": 6.7521933791765845e-06, "loss": 0.8664, "num_tokens": 146568777.0, "step": 717 }, { "epoch": 1.7177033492822966, "grad_norm": 0.558428668986471, "learning_rate": 6.7438142225043326e-06, "loss": 0.7028, "num_tokens": 146825389.0, "step": 718 }, { "epoch": 1.7200956937799043, "grad_norm": 0.6455640793650751, "learning_rate": 6.735430393979844e-06, "loss": 0.7942, "num_tokens": 147063089.0, "step": 719 }, { "epoch": 1.722488038277512, "grad_norm": 0.6213200830644523, "learning_rate": 6.727041925093356e-06, "loss": 0.5403, "num_tokens": 147245237.0, "step": 720 }, { "epoch": 1.7248803827751196, "grad_norm": 0.6664752095444129, "learning_rate": 6.718648847352544e-06, "loss": 0.6582, "num_tokens": 147420404.0, "step": 721 }, { "epoch": 1.7272727272727273, "grad_norm": 0.7118439039143982, "learning_rate": 6.710251192282383e-06, "loss": 0.7656, "num_tokens": 147611537.0, "step": 722 }, { "epoch": 1.7296650717703348, "grad_norm": 0.7033855462925209, "learning_rate": 6.701848991425048e-06, "loss": 0.8086, "num_tokens": 147792423.0, "step": 723 }, { "epoch": 1.7320574162679425, "grad_norm": 0.5338887128010329, "learning_rate": 6.69344227633979e-06, "loss": 0.6233, "num_tokens": 148036593.0, "step": 724 }, { "epoch": 1.7344497607655502, "grad_norm": 0.627987489446626, "learning_rate": 6.685031078602806e-06, "loss": 0.539, "num_tokens": 148247028.0, "step": 725 }, { "epoch": 1.736842105263158, "grad_norm": 0.6380003491433351, "learning_rate": 6.676615429807139e-06, "loss": 0.7812, "num_tokens": 148445651.0, "step": 726 }, { "epoch": 1.7392344497607657, "grad_norm": 0.6304048306139799, "learning_rate": 6.66819536156255e-06, "loss": 0.7064, "num_tokens": 148649603.0, "step": 727 }, { "epoch": 1.7416267942583732, "grad_norm": 0.6961464516761647, "learning_rate": 6.659770905495395e-06, "loss": 0.8016, "num_tokens": 148826982.0, "step": 728 }, { "epoch": 1.744019138755981, "grad_norm": 0.6297522222022299, "learning_rate": 6.651342093248514e-06, "loss": 0.5682, "num_tokens": 148985475.0, "step": 729 }, { "epoch": 1.7464114832535884, "grad_norm": 0.6740019228321092, "learning_rate": 6.64290895648111e-06, "loss": 0.759, "num_tokens": 149191208.0, "step": 730 }, { "epoch": 1.7488038277511961, "grad_norm": 0.6670699466136719, "learning_rate": 6.634471526868626e-06, "loss": 0.6014, "num_tokens": 149354730.0, "step": 731 }, { "epoch": 1.7511961722488039, "grad_norm": 0.6738246947970548, "learning_rate": 6.626029836102632e-06, "loss": 0.7066, "num_tokens": 149585290.0, "step": 732 }, { "epoch": 1.7535885167464116, "grad_norm": 0.7257178534956213, "learning_rate": 6.6175839158907044e-06, "loss": 0.7729, "num_tokens": 149748808.0, "step": 733 }, { "epoch": 1.755980861244019, "grad_norm": 0.5813926473198574, "learning_rate": 6.609133797956299e-06, "loss": 0.6473, "num_tokens": 149986613.0, "step": 734 }, { "epoch": 1.7583732057416268, "grad_norm": 0.5696351716269552, "learning_rate": 6.600679514038648e-06, "loss": 0.7636, "num_tokens": 150243571.0, "step": 735 }, { "epoch": 1.7607655502392343, "grad_norm": 0.6984568231501039, "learning_rate": 6.592221095892624e-06, "loss": 0.7487, "num_tokens": 150445035.0, "step": 736 }, { "epoch": 1.763157894736842, "grad_norm": 0.7068183140491251, "learning_rate": 6.583758575288629e-06, "loss": 0.7338, "num_tokens": 150613995.0, "step": 737 }, { "epoch": 1.7655502392344498, "grad_norm": 0.5681101022881475, "learning_rate": 6.575291984012479e-06, "loss": 0.6505, "num_tokens": 150888694.0, "step": 738 }, { "epoch": 1.7679425837320575, "grad_norm": 0.6844383241481893, "learning_rate": 6.566821353865274e-06, "loss": 0.8043, "num_tokens": 151074729.0, "step": 739 }, { "epoch": 1.7703349282296652, "grad_norm": 0.6698154744229244, "learning_rate": 6.558346716663288e-06, "loss": 0.564, "num_tokens": 151278234.0, "step": 740 }, { "epoch": 1.7727272727272727, "grad_norm": 0.6931357382617912, "learning_rate": 6.5498681042378436e-06, "loss": 0.8781, "num_tokens": 151454406.0, "step": 741 }, { "epoch": 1.7751196172248804, "grad_norm": 0.547487238351369, "learning_rate": 6.541385548435194e-06, "loss": 0.7641, "num_tokens": 151721146.0, "step": 742 }, { "epoch": 1.777511961722488, "grad_norm": 0.5211188643543073, "learning_rate": 6.532899081116407e-06, "loss": 0.5243, "num_tokens": 151907808.0, "step": 743 }, { "epoch": 1.7799043062200957, "grad_norm": 0.5903546234259213, "learning_rate": 6.524408734157243e-06, "loss": 0.7527, "num_tokens": 152097386.0, "step": 744 }, { "epoch": 1.7822966507177034, "grad_norm": 0.7281834521077791, "learning_rate": 6.515914539448027e-06, "loss": 0.675, "num_tokens": 152251887.0, "step": 745 }, { "epoch": 1.784688995215311, "grad_norm": 0.7070506737237091, "learning_rate": 6.5074165288935465e-06, "loss": 0.7677, "num_tokens": 152427986.0, "step": 746 }, { "epoch": 1.7870813397129188, "grad_norm": 0.6725409977391403, "learning_rate": 6.498914734412914e-06, "loss": 0.7322, "num_tokens": 152632145.0, "step": 747 }, { "epoch": 1.7894736842105263, "grad_norm": 0.6286109645379147, "learning_rate": 6.490409187939461e-06, "loss": 0.533, "num_tokens": 152800360.0, "step": 748 }, { "epoch": 1.7918660287081338, "grad_norm": 0.6577712868510451, "learning_rate": 6.481899921420605e-06, "loss": 0.7657, "num_tokens": 152999064.0, "step": 749 }, { "epoch": 1.7942583732057416, "grad_norm": 0.5492567256359078, "learning_rate": 6.473386966817739e-06, "loss": 0.4693, "num_tokens": 153209749.0, "step": 750 }, { "epoch": 1.7966507177033493, "grad_norm": 0.635209254410753, "learning_rate": 6.464870356106111e-06, "loss": 0.5932, "num_tokens": 153409227.0, "step": 751 }, { "epoch": 1.799043062200957, "grad_norm": 0.5793755424272056, "learning_rate": 6.456350121274701e-06, "loss": 0.5852, "num_tokens": 153636309.0, "step": 752 }, { "epoch": 1.8014354066985647, "grad_norm": 0.5893424243716154, "learning_rate": 6.447826294326096e-06, "loss": 0.5136, "num_tokens": 153827119.0, "step": 753 }, { "epoch": 1.8038277511961722, "grad_norm": 0.6072000657757534, "learning_rate": 6.439298907276383e-06, "loss": 0.6568, "num_tokens": 154010369.0, "step": 754 }, { "epoch": 1.80622009569378, "grad_norm": 0.6585123683717504, "learning_rate": 6.430767992155018e-06, "loss": 0.7205, "num_tokens": 154239396.0, "step": 755 }, { "epoch": 1.8086124401913874, "grad_norm": 0.562448012001657, "learning_rate": 6.422233581004705e-06, "loss": 0.8302, "num_tokens": 154529668.0, "step": 756 }, { "epoch": 1.8110047846889952, "grad_norm": 0.669204687243846, "learning_rate": 6.413695705881285e-06, "loss": 0.755, "num_tokens": 154716614.0, "step": 757 }, { "epoch": 1.813397129186603, "grad_norm": 0.575016512956893, "learning_rate": 6.405154398853607e-06, "loss": 0.587, "num_tokens": 154938736.0, "step": 758 }, { "epoch": 1.8157894736842106, "grad_norm": 0.5777414948415983, "learning_rate": 6.396609692003409e-06, "loss": 0.5674, "num_tokens": 155140701.0, "step": 759 }, { "epoch": 1.8181818181818183, "grad_norm": 0.5963378705676244, "learning_rate": 6.3880616174252045e-06, "loss": 0.5978, "num_tokens": 155310866.0, "step": 760 }, { "epoch": 1.8205741626794258, "grad_norm": 0.6637288549764916, "learning_rate": 6.379510207226149e-06, "loss": 0.7328, "num_tokens": 155490254.0, "step": 761 }, { "epoch": 1.8229665071770333, "grad_norm": 0.6737242450951861, "learning_rate": 6.370955493525934e-06, "loss": 0.597, "num_tokens": 155671785.0, "step": 762 }, { "epoch": 1.825358851674641, "grad_norm": 0.6542151896826175, "learning_rate": 6.362397508456652e-06, "loss": 0.656, "num_tokens": 155902312.0, "step": 763 }, { "epoch": 1.8277511961722488, "grad_norm": 0.5973672724693193, "learning_rate": 6.353836284162689e-06, "loss": 0.6636, "num_tokens": 156085880.0, "step": 764 }, { "epoch": 1.8301435406698565, "grad_norm": 0.6154944777126776, "learning_rate": 6.345271852800594e-06, "loss": 0.7481, "num_tokens": 156281186.0, "step": 765 }, { "epoch": 1.8325358851674642, "grad_norm": 0.6338112989470145, "learning_rate": 6.336704246538967e-06, "loss": 0.6602, "num_tokens": 156491235.0, "step": 766 }, { "epoch": 1.8349282296650717, "grad_norm": 0.6531681516448476, "learning_rate": 6.328133497558326e-06, "loss": 0.6314, "num_tokens": 156685969.0, "step": 767 }, { "epoch": 1.8373205741626795, "grad_norm": 0.6708022721442888, "learning_rate": 6.319559638050998e-06, "loss": 0.758, "num_tokens": 156859214.0, "step": 768 }, { "epoch": 1.839712918660287, "grad_norm": 0.6556721794330259, "learning_rate": 6.310982700220993e-06, "loss": 0.7323, "num_tokens": 157060937.0, "step": 769 }, { "epoch": 1.8421052631578947, "grad_norm": 0.6578595057293558, "learning_rate": 6.302402716283883e-06, "loss": 0.9535, "num_tokens": 157261078.0, "step": 770 }, { "epoch": 1.8444976076555024, "grad_norm": 0.7148099674142656, "learning_rate": 6.293819718466679e-06, "loss": 0.6388, "num_tokens": 157414683.0, "step": 771 }, { "epoch": 1.8468899521531101, "grad_norm": 0.7238377114336406, "learning_rate": 6.2852337390077165e-06, "loss": 0.6809, "num_tokens": 157602306.0, "step": 772 }, { "epoch": 1.8492822966507179, "grad_norm": 0.7749663077031087, "learning_rate": 6.276644810156526e-06, "loss": 0.8122, "num_tokens": 157779076.0, "step": 773 }, { "epoch": 1.8516746411483254, "grad_norm": 0.6612435155051035, "learning_rate": 6.26805296417372e-06, "loss": 0.6311, "num_tokens": 157976364.0, "step": 774 }, { "epoch": 1.8540669856459329, "grad_norm": 0.5573801868109426, "learning_rate": 6.259458233330865e-06, "loss": 0.7612, "num_tokens": 158225906.0, "step": 775 }, { "epoch": 1.8564593301435406, "grad_norm": 0.6210661879260847, "learning_rate": 6.250860649910364e-06, "loss": 0.7606, "num_tokens": 158456010.0, "step": 776 }, { "epoch": 1.8588516746411483, "grad_norm": 0.5965280962073838, "learning_rate": 6.242260246205336e-06, "loss": 0.8011, "num_tokens": 158725023.0, "step": 777 }, { "epoch": 1.861244019138756, "grad_norm": 0.5411360641638349, "learning_rate": 6.233657054519493e-06, "loss": 0.7206, "num_tokens": 158972556.0, "step": 778 }, { "epoch": 1.8636363636363638, "grad_norm": 0.6880950857454029, "learning_rate": 6.225051107167016e-06, "loss": 0.5751, "num_tokens": 159137291.0, "step": 779 }, { "epoch": 1.8660287081339713, "grad_norm": 0.6732364239510688, "learning_rate": 6.21644243647244e-06, "loss": 0.8581, "num_tokens": 159304892.0, "step": 780 }, { "epoch": 1.868421052631579, "grad_norm": 0.7033409732295164, "learning_rate": 6.207831074770525e-06, "loss": 0.7957, "num_tokens": 159477762.0, "step": 781 }, { "epoch": 1.8708133971291865, "grad_norm": 0.6669767550877167, "learning_rate": 6.199217054406144e-06, "loss": 0.7279, "num_tokens": 159692236.0, "step": 782 }, { "epoch": 1.8732057416267942, "grad_norm": 0.6329334013965955, "learning_rate": 6.190600407734152e-06, "loss": 0.4734, "num_tokens": 159861359.0, "step": 783 }, { "epoch": 1.875598086124402, "grad_norm": 0.6108528654038816, "learning_rate": 6.181981167119268e-06, "loss": 0.7286, "num_tokens": 160078902.0, "step": 784 }, { "epoch": 1.8779904306220097, "grad_norm": 0.7284756956486506, "learning_rate": 6.173359364935962e-06, "loss": 0.8924, "num_tokens": 160288294.0, "step": 785 }, { "epoch": 1.8803827751196174, "grad_norm": 0.7028581335759021, "learning_rate": 6.164735033568311e-06, "loss": 0.6249, "num_tokens": 160447101.0, "step": 786 }, { "epoch": 1.8827751196172249, "grad_norm": 0.5681307594359682, "learning_rate": 6.156108205409905e-06, "loss": 0.6184, "num_tokens": 160648475.0, "step": 787 }, { "epoch": 1.8851674641148324, "grad_norm": 0.6548663136064242, "learning_rate": 6.147478912863706e-06, "loss": 0.8074, "num_tokens": 160834767.0, "step": 788 }, { "epoch": 1.88755980861244, "grad_norm": 0.5766678861349332, "learning_rate": 6.138847188341933e-06, "loss": 0.6363, "num_tokens": 161064024.0, "step": 789 }, { "epoch": 1.8899521531100478, "grad_norm": 0.6277507803468657, "learning_rate": 6.130213064265942e-06, "loss": 0.6889, "num_tokens": 161292692.0, "step": 790 }, { "epoch": 1.8923444976076556, "grad_norm": 0.6427100164026258, "learning_rate": 6.121576573066097e-06, "loss": 0.5882, "num_tokens": 161490801.0, "step": 791 }, { "epoch": 1.8947368421052633, "grad_norm": 0.5690909727573779, "learning_rate": 6.112937747181658e-06, "loss": 0.4301, "num_tokens": 161670780.0, "step": 792 }, { "epoch": 1.8971291866028708, "grad_norm": 0.636924652289299, "learning_rate": 6.104296619060654e-06, "loss": 0.5616, "num_tokens": 161876992.0, "step": 793 }, { "epoch": 1.8995215311004785, "grad_norm": 0.6100826828491932, "learning_rate": 6.095653221159758e-06, "loss": 0.6796, "num_tokens": 162076563.0, "step": 794 }, { "epoch": 1.901913875598086, "grad_norm": 0.6373158305391043, "learning_rate": 6.08700758594417e-06, "loss": 0.7603, "num_tokens": 162295731.0, "step": 795 }, { "epoch": 1.9043062200956937, "grad_norm": 0.6270935536574143, "learning_rate": 6.078359745887496e-06, "loss": 0.7324, "num_tokens": 162463047.0, "step": 796 }, { "epoch": 1.9066985645933014, "grad_norm": 0.6888041390063913, "learning_rate": 6.069709733471618e-06, "loss": 0.7396, "num_tokens": 162648928.0, "step": 797 }, { "epoch": 1.9090909090909092, "grad_norm": 0.5453051524568404, "learning_rate": 6.061057581186583e-06, "loss": 0.7578, "num_tokens": 162890874.0, "step": 798 }, { "epoch": 1.911483253588517, "grad_norm": 0.5878553603380681, "learning_rate": 6.052403321530473e-06, "loss": 0.7789, "num_tokens": 163147285.0, "step": 799 }, { "epoch": 1.9138755980861244, "grad_norm": 0.5748854305021395, "learning_rate": 6.043746987009287e-06, "loss": 0.3847, "num_tokens": 163293805.0, "step": 800 }, { "epoch": 1.916267942583732, "grad_norm": 0.6041052961662696, "learning_rate": 6.035088610136814e-06, "loss": 0.7391, "num_tokens": 163512727.0, "step": 801 }, { "epoch": 1.9186602870813396, "grad_norm": 0.623206001139848, "learning_rate": 6.026428223434515e-06, "loss": 0.569, "num_tokens": 163731841.0, "step": 802 }, { "epoch": 1.9210526315789473, "grad_norm": 0.595797010000766, "learning_rate": 6.017765859431405e-06, "loss": 0.6446, "num_tokens": 163944213.0, "step": 803 }, { "epoch": 1.923444976076555, "grad_norm": 0.6770597646109086, "learning_rate": 6.00910155066392e-06, "loss": 0.6816, "num_tokens": 164125249.0, "step": 804 }, { "epoch": 1.9258373205741628, "grad_norm": 0.6216999025378627, "learning_rate": 6.000435329675804e-06, "loss": 0.5176, "num_tokens": 164287594.0, "step": 805 }, { "epoch": 1.9282296650717703, "grad_norm": 0.5779829552728587, "learning_rate": 5.991767229017982e-06, "loss": 0.6527, "num_tokens": 164504811.0, "step": 806 }, { "epoch": 1.930622009569378, "grad_norm": 0.5999659117860371, "learning_rate": 5.983097281248439e-06, "loss": 0.6139, "num_tokens": 164700832.0, "step": 807 }, { "epoch": 1.9330143540669855, "grad_norm": 0.6925949959790602, "learning_rate": 5.974425518932099e-06, "loss": 0.736, "num_tokens": 164855500.0, "step": 808 }, { "epoch": 1.9354066985645932, "grad_norm": 0.664280629821679, "learning_rate": 5.9657519746407e-06, "loss": 0.7058, "num_tokens": 165081012.0, "step": 809 }, { "epoch": 1.937799043062201, "grad_norm": 0.6294916529959589, "learning_rate": 5.957076680952676e-06, "loss": 0.7708, "num_tokens": 165321183.0, "step": 810 }, { "epoch": 1.9401913875598087, "grad_norm": 0.613260914286205, "learning_rate": 5.948399670453029e-06, "loss": 0.6023, "num_tokens": 165510499.0, "step": 811 }, { "epoch": 1.9425837320574164, "grad_norm": 0.7432131661007875, "learning_rate": 5.93972097573321e-06, "loss": 0.6179, "num_tokens": 165680826.0, "step": 812 }, { "epoch": 1.944976076555024, "grad_norm": 0.595133384251671, "learning_rate": 5.9310406293909976e-06, "loss": 0.5926, "num_tokens": 165855759.0, "step": 813 }, { "epoch": 1.9473684210526314, "grad_norm": 0.5642473636840877, "learning_rate": 5.922358664030371e-06, "loss": 0.685, "num_tokens": 166088776.0, "step": 814 }, { "epoch": 1.9497607655502391, "grad_norm": 0.5440878709758462, "learning_rate": 5.913675112261395e-06, "loss": 0.5341, "num_tokens": 166310945.0, "step": 815 }, { "epoch": 1.9521531100478469, "grad_norm": 0.604649800399483, "learning_rate": 5.9049900067000886e-06, "loss": 0.5747, "num_tokens": 166528617.0, "step": 816 }, { "epoch": 1.9545454545454546, "grad_norm": 0.6751299088807625, "learning_rate": 5.896303379968311e-06, "loss": 0.7819, "num_tokens": 166715195.0, "step": 817 }, { "epoch": 1.9569377990430623, "grad_norm": 0.5878703194902412, "learning_rate": 5.8876152646936315e-06, "loss": 0.5685, "num_tokens": 166872201.0, "step": 818 }, { "epoch": 1.9593301435406698, "grad_norm": 0.6850872373141852, "learning_rate": 5.878925693509211e-06, "loss": 0.7066, "num_tokens": 167083751.0, "step": 819 }, { "epoch": 1.9617224880382775, "grad_norm": 0.721599338200044, "learning_rate": 5.870234699053681e-06, "loss": 0.7546, "num_tokens": 167316838.0, "step": 820 }, { "epoch": 1.964114832535885, "grad_norm": 0.5264357129866483, "learning_rate": 5.8615423139710184e-06, "loss": 0.3273, "num_tokens": 167537637.0, "step": 821 }, { "epoch": 1.9665071770334928, "grad_norm": 0.7601910916603852, "learning_rate": 5.852848570910421e-06, "loss": 0.863, "num_tokens": 167727717.0, "step": 822 }, { "epoch": 1.9688995215311005, "grad_norm": 0.5993187893093678, "learning_rate": 5.8441535025261896e-06, "loss": 0.7109, "num_tokens": 167936009.0, "step": 823 }, { "epoch": 1.9712918660287082, "grad_norm": 0.7913031835854982, "learning_rate": 5.835457141477604e-06, "loss": 0.8485, "num_tokens": 168096748.0, "step": 824 }, { "epoch": 1.973684210526316, "grad_norm": 0.5170452980689251, "learning_rate": 5.826759520428796e-06, "loss": 0.53, "num_tokens": 168321952.0, "step": 825 }, { "epoch": 1.9760765550239234, "grad_norm": 0.5512288566093851, "learning_rate": 5.818060672048633e-06, "loss": 0.3594, "num_tokens": 168487009.0, "step": 826 }, { "epoch": 1.9784688995215312, "grad_norm": 0.6451763029149299, "learning_rate": 5.809360629010587e-06, "loss": 0.6353, "num_tokens": 168700356.0, "step": 827 }, { "epoch": 1.9808612440191387, "grad_norm": 0.6873439326093063, "learning_rate": 5.800659423992627e-06, "loss": 0.7417, "num_tokens": 168901226.0, "step": 828 }, { "epoch": 1.9832535885167464, "grad_norm": 0.7057939062681675, "learning_rate": 5.791957089677077e-06, "loss": 0.7176, "num_tokens": 169106405.0, "step": 829 }, { "epoch": 1.985645933014354, "grad_norm": 0.6718548592817507, "learning_rate": 5.783253658750508e-06, "loss": 0.3632, "num_tokens": 169241118.0, "step": 830 }, { "epoch": 1.9880382775119618, "grad_norm": 0.5635631634019334, "learning_rate": 5.774549163903609e-06, "loss": 0.5848, "num_tokens": 169443391.0, "step": 831 }, { "epoch": 1.9904306220095693, "grad_norm": 0.5979096434267269, "learning_rate": 5.765843637831064e-06, "loss": 0.6669, "num_tokens": 169656881.0, "step": 832 }, { "epoch": 1.992822966507177, "grad_norm": 0.6955815014808082, "learning_rate": 5.7571371132314295e-06, "loss": 0.7672, "num_tokens": 169884159.0, "step": 833 }, { "epoch": 1.9952153110047846, "grad_norm": 0.544509544292357, "learning_rate": 5.748429622807016e-06, "loss": 0.5541, "num_tokens": 170118387.0, "step": 834 }, { "epoch": 1.9976076555023923, "grad_norm": 0.5592361506310813, "learning_rate": 5.739721199263759e-06, "loss": 0.6119, "num_tokens": 170306703.0, "step": 835 }, { "epoch": 2.0, "grad_norm": 0.6908089161991803, "learning_rate": 5.731011875311099e-06, "loss": 0.8147, "num_tokens": 170479057.0, "step": 836 }, { "epoch": 2.0, "eval_loss": 0.45707619190216064, "eval_num_tokens": 170479057.0, "eval_runtime": 109.3619, "eval_samples_per_second": 27.176, "eval_steps_per_second": 3.402, "step": 836 }, { "epoch": 2.0023923444976077, "grad_norm": 0.7010101792981096, "learning_rate": 5.722301683661859e-06, "loss": 0.5802, "num_tokens": 170662391.0, "step": 837 }, { "epoch": 2.0047846889952154, "grad_norm": 0.536459345119045, "learning_rate": 5.713590657032124e-06, "loss": 0.6956, "num_tokens": 170945306.0, "step": 838 }, { "epoch": 2.007177033492823, "grad_norm": 0.6865598520551047, "learning_rate": 5.704878828141108e-06, "loss": 0.4844, "num_tokens": 171123685.0, "step": 839 }, { "epoch": 2.0095693779904304, "grad_norm": 0.6556001284101048, "learning_rate": 5.696166229711045e-06, "loss": 0.4734, "num_tokens": 171366956.0, "step": 840 }, { "epoch": 2.011961722488038, "grad_norm": 0.7304372391236456, "learning_rate": 5.687452894467058e-06, "loss": 0.4759, "num_tokens": 171503225.0, "step": 841 }, { "epoch": 2.014354066985646, "grad_norm": 0.5893145169524098, "learning_rate": 5.678738855137037e-06, "loss": 0.5807, "num_tokens": 171715062.0, "step": 842 }, { "epoch": 2.0167464114832536, "grad_norm": 0.6891485855481355, "learning_rate": 5.6700241444515146e-06, "loss": 0.7269, "num_tokens": 171932247.0, "step": 843 }, { "epoch": 2.0191387559808613, "grad_norm": 0.7002346069882126, "learning_rate": 5.661308795143546e-06, "loss": 0.5333, "num_tokens": 172113046.0, "step": 844 }, { "epoch": 2.021531100478469, "grad_norm": 0.6197702947732776, "learning_rate": 5.6525928399485895e-06, "loss": 0.5762, "num_tokens": 172357016.0, "step": 845 }, { "epoch": 2.0239234449760763, "grad_norm": 0.6387029634051665, "learning_rate": 5.643876311604371e-06, "loss": 0.7473, "num_tokens": 172559033.0, "step": 846 }, { "epoch": 2.026315789473684, "grad_norm": 0.7098536589133576, "learning_rate": 5.635159242850779e-06, "loss": 0.5927, "num_tokens": 172732819.0, "step": 847 }, { "epoch": 2.028708133971292, "grad_norm": 0.678135657260688, "learning_rate": 5.626441666429724e-06, "loss": 0.5892, "num_tokens": 172932324.0, "step": 848 }, { "epoch": 2.0311004784688995, "grad_norm": 0.583893760377583, "learning_rate": 5.6177236150850246e-06, "loss": 0.5486, "num_tokens": 173186930.0, "step": 849 }, { "epoch": 2.0334928229665072, "grad_norm": 0.757431329558552, "learning_rate": 5.609005121562285e-06, "loss": 0.6904, "num_tokens": 173388905.0, "step": 850 }, { "epoch": 2.035885167464115, "grad_norm": 0.6637010404138924, "learning_rate": 5.600286218608773e-06, "loss": 0.6561, "num_tokens": 173594963.0, "step": 851 }, { "epoch": 2.0382775119617227, "grad_norm": 0.6278760783639207, "learning_rate": 5.591566938973287e-06, "loss": 0.5484, "num_tokens": 173764506.0, "step": 852 }, { "epoch": 2.04066985645933, "grad_norm": 0.7164186834391592, "learning_rate": 5.582847315406045e-06, "loss": 0.604, "num_tokens": 173941988.0, "step": 853 }, { "epoch": 2.0430622009569377, "grad_norm": 0.5957046275665503, "learning_rate": 5.5741273806585575e-06, "loss": 0.652, "num_tokens": 174185773.0, "step": 854 }, { "epoch": 2.0454545454545454, "grad_norm": 0.6429557814193617, "learning_rate": 5.565407167483501e-06, "loss": 0.6522, "num_tokens": 174410841.0, "step": 855 }, { "epoch": 2.047846889952153, "grad_norm": 0.6124880169897995, "learning_rate": 5.556686708634602e-06, "loss": 0.5845, "num_tokens": 174594347.0, "step": 856 }, { "epoch": 2.050239234449761, "grad_norm": 0.6247892510841981, "learning_rate": 5.547966036866503e-06, "loss": 0.499, "num_tokens": 174806968.0, "step": 857 }, { "epoch": 2.0526315789473686, "grad_norm": 0.5731204912599471, "learning_rate": 5.539245184934654e-06, "loss": 0.5916, "num_tokens": 175015537.0, "step": 858 }, { "epoch": 2.055023923444976, "grad_norm": 0.6406016610173918, "learning_rate": 5.530524185595178e-06, "loss": 0.7588, "num_tokens": 175285596.0, "step": 859 }, { "epoch": 2.0574162679425836, "grad_norm": 0.6939904180549464, "learning_rate": 5.521803071604747e-06, "loss": 0.4302, "num_tokens": 175407601.0, "step": 860 }, { "epoch": 2.0598086124401913, "grad_norm": 0.5840171644741489, "learning_rate": 5.513081875720472e-06, "loss": 0.5561, "num_tokens": 175617977.0, "step": 861 }, { "epoch": 2.062200956937799, "grad_norm": 0.5280383888463869, "learning_rate": 5.5043606306997685e-06, "loss": 0.4372, "num_tokens": 175862507.0, "step": 862 }, { "epoch": 2.0645933014354068, "grad_norm": 0.6746604030251611, "learning_rate": 5.495639369300233e-06, "loss": 0.5175, "num_tokens": 176069325.0, "step": 863 }, { "epoch": 2.0669856459330145, "grad_norm": 0.5327219079193565, "learning_rate": 5.4869181242795285e-06, "loss": 0.5431, "num_tokens": 176319089.0, "step": 864 }, { "epoch": 2.069377990430622, "grad_norm": 0.654611728144275, "learning_rate": 5.4781969283952554e-06, "loss": 0.5751, "num_tokens": 176544855.0, "step": 865 }, { "epoch": 2.0717703349282295, "grad_norm": 0.6697005720590413, "learning_rate": 5.469475814404825e-06, "loss": 0.5246, "num_tokens": 176743145.0, "step": 866 }, { "epoch": 2.074162679425837, "grad_norm": 0.6312841831223603, "learning_rate": 5.4607548150653464e-06, "loss": 0.4315, "num_tokens": 176942006.0, "step": 867 }, { "epoch": 2.076555023923445, "grad_norm": 0.5152891194242731, "learning_rate": 5.452033963133498e-06, "loss": 0.4911, "num_tokens": 177145857.0, "step": 868 }, { "epoch": 2.0789473684210527, "grad_norm": 0.5983945019880107, "learning_rate": 5.4433132913654e-06, "loss": 0.5751, "num_tokens": 177354826.0, "step": 869 }, { "epoch": 2.0813397129186604, "grad_norm": 0.5695605383121004, "learning_rate": 5.4345928325164985e-06, "loss": 0.6437, "num_tokens": 177568066.0, "step": 870 }, { "epoch": 2.083732057416268, "grad_norm": 0.577493281390095, "learning_rate": 5.425872619341445e-06, "loss": 0.6113, "num_tokens": 177785516.0, "step": 871 }, { "epoch": 2.0861244019138754, "grad_norm": 0.577110713494551, "learning_rate": 5.4171526845939584e-06, "loss": 0.5294, "num_tokens": 177989703.0, "step": 872 }, { "epoch": 2.088516746411483, "grad_norm": 0.5129404785846571, "learning_rate": 5.4084330610267156e-06, "loss": 0.6171, "num_tokens": 178259977.0, "step": 873 }, { "epoch": 2.090909090909091, "grad_norm": 0.6305465119747049, "learning_rate": 5.39971378139123e-06, "loss": 0.6896, "num_tokens": 178511083.0, "step": 874 }, { "epoch": 2.0933014354066986, "grad_norm": 0.5840406916165961, "learning_rate": 5.390994878437716e-06, "loss": 0.6089, "num_tokens": 178723973.0, "step": 875 }, { "epoch": 2.0956937799043063, "grad_norm": 0.5161608116725473, "learning_rate": 5.382276384914978e-06, "loss": 0.6667, "num_tokens": 178986671.0, "step": 876 }, { "epoch": 2.098086124401914, "grad_norm": 0.6030252688774056, "learning_rate": 5.373558333570278e-06, "loss": 0.5266, "num_tokens": 179184109.0, "step": 877 }, { "epoch": 2.1004784688995217, "grad_norm": 0.5056079079906192, "learning_rate": 5.364840757149222e-06, "loss": 0.5745, "num_tokens": 179477427.0, "step": 878 }, { "epoch": 2.102870813397129, "grad_norm": 0.5469146485955257, "learning_rate": 5.356123688395629e-06, "loss": 0.4426, "num_tokens": 179663693.0, "step": 879 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5298879105875276, "learning_rate": 5.347407160051414e-06, "loss": 0.456, "num_tokens": 179868101.0, "step": 880 }, { "epoch": 2.1076555023923444, "grad_norm": 0.6377883062253229, "learning_rate": 5.338691204856455e-06, "loss": 0.5953, "num_tokens": 180104805.0, "step": 881 }, { "epoch": 2.110047846889952, "grad_norm": 0.632649188788054, "learning_rate": 5.3299758555484885e-06, "loss": 0.747, "num_tokens": 180330338.0, "step": 882 }, { "epoch": 2.11244019138756, "grad_norm": 0.6906228668304801, "learning_rate": 5.321261144862964e-06, "loss": 0.6373, "num_tokens": 180518149.0, "step": 883 }, { "epoch": 2.1148325358851676, "grad_norm": 0.5765219468106015, "learning_rate": 5.312547105532943e-06, "loss": 0.6644, "num_tokens": 180744284.0, "step": 884 }, { "epoch": 2.117224880382775, "grad_norm": 0.5228583600905762, "learning_rate": 5.303833770288955e-06, "loss": 0.5296, "num_tokens": 180984196.0, "step": 885 }, { "epoch": 2.1196172248803826, "grad_norm": 0.5378436640957062, "learning_rate": 5.295121171858893e-06, "loss": 0.3444, "num_tokens": 181190172.0, "step": 886 }, { "epoch": 2.1220095693779903, "grad_norm": 0.6509631304472094, "learning_rate": 5.286409342967878e-06, "loss": 0.7, "num_tokens": 181401483.0, "step": 887 }, { "epoch": 2.124401913875598, "grad_norm": 0.7172404422129441, "learning_rate": 5.277698316338142e-06, "loss": 0.6491, "num_tokens": 181608904.0, "step": 888 }, { "epoch": 2.126794258373206, "grad_norm": 0.7620587247682503, "learning_rate": 5.268988124688903e-06, "loss": 0.7506, "num_tokens": 181778389.0, "step": 889 }, { "epoch": 2.1291866028708135, "grad_norm": 0.6994356608870241, "learning_rate": 5.260278800736242e-06, "loss": 0.7422, "num_tokens": 181977883.0, "step": 890 }, { "epoch": 2.1315789473684212, "grad_norm": 0.6228038727199776, "learning_rate": 5.251570377192987e-06, "loss": 0.5658, "num_tokens": 182162343.0, "step": 891 }, { "epoch": 2.1339712918660285, "grad_norm": 0.7318448075698833, "learning_rate": 5.242862886768572e-06, "loss": 0.7979, "num_tokens": 182343690.0, "step": 892 }, { "epoch": 2.1363636363636362, "grad_norm": 0.6176672020032611, "learning_rate": 5.234156362168939e-06, "loss": 0.5408, "num_tokens": 182542228.0, "step": 893 }, { "epoch": 2.138755980861244, "grad_norm": 0.6830141376665197, "learning_rate": 5.225450836096392e-06, "loss": 0.7719, "num_tokens": 182755030.0, "step": 894 }, { "epoch": 2.1411483253588517, "grad_norm": 0.5870134491180863, "learning_rate": 5.216746341249493e-06, "loss": 0.4887, "num_tokens": 182947312.0, "step": 895 }, { "epoch": 2.1435406698564594, "grad_norm": 0.6325304186483628, "learning_rate": 5.208042910322924e-06, "loss": 0.5166, "num_tokens": 183110659.0, "step": 896 }, { "epoch": 2.145933014354067, "grad_norm": 0.6979115952850606, "learning_rate": 5.199340576007374e-06, "loss": 0.7542, "num_tokens": 183316493.0, "step": 897 }, { "epoch": 2.1483253588516744, "grad_norm": 0.7444990538919707, "learning_rate": 5.1906393709894135e-06, "loss": 0.5085, "num_tokens": 183475125.0, "step": 898 }, { "epoch": 2.150717703349282, "grad_norm": 0.6378799593192597, "learning_rate": 5.18193932795137e-06, "loss": 0.7381, "num_tokens": 183711701.0, "step": 899 }, { "epoch": 2.15311004784689, "grad_norm": 0.6736618720341914, "learning_rate": 5.173240479571206e-06, "loss": 0.5898, "num_tokens": 183881633.0, "step": 900 }, { "epoch": 2.1555023923444976, "grad_norm": 0.6220359607877941, "learning_rate": 5.164542858522398e-06, "loss": 0.6882, "num_tokens": 184115049.0, "step": 901 }, { "epoch": 2.1578947368421053, "grad_norm": 0.538351043125533, "learning_rate": 5.155846497473811e-06, "loss": 0.387, "num_tokens": 184287354.0, "step": 902 }, { "epoch": 2.160287081339713, "grad_norm": 0.5642599124571198, "learning_rate": 5.147151429089581e-06, "loss": 0.484, "num_tokens": 184508937.0, "step": 903 }, { "epoch": 2.1626794258373208, "grad_norm": 0.6001870881012696, "learning_rate": 5.138457686028985e-06, "loss": 0.5965, "num_tokens": 184748958.0, "step": 904 }, { "epoch": 2.165071770334928, "grad_norm": 0.6114874191151893, "learning_rate": 5.12976530094632e-06, "loss": 0.4673, "num_tokens": 184918082.0, "step": 905 }, { "epoch": 2.1674641148325358, "grad_norm": 0.5601252936577793, "learning_rate": 5.121074306490792e-06, "loss": 0.6114, "num_tokens": 185142287.0, "step": 906 }, { "epoch": 2.1698564593301435, "grad_norm": 0.6169147349809473, "learning_rate": 5.112384735306371e-06, "loss": 0.5461, "num_tokens": 185293698.0, "step": 907 }, { "epoch": 2.172248803827751, "grad_norm": 0.6195660313799543, "learning_rate": 5.103696620031692e-06, "loss": 0.6711, "num_tokens": 185499307.0, "step": 908 }, { "epoch": 2.174641148325359, "grad_norm": 0.6031245154149687, "learning_rate": 5.095009993299912e-06, "loss": 0.6294, "num_tokens": 185720997.0, "step": 909 }, { "epoch": 2.1770334928229667, "grad_norm": 0.45150169395352036, "learning_rate": 5.086324887738608e-06, "loss": 0.4616, "num_tokens": 185996237.0, "step": 910 }, { "epoch": 2.179425837320574, "grad_norm": 0.5937895068578424, "learning_rate": 5.07764133596963e-06, "loss": 0.6337, "num_tokens": 186217389.0, "step": 911 }, { "epoch": 2.1818181818181817, "grad_norm": 0.6066999652282524, "learning_rate": 5.068959370609005e-06, "loss": 0.5495, "num_tokens": 186427020.0, "step": 912 }, { "epoch": 2.1842105263157894, "grad_norm": 0.6371757752607746, "learning_rate": 5.060279024266791e-06, "loss": 0.5121, "num_tokens": 186621807.0, "step": 913 }, { "epoch": 2.186602870813397, "grad_norm": 0.5354988636020794, "learning_rate": 5.051600329546973e-06, "loss": 0.4376, "num_tokens": 186845555.0, "step": 914 }, { "epoch": 2.188995215311005, "grad_norm": 0.6984023668454643, "learning_rate": 5.042923319047325e-06, "loss": 0.6887, "num_tokens": 187042423.0, "step": 915 }, { "epoch": 2.1913875598086126, "grad_norm": 0.5999427476898559, "learning_rate": 5.034248025359302e-06, "loss": 0.5246, "num_tokens": 187225861.0, "step": 916 }, { "epoch": 2.1937799043062203, "grad_norm": 0.5612113652706427, "learning_rate": 5.025574481067904e-06, "loss": 0.5317, "num_tokens": 187424793.0, "step": 917 }, { "epoch": 2.1961722488038276, "grad_norm": 0.5751247397201221, "learning_rate": 5.016902718751563e-06, "loss": 0.5002, "num_tokens": 187616947.0, "step": 918 }, { "epoch": 2.1985645933014353, "grad_norm": 0.5933499014582654, "learning_rate": 5.008232770982021e-06, "loss": 0.6113, "num_tokens": 187859226.0, "step": 919 }, { "epoch": 2.200956937799043, "grad_norm": 0.6725228864556276, "learning_rate": 4.999564670324197e-06, "loss": 0.5584, "num_tokens": 188017748.0, "step": 920 }, { "epoch": 2.2033492822966507, "grad_norm": 0.6885249368514662, "learning_rate": 4.990898449336082e-06, "loss": 0.7148, "num_tokens": 188233000.0, "step": 921 }, { "epoch": 2.2057416267942584, "grad_norm": 0.7099706677726074, "learning_rate": 4.9822341405685955e-06, "loss": 0.7334, "num_tokens": 188417806.0, "step": 922 }, { "epoch": 2.208133971291866, "grad_norm": 0.5668827507723063, "learning_rate": 4.973571776565487e-06, "loss": 0.5281, "num_tokens": 188647820.0, "step": 923 }, { "epoch": 2.2105263157894735, "grad_norm": 0.5687915556773702, "learning_rate": 4.964911389863187e-06, "loss": 0.571, "num_tokens": 188857361.0, "step": 924 }, { "epoch": 2.212918660287081, "grad_norm": 0.5716920970399307, "learning_rate": 4.956253012990715e-06, "loss": 0.5454, "num_tokens": 189077250.0, "step": 925 }, { "epoch": 2.215311004784689, "grad_norm": 0.5520991021275099, "learning_rate": 4.947596678469527e-06, "loss": 0.6517, "num_tokens": 189332723.0, "step": 926 }, { "epoch": 2.2177033492822966, "grad_norm": 0.6837998974956843, "learning_rate": 4.938942418813417e-06, "loss": 0.6381, "num_tokens": 189525225.0, "step": 927 }, { "epoch": 2.2200956937799043, "grad_norm": 0.5976614086012115, "learning_rate": 4.930290266528383e-06, "loss": 0.5666, "num_tokens": 189744774.0, "step": 928 }, { "epoch": 2.222488038277512, "grad_norm": 0.6648157340151367, "learning_rate": 4.9216402541125064e-06, "loss": 0.5748, "num_tokens": 189898670.0, "step": 929 }, { "epoch": 2.22488038277512, "grad_norm": 0.5695867945010721, "learning_rate": 4.9129924140558305e-06, "loss": 0.5266, "num_tokens": 190078320.0, "step": 930 }, { "epoch": 2.227272727272727, "grad_norm": 0.640940270681438, "learning_rate": 4.904346778840243e-06, "loss": 0.722, "num_tokens": 190324800.0, "step": 931 }, { "epoch": 2.229665071770335, "grad_norm": 0.6115300277298661, "learning_rate": 4.895703380939348e-06, "loss": 0.6223, "num_tokens": 190526042.0, "step": 932 }, { "epoch": 2.2320574162679425, "grad_norm": 0.6078403150041939, "learning_rate": 4.887062252818343e-06, "loss": 0.6288, "num_tokens": 190763868.0, "step": 933 }, { "epoch": 2.2344497607655502, "grad_norm": 0.491734778374236, "learning_rate": 4.878423426933906e-06, "loss": 0.4818, "num_tokens": 190990352.0, "step": 934 }, { "epoch": 2.236842105263158, "grad_norm": 0.7432657101202276, "learning_rate": 4.869786935734061e-06, "loss": 0.567, "num_tokens": 191134359.0, "step": 935 }, { "epoch": 2.2392344497607657, "grad_norm": 0.6351576528189151, "learning_rate": 4.861152811658069e-06, "loss": 0.4386, "num_tokens": 191303415.0, "step": 936 }, { "epoch": 2.2416267942583734, "grad_norm": 0.7554013281643714, "learning_rate": 4.852521087136296e-06, "loss": 0.7855, "num_tokens": 191507996.0, "step": 937 }, { "epoch": 2.2440191387559807, "grad_norm": 0.601895734080878, "learning_rate": 4.843891794590097e-06, "loss": 0.5065, "num_tokens": 191719942.0, "step": 938 }, { "epoch": 2.2464114832535884, "grad_norm": 0.7135527988113347, "learning_rate": 4.835264966431691e-06, "loss": 0.5396, "num_tokens": 191853658.0, "step": 939 }, { "epoch": 2.248803827751196, "grad_norm": 0.631633953168551, "learning_rate": 4.8266406350640406e-06, "loss": 0.7803, "num_tokens": 192082886.0, "step": 940 }, { "epoch": 2.251196172248804, "grad_norm": 0.528159042893983, "learning_rate": 4.818018832880731e-06, "loss": 0.5049, "num_tokens": 192286179.0, "step": 941 }, { "epoch": 2.2535885167464116, "grad_norm": 0.5692998771536943, "learning_rate": 4.80939959226585e-06, "loss": 0.5496, "num_tokens": 192482338.0, "step": 942 }, { "epoch": 2.2559808612440193, "grad_norm": 0.5998091248077541, "learning_rate": 4.800782945593856e-06, "loss": 0.608, "num_tokens": 192680750.0, "step": 943 }, { "epoch": 2.258373205741627, "grad_norm": 0.7033907861591749, "learning_rate": 4.792168925229476e-06, "loss": 0.6006, "num_tokens": 192864346.0, "step": 944 }, { "epoch": 2.2607655502392343, "grad_norm": 0.7629806480330503, "learning_rate": 4.783557563527561e-06, "loss": 0.626, "num_tokens": 193033493.0, "step": 945 }, { "epoch": 2.263157894736842, "grad_norm": 0.5707192285167687, "learning_rate": 4.7749488928329855e-06, "loss": 0.6194, "num_tokens": 193294902.0, "step": 946 }, { "epoch": 2.2655502392344498, "grad_norm": 0.5905874972965276, "learning_rate": 4.766342945480508e-06, "loss": 0.5512, "num_tokens": 193490577.0, "step": 947 }, { "epoch": 2.2679425837320575, "grad_norm": 0.5177620597135223, "learning_rate": 4.757739753794664e-06, "loss": 0.6391, "num_tokens": 193791362.0, "step": 948 }, { "epoch": 2.270334928229665, "grad_norm": 0.5934058427993907, "learning_rate": 4.749139350089639e-06, "loss": 0.517, "num_tokens": 193983233.0, "step": 949 }, { "epoch": 2.2727272727272725, "grad_norm": 0.5978699434451642, "learning_rate": 4.740541766669136e-06, "loss": 0.5101, "num_tokens": 194192656.0, "step": 950 }, { "epoch": 2.27511961722488, "grad_norm": 0.6275089312864266, "learning_rate": 4.731947035826283e-06, "loss": 0.4238, "num_tokens": 194381428.0, "step": 951 }, { "epoch": 2.277511961722488, "grad_norm": 0.6170245058787546, "learning_rate": 4.723355189843476e-06, "loss": 0.5306, "num_tokens": 194561183.0, "step": 952 }, { "epoch": 2.2799043062200957, "grad_norm": 0.697796398431545, "learning_rate": 4.7147662609922866e-06, "loss": 0.5723, "num_tokens": 194774084.0, "step": 953 }, { "epoch": 2.2822966507177034, "grad_norm": 0.5708474589048195, "learning_rate": 4.706180281533322e-06, "loss": 0.5441, "num_tokens": 195022872.0, "step": 954 }, { "epoch": 2.284688995215311, "grad_norm": 0.6221089024679085, "learning_rate": 4.697597283716119e-06, "loss": 0.4427, "num_tokens": 195188063.0, "step": 955 }, { "epoch": 2.287081339712919, "grad_norm": 0.6966142472427688, "learning_rate": 4.689017299779007e-06, "loss": 0.6812, "num_tokens": 195357841.0, "step": 956 }, { "epoch": 2.2894736842105265, "grad_norm": 0.5858764677824216, "learning_rate": 4.6804403619490025e-06, "loss": 0.6256, "num_tokens": 195574109.0, "step": 957 }, { "epoch": 2.291866028708134, "grad_norm": 0.6720579646318956, "learning_rate": 4.671866502441674e-06, "loss": 0.6995, "num_tokens": 195763960.0, "step": 958 }, { "epoch": 2.2942583732057416, "grad_norm": 0.6739432961084355, "learning_rate": 4.663295753461035e-06, "loss": 0.7389, "num_tokens": 195972654.0, "step": 959 }, { "epoch": 2.2966507177033493, "grad_norm": 0.5337406968936691, "learning_rate": 4.654728147199405e-06, "loss": 0.5339, "num_tokens": 196221210.0, "step": 960 }, { "epoch": 2.299043062200957, "grad_norm": 0.7543750707624769, "learning_rate": 4.646163715837313e-06, "loss": 0.604, "num_tokens": 196356480.0, "step": 961 }, { "epoch": 2.3014354066985647, "grad_norm": 0.64221875053758, "learning_rate": 4.637602491543349e-06, "loss": 0.7221, "num_tokens": 196586279.0, "step": 962 }, { "epoch": 2.303827751196172, "grad_norm": 0.6051407759973162, "learning_rate": 4.629044506474069e-06, "loss": 0.4194, "num_tokens": 196759153.0, "step": 963 }, { "epoch": 2.3062200956937797, "grad_norm": 0.6673074288198911, "learning_rate": 4.620489792773854e-06, "loss": 0.5546, "num_tokens": 196928715.0, "step": 964 }, { "epoch": 2.3086124401913874, "grad_norm": 0.6667667122541079, "learning_rate": 4.611938382574799e-06, "loss": 0.6447, "num_tokens": 197132410.0, "step": 965 }, { "epoch": 2.311004784688995, "grad_norm": 0.7382401821510317, "learning_rate": 4.603390307996592e-06, "loss": 0.5502, "num_tokens": 197303154.0, "step": 966 }, { "epoch": 2.313397129186603, "grad_norm": 0.5355460347305746, "learning_rate": 4.5948456011463946e-06, "loss": 0.5119, "num_tokens": 197538989.0, "step": 967 }, { "epoch": 2.3157894736842106, "grad_norm": 0.588745542689267, "learning_rate": 4.586304294118718e-06, "loss": 0.5423, "num_tokens": 197766883.0, "step": 968 }, { "epoch": 2.3181818181818183, "grad_norm": 0.5842131197012101, "learning_rate": 4.577766418995296e-06, "loss": 0.603, "num_tokens": 197987061.0, "step": 969 }, { "epoch": 2.320574162679426, "grad_norm": 0.6979657515490693, "learning_rate": 4.569232007844985e-06, "loss": 0.5722, "num_tokens": 198185962.0, "step": 970 }, { "epoch": 2.3229665071770333, "grad_norm": 0.5616019882938909, "learning_rate": 4.560701092723617e-06, "loss": 0.7079, "num_tokens": 198419002.0, "step": 971 }, { "epoch": 2.325358851674641, "grad_norm": 0.5897872136033281, "learning_rate": 4.552173705673905e-06, "loss": 0.6447, "num_tokens": 198618480.0, "step": 972 }, { "epoch": 2.327751196172249, "grad_norm": 0.6208653472110455, "learning_rate": 4.5436498787253015e-06, "loss": 0.4777, "num_tokens": 198776227.0, "step": 973 }, { "epoch": 2.3301435406698565, "grad_norm": 0.4775412883065485, "learning_rate": 4.535129643893891e-06, "loss": 0.4726, "num_tokens": 199043473.0, "step": 974 }, { "epoch": 2.3325358851674642, "grad_norm": 0.507525873030941, "learning_rate": 4.526613033182261e-06, "loss": 0.5436, "num_tokens": 199306828.0, "step": 975 }, { "epoch": 2.334928229665072, "grad_norm": 0.6560516411422593, "learning_rate": 4.518100078579397e-06, "loss": 0.5843, "num_tokens": 199503531.0, "step": 976 }, { "epoch": 2.3373205741626792, "grad_norm": 0.6356188122044742, "learning_rate": 4.50959081206054e-06, "loss": 0.5826, "num_tokens": 199688436.0, "step": 977 }, { "epoch": 2.339712918660287, "grad_norm": 0.6420147098738003, "learning_rate": 4.501085265587086e-06, "loss": 0.5274, "num_tokens": 199876619.0, "step": 978 }, { "epoch": 2.3421052631578947, "grad_norm": 0.6400044423826036, "learning_rate": 4.492583471106454e-06, "loss": 0.5228, "num_tokens": 200078177.0, "step": 979 }, { "epoch": 2.3444976076555024, "grad_norm": 0.6337573455616772, "learning_rate": 4.484085460551974e-06, "loss": 0.4544, "num_tokens": 200277105.0, "step": 980 }, { "epoch": 2.34688995215311, "grad_norm": 0.5146634318287221, "learning_rate": 4.4755912658427605e-06, "loss": 0.6146, "num_tokens": 200556359.0, "step": 981 }, { "epoch": 2.349282296650718, "grad_norm": 0.557609526377004, "learning_rate": 4.467100918883594e-06, "loss": 0.5273, "num_tokens": 200789036.0, "step": 982 }, { "epoch": 2.3516746411483256, "grad_norm": 0.6103882750369611, "learning_rate": 4.458614451564809e-06, "loss": 0.5557, "num_tokens": 201016291.0, "step": 983 }, { "epoch": 2.354066985645933, "grad_norm": 0.5615569932242273, "learning_rate": 4.4501318957621595e-06, "loss": 0.531, "num_tokens": 201206656.0, "step": 984 }, { "epoch": 2.3564593301435406, "grad_norm": 0.5897719971401356, "learning_rate": 4.441653283336715e-06, "loss": 0.5324, "num_tokens": 201412902.0, "step": 985 }, { "epoch": 2.3588516746411483, "grad_norm": 0.6345523830064598, "learning_rate": 4.433178646134727e-06, "loss": 0.5531, "num_tokens": 201669487.0, "step": 986 }, { "epoch": 2.361244019138756, "grad_norm": 0.45138951192895616, "learning_rate": 4.424708015987524e-06, "loss": 0.5122, "num_tokens": 201917216.0, "step": 987 }, { "epoch": 2.3636363636363638, "grad_norm": 0.5682611787081613, "learning_rate": 4.416241424711372e-06, "loss": 0.4912, "num_tokens": 202090530.0, "step": 988 }, { "epoch": 2.3660287081339715, "grad_norm": 0.5261641148966557, "learning_rate": 4.4077789041073785e-06, "loss": 0.4621, "num_tokens": 202317413.0, "step": 989 }, { "epoch": 2.3684210526315788, "grad_norm": 0.5631165113167252, "learning_rate": 4.399320485961354e-06, "loss": 0.5311, "num_tokens": 202518648.0, "step": 990 }, { "epoch": 2.3708133971291865, "grad_norm": 0.4967545289756969, "learning_rate": 4.390866202043702e-06, "loss": 0.4564, "num_tokens": 202760261.0, "step": 991 }, { "epoch": 2.373205741626794, "grad_norm": 0.7465162438884122, "learning_rate": 4.382416084109298e-06, "loss": 0.5281, "num_tokens": 202889486.0, "step": 992 }, { "epoch": 2.375598086124402, "grad_norm": 0.5801443612563177, "learning_rate": 4.37397016389737e-06, "loss": 0.6409, "num_tokens": 203141411.0, "step": 993 }, { "epoch": 2.3779904306220097, "grad_norm": 0.5997099906097679, "learning_rate": 4.365528473131378e-06, "loss": 0.5756, "num_tokens": 203352106.0, "step": 994 }, { "epoch": 2.3803827751196174, "grad_norm": 0.6006398336798263, "learning_rate": 4.3570910435188915e-06, "loss": 0.4975, "num_tokens": 203544738.0, "step": 995 }, { "epoch": 2.382775119617225, "grad_norm": 0.616118763392102, "learning_rate": 4.348657906751488e-06, "loss": 0.6145, "num_tokens": 203738759.0, "step": 996 }, { "epoch": 2.3851674641148324, "grad_norm": 0.6640491937457687, "learning_rate": 4.340229094504607e-06, "loss": 0.5607, "num_tokens": 203898058.0, "step": 997 }, { "epoch": 2.38755980861244, "grad_norm": 0.6225754729256734, "learning_rate": 4.3318046384374526e-06, "loss": 0.554, "num_tokens": 204115504.0, "step": 998 }, { "epoch": 2.389952153110048, "grad_norm": 0.6377150799357686, "learning_rate": 4.3233845701928615e-06, "loss": 0.5783, "num_tokens": 204332997.0, "step": 999 }, { "epoch": 2.3923444976076556, "grad_norm": 0.5853455958148285, "learning_rate": 4.314968921397197e-06, "loss": 0.4981, "num_tokens": 204535990.0, "step": 1000 }, { "epoch": 2.3947368421052633, "grad_norm": 0.6248730722883572, "learning_rate": 4.306557723660212e-06, "loss": 0.5558, "num_tokens": 204744939.0, "step": 1001 }, { "epoch": 2.397129186602871, "grad_norm": 0.5151699139022012, "learning_rate": 4.298151008574952e-06, "loss": 0.3998, "num_tokens": 204938198.0, "step": 1002 }, { "epoch": 2.3995215311004783, "grad_norm": 0.6334922426435169, "learning_rate": 4.289748807717618e-06, "loss": 0.6982, "num_tokens": 205170249.0, "step": 1003 }, { "epoch": 2.401913875598086, "grad_norm": 0.7135525185350413, "learning_rate": 4.2813511526474585e-06, "loss": 0.5034, "num_tokens": 205304624.0, "step": 1004 }, { "epoch": 2.4043062200956937, "grad_norm": 0.5902081724362165, "learning_rate": 4.2729580749066435e-06, "loss": 0.6034, "num_tokens": 205516948.0, "step": 1005 }, { "epoch": 2.4066985645933014, "grad_norm": 0.6436090687074414, "learning_rate": 4.264569606020159e-06, "loss": 0.6051, "num_tokens": 205713236.0, "step": 1006 }, { "epoch": 2.409090909090909, "grad_norm": 0.5528964704740937, "learning_rate": 4.256185777495668e-06, "loss": 0.457, "num_tokens": 205918094.0, "step": 1007 }, { "epoch": 2.411483253588517, "grad_norm": 0.689153042053297, "learning_rate": 4.247806620823416e-06, "loss": 0.6527, "num_tokens": 206126263.0, "step": 1008 }, { "epoch": 2.4138755980861246, "grad_norm": 0.6370805657969235, "learning_rate": 4.239432167476091e-06, "loss": 0.7036, "num_tokens": 206339096.0, "step": 1009 }, { "epoch": 2.416267942583732, "grad_norm": 0.6004855383256495, "learning_rate": 4.231062448908721e-06, "loss": 0.587, "num_tokens": 206552352.0, "step": 1010 }, { "epoch": 2.4186602870813396, "grad_norm": 0.6173172520580517, "learning_rate": 4.222697496558547e-06, "loss": 0.6095, "num_tokens": 206777192.0, "step": 1011 }, { "epoch": 2.4210526315789473, "grad_norm": 0.6835207209010378, "learning_rate": 4.214337341844906e-06, "loss": 0.5918, "num_tokens": 206978187.0, "step": 1012 }, { "epoch": 2.423444976076555, "grad_norm": 0.6131323747610118, "learning_rate": 4.205982016169122e-06, "loss": 0.6657, "num_tokens": 207195050.0, "step": 1013 }, { "epoch": 2.425837320574163, "grad_norm": 0.5850555875038469, "learning_rate": 4.197631550914372e-06, "loss": 0.57, "num_tokens": 207406444.0, "step": 1014 }, { "epoch": 2.4282296650717705, "grad_norm": 0.4939100102339527, "learning_rate": 4.189285977445584e-06, "loss": 0.3718, "num_tokens": 207620232.0, "step": 1015 }, { "epoch": 2.430622009569378, "grad_norm": 0.653808161167263, "learning_rate": 4.180945327109308e-06, "loss": 0.5829, "num_tokens": 207826050.0, "step": 1016 }, { "epoch": 2.4330143540669855, "grad_norm": 0.43981969392432796, "learning_rate": 4.172609631233603e-06, "loss": 0.3208, "num_tokens": 208017777.0, "step": 1017 }, { "epoch": 2.4354066985645932, "grad_norm": 0.6261020734046672, "learning_rate": 4.164278921127919e-06, "loss": 0.5125, "num_tokens": 208207732.0, "step": 1018 }, { "epoch": 2.437799043062201, "grad_norm": 0.6317188642592404, "learning_rate": 4.15595322808298e-06, "loss": 0.4381, "num_tokens": 208386520.0, "step": 1019 }, { "epoch": 2.4401913875598087, "grad_norm": 0.6202931009885635, "learning_rate": 4.147632583370664e-06, "loss": 0.6438, "num_tokens": 208641359.0, "step": 1020 }, { "epoch": 2.4425837320574164, "grad_norm": 0.6022603778669238, "learning_rate": 4.1393170182438905e-06, "loss": 0.5467, "num_tokens": 208841020.0, "step": 1021 }, { "epoch": 2.444976076555024, "grad_norm": 0.6327040046052304, "learning_rate": 4.131006563936492e-06, "loss": 0.708, "num_tokens": 209071062.0, "step": 1022 }, { "epoch": 2.4473684210526314, "grad_norm": 0.5266051069162052, "learning_rate": 4.122701251663116e-06, "loss": 0.5639, "num_tokens": 209307718.0, "step": 1023 }, { "epoch": 2.449760765550239, "grad_norm": 0.5136191658605107, "learning_rate": 4.114401112619084e-06, "loss": 0.5557, "num_tokens": 209584037.0, "step": 1024 }, { "epoch": 2.452153110047847, "grad_norm": 0.6058510179396281, "learning_rate": 4.106106177980297e-06, "loss": 0.6519, "num_tokens": 209800026.0, "step": 1025 }, { "epoch": 2.4545454545454546, "grad_norm": 0.7748377168030689, "learning_rate": 4.0978164789031e-06, "loss": 0.6554, "num_tokens": 209945028.0, "step": 1026 }, { "epoch": 2.4569377990430623, "grad_norm": 0.539487028193741, "learning_rate": 4.089532046524176e-06, "loss": 0.7053, "num_tokens": 210227001.0, "step": 1027 }, { "epoch": 2.45933014354067, "grad_norm": 0.5917748409979356, "learning_rate": 4.081252911960428e-06, "loss": 0.5759, "num_tokens": 210417942.0, "step": 1028 }, { "epoch": 2.4617224880382773, "grad_norm": 0.5992588126824665, "learning_rate": 4.072979106308858e-06, "loss": 0.6159, "num_tokens": 210628053.0, "step": 1029 }, { "epoch": 2.464114832535885, "grad_norm": 0.6651607322996815, "learning_rate": 4.064710660646448e-06, "loss": 0.6818, "num_tokens": 210829550.0, "step": 1030 }, { "epoch": 2.4665071770334928, "grad_norm": 0.6540903157509698, "learning_rate": 4.056447606030056e-06, "loss": 0.61, "num_tokens": 211056840.0, "step": 1031 }, { "epoch": 2.4688995215311005, "grad_norm": 0.7991178220481605, "learning_rate": 4.048189973496287e-06, "loss": 0.6707, "num_tokens": 211211372.0, "step": 1032 }, { "epoch": 2.471291866028708, "grad_norm": 0.5935027675340312, "learning_rate": 4.039937794061376e-06, "loss": 0.621, "num_tokens": 211414390.0, "step": 1033 }, { "epoch": 2.473684210526316, "grad_norm": 0.6755369914687083, "learning_rate": 4.031691098721083e-06, "loss": 0.6413, "num_tokens": 211588059.0, "step": 1034 }, { "epoch": 2.4760765550239237, "grad_norm": 0.6526118866886425, "learning_rate": 4.023449918450565e-06, "loss": 0.6944, "num_tokens": 211807518.0, "step": 1035 }, { "epoch": 2.478468899521531, "grad_norm": 0.564198257094253, "learning_rate": 4.015214284204265e-06, "loss": 0.6394, "num_tokens": 212025980.0, "step": 1036 }, { "epoch": 2.4808612440191387, "grad_norm": 0.627883039676126, "learning_rate": 4.0069842269157935e-06, "loss": 0.5591, "num_tokens": 212199416.0, "step": 1037 }, { "epoch": 2.4832535885167464, "grad_norm": 0.6055641095041324, "learning_rate": 3.998759777497816e-06, "loss": 0.5301, "num_tokens": 212379337.0, "step": 1038 }, { "epoch": 2.485645933014354, "grad_norm": 0.7117718176806141, "learning_rate": 3.9905409668419346e-06, "loss": 0.6314, "num_tokens": 212556989.0, "step": 1039 }, { "epoch": 2.488038277511962, "grad_norm": 0.7179326451594609, "learning_rate": 3.982327825818568e-06, "loss": 0.6056, "num_tokens": 212743946.0, "step": 1040 }, { "epoch": 2.4904306220095696, "grad_norm": 0.6535767607430862, "learning_rate": 3.974120385276844e-06, "loss": 0.7189, "num_tokens": 212935011.0, "step": 1041 }, { "epoch": 2.492822966507177, "grad_norm": 0.6050383476235132, "learning_rate": 3.965918676044476e-06, "loss": 0.5952, "num_tokens": 213123277.0, "step": 1042 }, { "epoch": 2.4952153110047846, "grad_norm": 0.6607651606385861, "learning_rate": 3.95772272892765e-06, "loss": 0.6589, "num_tokens": 213334844.0, "step": 1043 }, { "epoch": 2.4976076555023923, "grad_norm": 0.6241385293512635, "learning_rate": 3.94953257471091e-06, "loss": 0.654, "num_tokens": 213535142.0, "step": 1044 }, { "epoch": 2.5, "grad_norm": 0.6213474009144981, "learning_rate": 3.941348244157045e-06, "loss": 0.6657, "num_tokens": 213749790.0, "step": 1045 }, { "epoch": 2.5023923444976077, "grad_norm": 0.5820164738008969, "learning_rate": 3.933169768006962e-06, "loss": 0.6203, "num_tokens": 213993242.0, "step": 1046 }, { "epoch": 2.5047846889952154, "grad_norm": 0.6881833790841918, "learning_rate": 3.924997176979589e-06, "loss": 0.7844, "num_tokens": 214184984.0, "step": 1047 }, { "epoch": 2.507177033492823, "grad_norm": 0.6146739381057159, "learning_rate": 3.916830501771736e-06, "loss": 0.4875, "num_tokens": 214360349.0, "step": 1048 }, { "epoch": 2.509569377990431, "grad_norm": 0.6025043525167425, "learning_rate": 3.908669773058005e-06, "loss": 0.6133, "num_tokens": 214575876.0, "step": 1049 }, { "epoch": 2.511961722488038, "grad_norm": 0.693733393553052, "learning_rate": 3.900515021490654e-06, "loss": 0.7332, "num_tokens": 214788140.0, "step": 1050 }, { "epoch": 2.514354066985646, "grad_norm": 0.5594177626974491, "learning_rate": 3.8923662776994965e-06, "loss": 0.433, "num_tokens": 214982831.0, "step": 1051 }, { "epoch": 2.5167464114832536, "grad_norm": 0.5817776587828116, "learning_rate": 3.884223572291778e-06, "loss": 0.5643, "num_tokens": 215190443.0, "step": 1052 }, { "epoch": 2.5191387559808613, "grad_norm": 0.6656717449668278, "learning_rate": 3.876086935852059e-06, "loss": 0.687, "num_tokens": 215387454.0, "step": 1053 }, { "epoch": 2.5215311004784686, "grad_norm": 0.46271635043906406, "learning_rate": 3.867956398942113e-06, "loss": 0.3984, "num_tokens": 215598001.0, "step": 1054 }, { "epoch": 2.5239234449760763, "grad_norm": 0.6034680377399717, "learning_rate": 3.859831992100798e-06, "loss": 0.7469, "num_tokens": 215814084.0, "step": 1055 }, { "epoch": 2.526315789473684, "grad_norm": 0.5789325890650581, "learning_rate": 3.8517137458439465e-06, "loss": 0.5048, "num_tokens": 215979272.0, "step": 1056 }, { "epoch": 2.528708133971292, "grad_norm": 0.4948180098998025, "learning_rate": 3.843601690664253e-06, "loss": 0.4004, "num_tokens": 216194524.0, "step": 1057 }, { "epoch": 2.5311004784688995, "grad_norm": 0.5754107413539873, "learning_rate": 3.83549585703116e-06, "loss": 0.4014, "num_tokens": 216365812.0, "step": 1058 }, { "epoch": 2.5334928229665072, "grad_norm": 0.6750744543039905, "learning_rate": 3.827396275390736e-06, "loss": 0.6902, "num_tokens": 216545039.0, "step": 1059 }, { "epoch": 2.535885167464115, "grad_norm": 0.6577431658509755, "learning_rate": 3.819302976165573e-06, "loss": 0.664, "num_tokens": 216746551.0, "step": 1060 }, { "epoch": 2.5382775119617227, "grad_norm": 0.627785069016739, "learning_rate": 3.8112159897546597e-06, "loss": 0.6174, "num_tokens": 216955697.0, "step": 1061 }, { "epoch": 2.5406698564593304, "grad_norm": 0.5995174609182009, "learning_rate": 3.8031353465332786e-06, "loss": 0.7305, "num_tokens": 217192047.0, "step": 1062 }, { "epoch": 2.5430622009569377, "grad_norm": 0.5695080880257041, "learning_rate": 3.795061076852882e-06, "loss": 0.6676, "num_tokens": 217418481.0, "step": 1063 }, { "epoch": 2.5454545454545454, "grad_norm": 0.6765456404371507, "learning_rate": 3.786993211040988e-06, "loss": 0.5479, "num_tokens": 217582242.0, "step": 1064 }, { "epoch": 2.547846889952153, "grad_norm": 0.5558441353632505, "learning_rate": 3.7789317794010562e-06, "loss": 0.4938, "num_tokens": 217821253.0, "step": 1065 }, { "epoch": 2.550239234449761, "grad_norm": 0.5652778076960531, "learning_rate": 3.770876812212384e-06, "loss": 0.4471, "num_tokens": 218028715.0, "step": 1066 }, { "epoch": 2.5526315789473686, "grad_norm": 0.7041275199284726, "learning_rate": 3.7628283397299825e-06, "loss": 0.4983, "num_tokens": 218188513.0, "step": 1067 }, { "epoch": 2.555023923444976, "grad_norm": 0.6547316059280318, "learning_rate": 3.7547863921844717e-06, "loss": 0.4126, "num_tokens": 218376107.0, "step": 1068 }, { "epoch": 2.5574162679425836, "grad_norm": 0.6336547930033282, "learning_rate": 3.7467509997819627e-06, "loss": 0.6674, "num_tokens": 218570758.0, "step": 1069 }, { "epoch": 2.5598086124401913, "grad_norm": 0.5478760136946561, "learning_rate": 3.738722192703946e-06, "loss": 0.4566, "num_tokens": 218786920.0, "step": 1070 }, { "epoch": 2.562200956937799, "grad_norm": 0.7139683911439062, "learning_rate": 3.730700001107177e-06, "loss": 0.7549, "num_tokens": 218985901.0, "step": 1071 }, { "epoch": 2.5645933014354068, "grad_norm": 0.6975615168028549, "learning_rate": 3.7226844551235575e-06, "loss": 0.6629, "num_tokens": 219159140.0, "step": 1072 }, { "epoch": 2.5669856459330145, "grad_norm": 0.6668143922240283, "learning_rate": 3.714675584860038e-06, "loss": 0.6375, "num_tokens": 219371386.0, "step": 1073 }, { "epoch": 2.569377990430622, "grad_norm": 0.5982014271194838, "learning_rate": 3.7066734203984855e-06, "loss": 0.3727, "num_tokens": 219552579.0, "step": 1074 }, { "epoch": 2.57177033492823, "grad_norm": 0.6955525411313587, "learning_rate": 3.698677991795585e-06, "loss": 0.7192, "num_tokens": 219775671.0, "step": 1075 }, { "epoch": 2.574162679425837, "grad_norm": 0.5369077452815465, "learning_rate": 3.6906893290827185e-06, "loss": 0.4529, "num_tokens": 219965861.0, "step": 1076 }, { "epoch": 2.576555023923445, "grad_norm": 0.6721059340974544, "learning_rate": 3.6827074622658566e-06, "loss": 0.5365, "num_tokens": 220165003.0, "step": 1077 }, { "epoch": 2.5789473684210527, "grad_norm": 0.7135684792977236, "learning_rate": 3.6747324213254397e-06, "loss": 0.5682, "num_tokens": 220392321.0, "step": 1078 }, { "epoch": 2.5813397129186604, "grad_norm": 0.6494117986406011, "learning_rate": 3.6667642362162793e-06, "loss": 0.5806, "num_tokens": 220598874.0, "step": 1079 }, { "epoch": 2.583732057416268, "grad_norm": 0.6222063527010798, "learning_rate": 3.6588029368674237e-06, "loss": 0.4917, "num_tokens": 220776263.0, "step": 1080 }, { "epoch": 2.5861244019138754, "grad_norm": 0.5934688030422492, "learning_rate": 3.650848553182067e-06, "loss": 0.5072, "num_tokens": 220960288.0, "step": 1081 }, { "epoch": 2.588516746411483, "grad_norm": 0.5684846998173526, "learning_rate": 3.6429011150374217e-06, "loss": 0.4913, "num_tokens": 221184618.0, "step": 1082 }, { "epoch": 2.590909090909091, "grad_norm": 0.549834773214977, "learning_rate": 3.634960652284619e-06, "loss": 0.3509, "num_tokens": 221386351.0, "step": 1083 }, { "epoch": 2.5933014354066986, "grad_norm": 0.6483735196222553, "learning_rate": 3.6270271947485835e-06, "loss": 0.5319, "num_tokens": 221579057.0, "step": 1084 }, { "epoch": 2.5956937799043063, "grad_norm": 0.5153676973173746, "learning_rate": 3.6191007722279315e-06, "loss": 0.4309, "num_tokens": 221781696.0, "step": 1085 }, { "epoch": 2.598086124401914, "grad_norm": 0.5489517271767583, "learning_rate": 3.611181414494852e-06, "loss": 0.4592, "num_tokens": 222005363.0, "step": 1086 }, { "epoch": 2.6004784688995217, "grad_norm": 0.49214928466990493, "learning_rate": 3.6032691512950024e-06, "loss": 0.4458, "num_tokens": 222259186.0, "step": 1087 }, { "epoch": 2.6028708133971294, "grad_norm": 0.6196094337292347, "learning_rate": 3.595364012347391e-06, "loss": 0.663, "num_tokens": 222513076.0, "step": 1088 }, { "epoch": 2.6052631578947367, "grad_norm": 0.5427019877705024, "learning_rate": 3.587466027344264e-06, "loss": 0.5428, "num_tokens": 222750527.0, "step": 1089 }, { "epoch": 2.6076555023923444, "grad_norm": 0.5771856967931067, "learning_rate": 3.579575225951003e-06, "loss": 0.5553, "num_tokens": 223006934.0, "step": 1090 }, { "epoch": 2.610047846889952, "grad_norm": 0.6034575663203413, "learning_rate": 3.5716916378059973e-06, "loss": 0.7012, "num_tokens": 223224028.0, "step": 1091 }, { "epoch": 2.61244019138756, "grad_norm": 0.6660365069148427, "learning_rate": 3.5638152925205573e-06, "loss": 0.7283, "num_tokens": 223426112.0, "step": 1092 }, { "epoch": 2.6148325358851676, "grad_norm": 0.643954882493234, "learning_rate": 3.555946219678772e-06, "loss": 0.527, "num_tokens": 223597677.0, "step": 1093 }, { "epoch": 2.617224880382775, "grad_norm": 0.5294855404946633, "learning_rate": 3.5480844488374313e-06, "loss": 0.3729, "num_tokens": 223791816.0, "step": 1094 }, { "epoch": 2.6196172248803826, "grad_norm": 0.6803208173614663, "learning_rate": 3.5402300095258833e-06, "loss": 0.6289, "num_tokens": 224011960.0, "step": 1095 }, { "epoch": 2.6220095693779903, "grad_norm": 0.5981932778163507, "learning_rate": 3.5323829312459524e-06, "loss": 0.5048, "num_tokens": 224234695.0, "step": 1096 }, { "epoch": 2.624401913875598, "grad_norm": 0.6101452118273694, "learning_rate": 3.524543243471801e-06, "loss": 0.4449, "num_tokens": 224410706.0, "step": 1097 }, { "epoch": 2.626794258373206, "grad_norm": 0.5389924597744387, "learning_rate": 3.516710975649841e-06, "loss": 0.5182, "num_tokens": 224593832.0, "step": 1098 }, { "epoch": 2.6291866028708135, "grad_norm": 0.6975855703487497, "learning_rate": 3.508886157198611e-06, "loss": 0.6577, "num_tokens": 224798557.0, "step": 1099 }, { "epoch": 2.6315789473684212, "grad_norm": 0.6351790408256973, "learning_rate": 3.5010688175086718e-06, "loss": 0.7309, "num_tokens": 225033322.0, "step": 1100 }, { "epoch": 2.633971291866029, "grad_norm": 0.6330427861367581, "learning_rate": 3.4932589859424913e-06, "loss": 0.7808, "num_tokens": 225255549.0, "step": 1101 }, { "epoch": 2.6363636363636362, "grad_norm": 0.6801017341307927, "learning_rate": 3.485456691834337e-06, "loss": 0.6036, "num_tokens": 225432118.0, "step": 1102 }, { "epoch": 2.638755980861244, "grad_norm": 0.6658896469366345, "learning_rate": 3.4776619644901655e-06, "loss": 0.6647, "num_tokens": 225616023.0, "step": 1103 }, { "epoch": 2.6411483253588517, "grad_norm": 0.4996367213764907, "learning_rate": 3.469874833187508e-06, "loss": 0.5414, "num_tokens": 225893120.0, "step": 1104 }, { "epoch": 2.6435406698564594, "grad_norm": 0.6616958516454566, "learning_rate": 3.4620953271753756e-06, "loss": 0.6326, "num_tokens": 226088868.0, "step": 1105 }, { "epoch": 2.645933014354067, "grad_norm": 0.6362043584410174, "learning_rate": 3.454323475674123e-06, "loss": 0.671, "num_tokens": 226310547.0, "step": 1106 }, { "epoch": 2.6483253588516744, "grad_norm": 0.47501498356513666, "learning_rate": 3.4465593078753688e-06, "loss": 0.4813, "num_tokens": 226550492.0, "step": 1107 }, { "epoch": 2.650717703349282, "grad_norm": 0.5031158510563932, "learning_rate": 3.438802852941858e-06, "loss": 0.5583, "num_tokens": 226832459.0, "step": 1108 }, { "epoch": 2.65311004784689, "grad_norm": 0.5435062765868961, "learning_rate": 3.431054140007377e-06, "loss": 0.5455, "num_tokens": 227038778.0, "step": 1109 }, { "epoch": 2.6555023923444976, "grad_norm": 0.5388254458858796, "learning_rate": 3.4233131981766243e-06, "loss": 0.5894, "num_tokens": 227249531.0, "step": 1110 }, { "epoch": 2.6578947368421053, "grad_norm": 0.5922470838904194, "learning_rate": 3.4155800565251135e-06, "loss": 0.4353, "num_tokens": 227462703.0, "step": 1111 }, { "epoch": 2.660287081339713, "grad_norm": 0.6846109348473851, "learning_rate": 3.40785474409906e-06, "loss": 0.5729, "num_tokens": 227635078.0, "step": 1112 }, { "epoch": 2.6626794258373208, "grad_norm": 0.4764323015116, "learning_rate": 3.4001372899152703e-06, "loss": 0.4876, "num_tokens": 227906297.0, "step": 1113 }, { "epoch": 2.6650717703349285, "grad_norm": 0.5853206610805425, "learning_rate": 3.392427722961037e-06, "loss": 0.5074, "num_tokens": 228140226.0, "step": 1114 }, { "epoch": 2.6674641148325358, "grad_norm": 0.6793627537042475, "learning_rate": 3.384726072194026e-06, "loss": 0.7223, "num_tokens": 228357361.0, "step": 1115 }, { "epoch": 2.6698564593301435, "grad_norm": 0.5463734944701496, "learning_rate": 3.3770323665421713e-06, "loss": 0.6404, "num_tokens": 228594109.0, "step": 1116 }, { "epoch": 2.672248803827751, "grad_norm": 0.6738633775521184, "learning_rate": 3.3693466349035627e-06, "loss": 0.5112, "num_tokens": 228755387.0, "step": 1117 }, { "epoch": 2.674641148325359, "grad_norm": 0.6531030945413439, "learning_rate": 3.361668906146341e-06, "loss": 0.6901, "num_tokens": 228956043.0, "step": 1118 }, { "epoch": 2.6770334928229667, "grad_norm": 0.6923041244581303, "learning_rate": 3.353999209108583e-06, "loss": 0.5057, "num_tokens": 229125917.0, "step": 1119 }, { "epoch": 2.679425837320574, "grad_norm": 0.6775907584116359, "learning_rate": 3.3463375725982056e-06, "loss": 0.5541, "num_tokens": 229281789.0, "step": 1120 }, { "epoch": 2.6818181818181817, "grad_norm": 0.6173853747715191, "learning_rate": 3.3386840253928398e-06, "loss": 0.4948, "num_tokens": 229477784.0, "step": 1121 }, { "epoch": 2.6842105263157894, "grad_norm": 0.6725183241705002, "learning_rate": 3.3310385962397453e-06, "loss": 0.7163, "num_tokens": 229673326.0, "step": 1122 }, { "epoch": 2.686602870813397, "grad_norm": 0.6499836969575646, "learning_rate": 3.3234013138556746e-06, "loss": 0.5034, "num_tokens": 229824970.0, "step": 1123 }, { "epoch": 2.688995215311005, "grad_norm": 0.7328340711385909, "learning_rate": 3.315772206926798e-06, "loss": 0.6574, "num_tokens": 229981855.0, "step": 1124 }, { "epoch": 2.6913875598086126, "grad_norm": 0.6549771664676625, "learning_rate": 3.308151304108561e-06, "loss": 0.5855, "num_tokens": 230203169.0, "step": 1125 }, { "epoch": 2.6937799043062203, "grad_norm": 0.7356607114309072, "learning_rate": 3.300538634025604e-06, "loss": 0.6899, "num_tokens": 230361759.0, "step": 1126 }, { "epoch": 2.696172248803828, "grad_norm": 0.6495199581004002, "learning_rate": 3.2929342252716424e-06, "loss": 0.4668, "num_tokens": 230559686.0, "step": 1127 }, { "epoch": 2.6985645933014353, "grad_norm": 0.5919753120066145, "learning_rate": 3.2853381064093614e-06, "loss": 0.7145, "num_tokens": 230825813.0, "step": 1128 }, { "epoch": 2.700956937799043, "grad_norm": 0.6280734437638849, "learning_rate": 3.27775030597031e-06, "loss": 0.6184, "num_tokens": 231018150.0, "step": 1129 }, { "epoch": 2.7033492822966507, "grad_norm": 0.5629872166072551, "learning_rate": 3.2701708524547883e-06, "loss": 0.5601, "num_tokens": 231240035.0, "step": 1130 }, { "epoch": 2.7057416267942584, "grad_norm": 0.56395118769966, "learning_rate": 3.2625997743317508e-06, "loss": 0.5507, "num_tokens": 231467233.0, "step": 1131 }, { "epoch": 2.708133971291866, "grad_norm": 0.5779718638481424, "learning_rate": 3.255037100038689e-06, "loss": 0.6199, "num_tokens": 231674686.0, "step": 1132 }, { "epoch": 2.7105263157894735, "grad_norm": 0.5112483312693801, "learning_rate": 3.247482857981533e-06, "loss": 0.3537, "num_tokens": 231885671.0, "step": 1133 }, { "epoch": 2.712918660287081, "grad_norm": 0.6686531419352844, "learning_rate": 3.2399370765345328e-06, "loss": 0.6546, "num_tokens": 232078705.0, "step": 1134 }, { "epoch": 2.715311004784689, "grad_norm": 0.5485950632598388, "learning_rate": 3.2323997840401733e-06, "loss": 0.4732, "num_tokens": 232297544.0, "step": 1135 }, { "epoch": 2.7177033492822966, "grad_norm": 0.6605191743413368, "learning_rate": 3.2248710088090373e-06, "loss": 0.5781, "num_tokens": 232480907.0, "step": 1136 }, { "epoch": 2.7200956937799043, "grad_norm": 0.6745067883216485, "learning_rate": 3.2173507791197338e-06, "loss": 0.6685, "num_tokens": 232645016.0, "step": 1137 }, { "epoch": 2.722488038277512, "grad_norm": 0.5925129825038307, "learning_rate": 3.2098391232187597e-06, "loss": 0.4972, "num_tokens": 232853233.0, "step": 1138 }, { "epoch": 2.72488038277512, "grad_norm": 0.6467934814235833, "learning_rate": 3.2023360693204166e-06, "loss": 0.6755, "num_tokens": 233072460.0, "step": 1139 }, { "epoch": 2.7272727272727275, "grad_norm": 0.6202378830396424, "learning_rate": 3.1948416456066934e-06, "loss": 0.5755, "num_tokens": 233276759.0, "step": 1140 }, { "epoch": 2.729665071770335, "grad_norm": 0.6243928806510726, "learning_rate": 3.187355880227163e-06, "loss": 0.6521, "num_tokens": 233492573.0, "step": 1141 }, { "epoch": 2.7320574162679425, "grad_norm": 0.6702870073976731, "learning_rate": 3.179878801298879e-06, "loss": 0.5395, "num_tokens": 233720969.0, "step": 1142 }, { "epoch": 2.7344497607655502, "grad_norm": 0.7130159451718587, "learning_rate": 3.172410436906266e-06, "loss": 0.5496, "num_tokens": 233895280.0, "step": 1143 }, { "epoch": 2.736842105263158, "grad_norm": 0.6528019038883675, "learning_rate": 3.1649508151010178e-06, "loss": 0.5347, "num_tokens": 234096477.0, "step": 1144 }, { "epoch": 2.7392344497607657, "grad_norm": 0.6158483720195869, "learning_rate": 3.15749996390199e-06, "loss": 0.4725, "num_tokens": 234270137.0, "step": 1145 }, { "epoch": 2.741626794258373, "grad_norm": 0.5818298212230908, "learning_rate": 3.150057911295094e-06, "loss": 0.5322, "num_tokens": 234494284.0, "step": 1146 }, { "epoch": 2.7440191387559807, "grad_norm": 0.6045623627858717, "learning_rate": 3.142624685233195e-06, "loss": 0.3617, "num_tokens": 234638165.0, "step": 1147 }, { "epoch": 2.7464114832535884, "grad_norm": 0.7193182995338183, "learning_rate": 3.135200313636003e-06, "loss": 0.6732, "num_tokens": 234845386.0, "step": 1148 }, { "epoch": 2.748803827751196, "grad_norm": 0.6826295810846958, "learning_rate": 3.1277848243899735e-06, "loss": 0.6403, "num_tokens": 235068606.0, "step": 1149 }, { "epoch": 2.751196172248804, "grad_norm": 0.667979745521155, "learning_rate": 3.1203782453481957e-06, "loss": 0.5665, "num_tokens": 235285213.0, "step": 1150 }, { "epoch": 2.7535885167464116, "grad_norm": 0.6627017024547665, "learning_rate": 3.112980604330291e-06, "loss": 0.6244, "num_tokens": 235484153.0, "step": 1151 }, { "epoch": 2.7559808612440193, "grad_norm": 0.6369823056389959, "learning_rate": 3.1055919291223118e-06, "loss": 0.5083, "num_tokens": 235660671.0, "step": 1152 }, { "epoch": 2.758373205741627, "grad_norm": 0.6595674482265206, "learning_rate": 3.0982122474766334e-06, "loss": 0.51, "num_tokens": 235845956.0, "step": 1153 }, { "epoch": 2.7607655502392343, "grad_norm": 0.6180105058916081, "learning_rate": 3.09084158711185e-06, "loss": 0.3644, "num_tokens": 236032609.0, "step": 1154 }, { "epoch": 2.763157894736842, "grad_norm": 0.7079586191256172, "learning_rate": 3.0834799757126733e-06, "loss": 0.6025, "num_tokens": 236240870.0, "step": 1155 }, { "epoch": 2.7655502392344498, "grad_norm": 0.6307140834614965, "learning_rate": 3.0761274409298225e-06, "loss": 0.6264, "num_tokens": 236448909.0, "step": 1156 }, { "epoch": 2.7679425837320575, "grad_norm": 0.6165485458109691, "learning_rate": 3.0687840103799273e-06, "loss": 0.5498, "num_tokens": 236637469.0, "step": 1157 }, { "epoch": 2.770334928229665, "grad_norm": 0.7029934436003829, "learning_rate": 3.061449711645422e-06, "loss": 0.4857, "num_tokens": 236807888.0, "step": 1158 }, { "epoch": 2.7727272727272725, "grad_norm": 0.6048751465681457, "learning_rate": 3.0541245722744386e-06, "loss": 0.5999, "num_tokens": 237043548.0, "step": 1159 }, { "epoch": 2.77511961722488, "grad_norm": 0.5304879535734554, "learning_rate": 3.0468086197807067e-06, "loss": 0.4942, "num_tokens": 237306083.0, "step": 1160 }, { "epoch": 2.777511961722488, "grad_norm": 0.5216652603990098, "learning_rate": 3.0395018816434518e-06, "loss": 0.5044, "num_tokens": 237542954.0, "step": 1161 }, { "epoch": 2.7799043062200957, "grad_norm": 0.5560558785906827, "learning_rate": 3.032204385307286e-06, "loss": 0.6172, "num_tokens": 237805450.0, "step": 1162 }, { "epoch": 2.7822966507177034, "grad_norm": 0.5940789140636036, "learning_rate": 3.024916158182111e-06, "loss": 0.5746, "num_tokens": 238027547.0, "step": 1163 }, { "epoch": 2.784688995215311, "grad_norm": 0.5980952368279757, "learning_rate": 3.017637227643012e-06, "loss": 0.6859, "num_tokens": 238267470.0, "step": 1164 }, { "epoch": 2.787081339712919, "grad_norm": 0.6098108550239246, "learning_rate": 3.010367621030154e-06, "loss": 0.6507, "num_tokens": 238499989.0, "step": 1165 }, { "epoch": 2.7894736842105265, "grad_norm": 0.5189097810260411, "learning_rate": 3.0031073656486837e-06, "loss": 0.523, "num_tokens": 238739324.0, "step": 1166 }, { "epoch": 2.791866028708134, "grad_norm": 0.6670038458917087, "learning_rate": 2.9958564887686215e-06, "loss": 0.5835, "num_tokens": 238934431.0, "step": 1167 }, { "epoch": 2.7942583732057416, "grad_norm": 0.702165953457099, "learning_rate": 2.988615017624763e-06, "loss": 0.7271, "num_tokens": 239127556.0, "step": 1168 }, { "epoch": 2.7966507177033493, "grad_norm": 0.6075352549844745, "learning_rate": 2.981382979416574e-06, "loss": 0.5385, "num_tokens": 239336182.0, "step": 1169 }, { "epoch": 2.799043062200957, "grad_norm": 0.6577445889508683, "learning_rate": 2.974160401308091e-06, "loss": 0.5115, "num_tokens": 239498071.0, "step": 1170 }, { "epoch": 2.8014354066985647, "grad_norm": 0.5991280998688097, "learning_rate": 2.966947310427816e-06, "loss": 0.3523, "num_tokens": 239663863.0, "step": 1171 }, { "epoch": 2.803827751196172, "grad_norm": 0.5288753605341616, "learning_rate": 2.959743733868618e-06, "loss": 0.6265, "num_tokens": 239917379.0, "step": 1172 }, { "epoch": 2.8062200956937797, "grad_norm": 0.6582699532838108, "learning_rate": 2.952549698687628e-06, "loss": 0.5765, "num_tokens": 240092500.0, "step": 1173 }, { "epoch": 2.8086124401913874, "grad_norm": 0.6417080651634351, "learning_rate": 2.9453652319061406e-06, "loss": 0.6619, "num_tokens": 240292939.0, "step": 1174 }, { "epoch": 2.811004784688995, "grad_norm": 0.6397601763793894, "learning_rate": 2.9381903605095084e-06, "loss": 0.5063, "num_tokens": 240496298.0, "step": 1175 }, { "epoch": 2.813397129186603, "grad_norm": 0.694398445810493, "learning_rate": 2.9310251114470455e-06, "loss": 0.6727, "num_tokens": 240710446.0, "step": 1176 }, { "epoch": 2.8157894736842106, "grad_norm": 0.6298183779915213, "learning_rate": 2.923869511631925e-06, "loss": 0.5447, "num_tokens": 240901068.0, "step": 1177 }, { "epoch": 2.8181818181818183, "grad_norm": 0.6481343638854814, "learning_rate": 2.916723587941068e-06, "loss": 0.5114, "num_tokens": 241084940.0, "step": 1178 }, { "epoch": 2.820574162679426, "grad_norm": 0.6356530269175816, "learning_rate": 2.9095873672150664e-06, "loss": 0.5139, "num_tokens": 241234540.0, "step": 1179 }, { "epoch": 2.8229665071770333, "grad_norm": 0.4917846106897839, "learning_rate": 2.902460876258053e-06, "loss": 0.3871, "num_tokens": 241457101.0, "step": 1180 }, { "epoch": 2.825358851674641, "grad_norm": 0.508873897219866, "learning_rate": 2.895344141837623e-06, "loss": 0.6379, "num_tokens": 241712060.0, "step": 1181 }, { "epoch": 2.827751196172249, "grad_norm": 0.6102406723360337, "learning_rate": 2.888237190684723e-06, "loss": 0.5976, "num_tokens": 241897217.0, "step": 1182 }, { "epoch": 2.8301435406698565, "grad_norm": 0.6716288629809128, "learning_rate": 2.8811400494935514e-06, "loss": 0.596, "num_tokens": 242076528.0, "step": 1183 }, { "epoch": 2.8325358851674642, "grad_norm": 0.6693125442801139, "learning_rate": 2.874052744921463e-06, "loss": 0.6404, "num_tokens": 242281353.0, "step": 1184 }, { "epoch": 2.8349282296650715, "grad_norm": 0.7474600972985558, "learning_rate": 2.866975303588863e-06, "loss": 0.431, "num_tokens": 242390556.0, "step": 1185 }, { "epoch": 2.8373205741626792, "grad_norm": 0.70776528403428, "learning_rate": 2.8599077520791113e-06, "loss": 0.6298, "num_tokens": 242574039.0, "step": 1186 }, { "epoch": 2.839712918660287, "grad_norm": 0.6671876432775635, "learning_rate": 2.85285011693842e-06, "loss": 0.6645, "num_tokens": 242774348.0, "step": 1187 }, { "epoch": 2.8421052631578947, "grad_norm": 0.6586262202984541, "learning_rate": 2.8458024246757542e-06, "loss": 0.6766, "num_tokens": 242980501.0, "step": 1188 }, { "epoch": 2.8444976076555024, "grad_norm": 0.7257360375805328, "learning_rate": 2.838764701762733e-06, "loss": 0.4131, "num_tokens": 243075975.0, "step": 1189 }, { "epoch": 2.84688995215311, "grad_norm": 0.6177579979073565, "learning_rate": 2.8317369746335336e-06, "loss": 0.549, "num_tokens": 243251454.0, "step": 1190 }, { "epoch": 2.849282296650718, "grad_norm": 0.6230229958426546, "learning_rate": 2.8247192696847793e-06, "loss": 0.5025, "num_tokens": 243413685.0, "step": 1191 }, { "epoch": 2.8516746411483256, "grad_norm": 0.6701426528255272, "learning_rate": 2.8177116132754613e-06, "loss": 0.7372, "num_tokens": 243621688.0, "step": 1192 }, { "epoch": 2.854066985645933, "grad_norm": 0.6496796729542728, "learning_rate": 2.8107140317268155e-06, "loss": 0.5084, "num_tokens": 243773491.0, "step": 1193 }, { "epoch": 2.8564593301435406, "grad_norm": 0.581030373061313, "learning_rate": 2.803726551322249e-06, "loss": 0.49, "num_tokens": 243962139.0, "step": 1194 }, { "epoch": 2.8588516746411483, "grad_norm": 0.6196073988069656, "learning_rate": 2.796749198307216e-06, "loss": 0.5849, "num_tokens": 244175734.0, "step": 1195 }, { "epoch": 2.861244019138756, "grad_norm": 0.6856361894478656, "learning_rate": 2.7897819988891395e-06, "loss": 0.5156, "num_tokens": 244385159.0, "step": 1196 }, { "epoch": 2.8636363636363638, "grad_norm": 0.7476507018765085, "learning_rate": 2.782824979237301e-06, "loss": 0.6144, "num_tokens": 244576046.0, "step": 1197 }, { "epoch": 2.866028708133971, "grad_norm": 0.6475977052467857, "learning_rate": 2.775878165482748e-06, "loss": 0.3768, "num_tokens": 244751373.0, "step": 1198 }, { "epoch": 2.8684210526315788, "grad_norm": 0.6369236071255224, "learning_rate": 2.7689415837181938e-06, "loss": 0.5896, "num_tokens": 244949211.0, "step": 1199 }, { "epoch": 2.8708133971291865, "grad_norm": 0.668885759771659, "learning_rate": 2.7620152599979187e-06, "loss": 0.4775, "num_tokens": 245111385.0, "step": 1200 }, { "epoch": 2.873205741626794, "grad_norm": 0.6643818203128119, "learning_rate": 2.7550992203376746e-06, "loss": 0.5528, "num_tokens": 245269535.0, "step": 1201 }, { "epoch": 2.875598086124402, "grad_norm": 0.550651881589533, "learning_rate": 2.7481934907145834e-06, "loss": 0.389, "num_tokens": 245470303.0, "step": 1202 }, { "epoch": 2.8779904306220097, "grad_norm": 0.5848540423797766, "learning_rate": 2.741298097067045e-06, "loss": 0.4821, "num_tokens": 245648787.0, "step": 1203 }, { "epoch": 2.8803827751196174, "grad_norm": 0.634574634635918, "learning_rate": 2.7344130652946297e-06, "loss": 0.5777, "num_tokens": 245840189.0, "step": 1204 }, { "epoch": 2.882775119617225, "grad_norm": 0.5812504617218902, "learning_rate": 2.727538421258001e-06, "loss": 0.5581, "num_tokens": 246026826.0, "step": 1205 }, { "epoch": 2.8851674641148324, "grad_norm": 0.7546932260743351, "learning_rate": 2.7206741907787894e-06, "loss": 0.6422, "num_tokens": 246221707.0, "step": 1206 }, { "epoch": 2.88755980861244, "grad_norm": 0.6438170658948759, "learning_rate": 2.713820399639526e-06, "loss": 0.5042, "num_tokens": 246407305.0, "step": 1207 }, { "epoch": 2.889952153110048, "grad_norm": 0.6527785852932717, "learning_rate": 2.7069770735835187e-06, "loss": 0.4221, "num_tokens": 246554288.0, "step": 1208 }, { "epoch": 2.8923444976076556, "grad_norm": 0.6539575911422356, "learning_rate": 2.700144238314778e-06, "loss": 0.6612, "num_tokens": 246761962.0, "step": 1209 }, { "epoch": 2.8947368421052633, "grad_norm": 0.6864455493850431, "learning_rate": 2.6933219194979026e-06, "loss": 0.6308, "num_tokens": 246962519.0, "step": 1210 }, { "epoch": 2.8971291866028706, "grad_norm": 0.6117976724684638, "learning_rate": 2.6865101427579947e-06, "loss": 0.6078, "num_tokens": 247163580.0, "step": 1211 }, { "epoch": 2.8995215311004783, "grad_norm": 0.5331232631649402, "learning_rate": 2.6797089336805593e-06, "loss": 0.3885, "num_tokens": 247358140.0, "step": 1212 }, { "epoch": 2.901913875598086, "grad_norm": 0.568828690618135, "learning_rate": 2.6729183178114073e-06, "loss": 0.5382, "num_tokens": 247571321.0, "step": 1213 }, { "epoch": 2.9043062200956937, "grad_norm": 0.6189893931640706, "learning_rate": 2.6661383206565618e-06, "loss": 0.5882, "num_tokens": 247767989.0, "step": 1214 }, { "epoch": 2.9066985645933014, "grad_norm": 0.5178577121697557, "learning_rate": 2.6593689676821615e-06, "loss": 0.3739, "num_tokens": 247952838.0, "step": 1215 }, { "epoch": 2.909090909090909, "grad_norm": 0.6986764619040865, "learning_rate": 2.652610284314365e-06, "loss": 0.6257, "num_tokens": 248123176.0, "step": 1216 }, { "epoch": 2.911483253588517, "grad_norm": 0.5886690805293942, "learning_rate": 2.6458622959392503e-06, "loss": 0.5471, "num_tokens": 248335747.0, "step": 1217 }, { "epoch": 2.9138755980861246, "grad_norm": 0.585596258764202, "learning_rate": 2.639125027902735e-06, "loss": 0.5411, "num_tokens": 248603711.0, "step": 1218 }, { "epoch": 2.916267942583732, "grad_norm": 0.48717043479237854, "learning_rate": 2.6323985055104563e-06, "loss": 0.5637, "num_tokens": 248894392.0, "step": 1219 }, { "epoch": 2.9186602870813396, "grad_norm": 0.6566559983795414, "learning_rate": 2.625682754027705e-06, "loss": 0.6663, "num_tokens": 249135781.0, "step": 1220 }, { "epoch": 2.9210526315789473, "grad_norm": 0.5735215705319195, "learning_rate": 2.618977798679301e-06, "loss": 0.5486, "num_tokens": 249335710.0, "step": 1221 }, { "epoch": 2.923444976076555, "grad_norm": 0.6211751646592965, "learning_rate": 2.6122836646495265e-06, "loss": 0.5461, "num_tokens": 249549041.0, "step": 1222 }, { "epoch": 2.925837320574163, "grad_norm": 0.642131723486152, "learning_rate": 2.6056003770820064e-06, "loss": 0.6271, "num_tokens": 249748547.0, "step": 1223 }, { "epoch": 2.92822966507177, "grad_norm": 0.7207742139976621, "learning_rate": 2.598927961079637e-06, "loss": 0.4905, "num_tokens": 249881783.0, "step": 1224 }, { "epoch": 2.930622009569378, "grad_norm": 0.6573176241763478, "learning_rate": 2.5922664417044687e-06, "loss": 0.5394, "num_tokens": 250060894.0, "step": 1225 }, { "epoch": 2.9330143540669855, "grad_norm": 0.47731017053274466, "learning_rate": 2.5856158439776313e-06, "loss": 0.3004, "num_tokens": 250217743.0, "step": 1226 }, { "epoch": 2.9354066985645932, "grad_norm": 0.6630055860064705, "learning_rate": 2.57897619287923e-06, "loss": 0.5711, "num_tokens": 250453281.0, "step": 1227 }, { "epoch": 2.937799043062201, "grad_norm": 0.5738916532045819, "learning_rate": 2.5723475133482536e-06, "loss": 0.5421, "num_tokens": 250666240.0, "step": 1228 }, { "epoch": 2.9401913875598087, "grad_norm": 0.5811112867173626, "learning_rate": 2.5657298302824807e-06, "loss": 0.4771, "num_tokens": 250890460.0, "step": 1229 }, { "epoch": 2.9425837320574164, "grad_norm": 0.6548400631555413, "learning_rate": 2.559123168538384e-06, "loss": 0.6436, "num_tokens": 251071802.0, "step": 1230 }, { "epoch": 2.944976076555024, "grad_norm": 0.6206799035843503, "learning_rate": 2.552527552931047e-06, "loss": 0.5424, "num_tokens": 251287330.0, "step": 1231 }, { "epoch": 2.9473684210526314, "grad_norm": 0.7116397379248429, "learning_rate": 2.545943008234052e-06, "loss": 0.3903, "num_tokens": 251421447.0, "step": 1232 }, { "epoch": 2.949760765550239, "grad_norm": 0.6222630079338823, "learning_rate": 2.53936955917941e-06, "loss": 0.6054, "num_tokens": 251640901.0, "step": 1233 }, { "epoch": 2.952153110047847, "grad_norm": 0.5614733325607167, "learning_rate": 2.5328072304574447e-06, "loss": 0.5067, "num_tokens": 251839968.0, "step": 1234 }, { "epoch": 2.9545454545454546, "grad_norm": 0.5829510657829275, "learning_rate": 2.526256046716723e-06, "loss": 0.3963, "num_tokens": 252001429.0, "step": 1235 }, { "epoch": 2.9569377990430623, "grad_norm": 0.705061218204218, "learning_rate": 2.519716032563937e-06, "loss": 0.6263, "num_tokens": 252201921.0, "step": 1236 }, { "epoch": 2.9593301435406696, "grad_norm": 0.7565419642872887, "learning_rate": 2.5131872125638395e-06, "loss": 0.4781, "num_tokens": 252323614.0, "step": 1237 }, { "epoch": 2.9617224880382773, "grad_norm": 0.6684300894500792, "learning_rate": 2.5066696112391242e-06, "loss": 0.6936, "num_tokens": 252532906.0, "step": 1238 }, { "epoch": 2.964114832535885, "grad_norm": 0.5483020414647526, "learning_rate": 2.5001632530703548e-06, "loss": 0.4427, "num_tokens": 252733652.0, "step": 1239 }, { "epoch": 2.9665071770334928, "grad_norm": 0.6557740442678165, "learning_rate": 2.493668162495861e-06, "loss": 0.5387, "num_tokens": 252911982.0, "step": 1240 }, { "epoch": 2.9688995215311005, "grad_norm": 0.6463982314075196, "learning_rate": 2.4871843639116546e-06, "loss": 0.5954, "num_tokens": 253079059.0, "step": 1241 }, { "epoch": 2.971291866028708, "grad_norm": 0.6447361732169392, "learning_rate": 2.480711881671331e-06, "loss": 0.4672, "num_tokens": 253237391.0, "step": 1242 }, { "epoch": 2.973684210526316, "grad_norm": 0.5759297479374669, "learning_rate": 2.474250740085977e-06, "loss": 0.4308, "num_tokens": 253440527.0, "step": 1243 }, { "epoch": 2.9760765550239237, "grad_norm": 0.5509943045604462, "learning_rate": 2.467800963424094e-06, "loss": 0.4426, "num_tokens": 253691730.0, "step": 1244 }, { "epoch": 2.9784688995215314, "grad_norm": 0.6422357151167679, "learning_rate": 2.461362575911482e-06, "loss": 0.613, "num_tokens": 253892279.0, "step": 1245 }, { "epoch": 2.9808612440191387, "grad_norm": 0.6064370988052857, "learning_rate": 2.454935601731177e-06, "loss": 0.5947, "num_tokens": 254101835.0, "step": 1246 }, { "epoch": 2.9832535885167464, "grad_norm": 0.6820673517757627, "learning_rate": 2.4485200650233317e-06, "loss": 0.5324, "num_tokens": 254297612.0, "step": 1247 }, { "epoch": 2.985645933014354, "grad_norm": 0.6652085972974082, "learning_rate": 2.4421159898851517e-06, "loss": 0.5012, "num_tokens": 254474125.0, "step": 1248 }, { "epoch": 2.988038277511962, "grad_norm": 0.5996508902229061, "learning_rate": 2.4357234003707806e-06, "loss": 0.6659, "num_tokens": 254713889.0, "step": 1249 }, { "epoch": 2.990430622009569, "grad_norm": 0.7053309145898407, "learning_rate": 2.429342320491232e-06, "loss": 0.7754, "num_tokens": 254904822.0, "step": 1250 }, { "epoch": 2.992822966507177, "grad_norm": 0.5968448525444648, "learning_rate": 2.4229727742142786e-06, "loss": 0.4946, "num_tokens": 255079070.0, "step": 1251 }, { "epoch": 2.9952153110047846, "grad_norm": 0.6978601159410557, "learning_rate": 2.4166147854643795e-06, "loss": 0.8096, "num_tokens": 255288793.0, "step": 1252 }, { "epoch": 2.9976076555023923, "grad_norm": 0.6134744705998697, "learning_rate": 2.4102683781225787e-06, "loss": 0.625, "num_tokens": 255522740.0, "step": 1253 }, { "epoch": 3.0, "grad_norm": 0.6378341238106746, "learning_rate": 2.4039335760264217e-06, "loss": 0.5745, "num_tokens": 255716206.0, "step": 1254 }, { "epoch": 3.0, "eval_loss": 0.42938756942749023, "eval_num_tokens": 255716206.0, "eval_runtime": 109.1806, "eval_samples_per_second": 27.221, "eval_steps_per_second": 3.407, "step": 1254 }, { "epoch": 3.0023923444976077, "grad_norm": 0.7402586261049895, "learning_rate": 2.3976104029698633e-06, "loss": 0.4572, "num_tokens": 255913889.0, "step": 1255 }, { "epoch": 3.0047846889952154, "grad_norm": 0.7387412356378916, "learning_rate": 2.3912988827031782e-06, "loss": 0.6147, "num_tokens": 256166684.0, "step": 1256 }, { "epoch": 3.007177033492823, "grad_norm": 0.6999477349602615, "learning_rate": 2.3849990389328756e-06, "loss": 0.4733, "num_tokens": 256312393.0, "step": 1257 }, { "epoch": 3.0095693779904304, "grad_norm": 0.5690209540504421, "learning_rate": 2.378710895321599e-06, "loss": 0.3537, "num_tokens": 256512256.0, "step": 1258 }, { "epoch": 3.011961722488038, "grad_norm": 0.7890373783734057, "learning_rate": 2.3724344754880567e-06, "loss": 0.5304, "num_tokens": 256727440.0, "step": 1259 }, { "epoch": 3.014354066985646, "grad_norm": 0.9614164329801103, "learning_rate": 2.3661698030069102e-06, "loss": 0.6293, "num_tokens": 256908452.0, "step": 1260 }, { "epoch": 3.0167464114832536, "grad_norm": 0.6512936329535286, "learning_rate": 2.359916901408708e-06, "loss": 0.4104, "num_tokens": 257118864.0, "step": 1261 }, { "epoch": 3.0191387559808613, "grad_norm": 0.6700153979446825, "learning_rate": 2.3536757941797756e-06, "loss": 0.5046, "num_tokens": 257330088.0, "step": 1262 }, { "epoch": 3.021531100478469, "grad_norm": 0.677137498964303, "learning_rate": 2.3474465047621483e-06, "loss": 0.5112, "num_tokens": 257491666.0, "step": 1263 }, { "epoch": 3.0239234449760763, "grad_norm": 0.6832888725075986, "learning_rate": 2.341229056553464e-06, "loss": 0.6466, "num_tokens": 257705251.0, "step": 1264 }, { "epoch": 3.026315789473684, "grad_norm": 0.6843703156850178, "learning_rate": 2.3350234729068903e-06, "loss": 0.5112, "num_tokens": 257901651.0, "step": 1265 }, { "epoch": 3.028708133971292, "grad_norm": 0.6746922650910443, "learning_rate": 2.3288297771310275e-06, "loss": 0.6056, "num_tokens": 258111668.0, "step": 1266 }, { "epoch": 3.0311004784688995, "grad_norm": 0.63011570756557, "learning_rate": 2.3226479924898256e-06, "loss": 0.5336, "num_tokens": 258323599.0, "step": 1267 }, { "epoch": 3.0334928229665072, "grad_norm": 0.6593826509477446, "learning_rate": 2.316478142202496e-06, "loss": 0.4867, "num_tokens": 258494973.0, "step": 1268 }, { "epoch": 3.035885167464115, "grad_norm": 0.5949720867828437, "learning_rate": 2.310320249443422e-06, "loss": 0.4177, "num_tokens": 258689669.0, "step": 1269 }, { "epoch": 3.0382775119617227, "grad_norm": 0.5964321271477145, "learning_rate": 2.3041743373420745e-06, "loss": 0.4252, "num_tokens": 258906775.0, "step": 1270 }, { "epoch": 3.04066985645933, "grad_norm": 0.6582924002907377, "learning_rate": 2.298040428982925e-06, "loss": 0.5591, "num_tokens": 259077744.0, "step": 1271 }, { "epoch": 3.0430622009569377, "grad_norm": 0.6279206160144934, "learning_rate": 2.291918547405359e-06, "loss": 0.4881, "num_tokens": 259270234.0, "step": 1272 }, { "epoch": 3.0454545454545454, "grad_norm": 0.553921649997124, "learning_rate": 2.285808715603583e-06, "loss": 0.3983, "num_tokens": 259472134.0, "step": 1273 }, { "epoch": 3.047846889952153, "grad_norm": 0.5492785480976846, "learning_rate": 2.279710956526553e-06, "loss": 0.4185, "num_tokens": 259678929.0, "step": 1274 }, { "epoch": 3.050239234449761, "grad_norm": 0.5846959421173259, "learning_rate": 2.273625293077867e-06, "loss": 0.5988, "num_tokens": 259932212.0, "step": 1275 }, { "epoch": 3.0526315789473686, "grad_norm": 0.589709064028512, "learning_rate": 2.2675517481157055e-06, "loss": 0.4188, "num_tokens": 260146724.0, "step": 1276 }, { "epoch": 3.055023923444976, "grad_norm": 0.6274338008637919, "learning_rate": 2.2614903444527188e-06, "loss": 0.4442, "num_tokens": 260350406.0, "step": 1277 }, { "epoch": 3.0574162679425836, "grad_norm": 0.7915974724556752, "learning_rate": 2.255441104855958e-06, "loss": 0.5775, "num_tokens": 260538578.0, "step": 1278 }, { "epoch": 3.0598086124401913, "grad_norm": 0.584101511052562, "learning_rate": 2.2494040520467874e-06, "loss": 0.4285, "num_tokens": 260755344.0, "step": 1279 }, { "epoch": 3.062200956937799, "grad_norm": 0.5949543375784039, "learning_rate": 2.243379208700793e-06, "loss": 0.4389, "num_tokens": 260929211.0, "step": 1280 }, { "epoch": 3.0645933014354068, "grad_norm": 0.7097892851187321, "learning_rate": 2.237366597447705e-06, "loss": 0.6095, "num_tokens": 261139280.0, "step": 1281 }, { "epoch": 3.0669856459330145, "grad_norm": 0.6198718642387921, "learning_rate": 2.2313662408713054e-06, "loss": 0.5429, "num_tokens": 261341146.0, "step": 1282 }, { "epoch": 3.069377990430622, "grad_norm": 0.6087302978836517, "learning_rate": 2.225378161509349e-06, "loss": 0.5559, "num_tokens": 261581286.0, "step": 1283 }, { "epoch": 3.0717703349282295, "grad_norm": 0.6220061356000266, "learning_rate": 2.2194023818534767e-06, "loss": 0.4997, "num_tokens": 261799499.0, "step": 1284 }, { "epoch": 3.074162679425837, "grad_norm": 0.5368146821622435, "learning_rate": 2.2134389243491296e-06, "loss": 0.5151, "num_tokens": 262026022.0, "step": 1285 }, { "epoch": 3.076555023923445, "grad_norm": 0.7153515807441471, "learning_rate": 2.207487811395466e-06, "loss": 0.6361, "num_tokens": 262256556.0, "step": 1286 }, { "epoch": 3.0789473684210527, "grad_norm": 0.6213730994225999, "learning_rate": 2.201549065345278e-06, "loss": 0.5283, "num_tokens": 262477701.0, "step": 1287 }, { "epoch": 3.0813397129186604, "grad_norm": 0.6574330655368827, "learning_rate": 2.195622708504907e-06, "loss": 0.4982, "num_tokens": 262690340.0, "step": 1288 }, { "epoch": 3.083732057416268, "grad_norm": 0.6575370913279897, "learning_rate": 2.1897087631341595e-06, "loss": 0.5313, "num_tokens": 262928530.0, "step": 1289 }, { "epoch": 3.0861244019138754, "grad_norm": 0.6051544279189891, "learning_rate": 2.1838072514462224e-06, "loss": 0.4823, "num_tokens": 263167965.0, "step": 1290 }, { "epoch": 3.088516746411483, "grad_norm": 0.6999294105552621, "learning_rate": 2.1779181956075805e-06, "loss": 0.4199, "num_tokens": 263323188.0, "step": 1291 }, { "epoch": 3.090909090909091, "grad_norm": 0.5627138982473117, "learning_rate": 2.1720416177379366e-06, "loss": 0.4115, "num_tokens": 263537362.0, "step": 1292 }, { "epoch": 3.0933014354066986, "grad_norm": 0.629390025884304, "learning_rate": 2.1661775399101234e-06, "loss": 0.4305, "num_tokens": 263749336.0, "step": 1293 }, { "epoch": 3.0956937799043063, "grad_norm": 0.7247727661733878, "learning_rate": 2.1603259841500228e-06, "loss": 0.5312, "num_tokens": 263914691.0, "step": 1294 }, { "epoch": 3.098086124401914, "grad_norm": 0.5586270865087458, "learning_rate": 2.1544869724364816e-06, "loss": 0.3326, "num_tokens": 264052504.0, "step": 1295 }, { "epoch": 3.1004784688995217, "grad_norm": 0.5532261224348151, "learning_rate": 2.148660526701233e-06, "loss": 0.4917, "num_tokens": 264268596.0, "step": 1296 }, { "epoch": 3.102870813397129, "grad_norm": 0.549079831877891, "learning_rate": 2.1428466688288102e-06, "loss": 0.5321, "num_tokens": 264529822.0, "step": 1297 }, { "epoch": 3.1052631578947367, "grad_norm": 0.5671370864801888, "learning_rate": 2.1370454206564646e-06, "loss": 0.4896, "num_tokens": 264748064.0, "step": 1298 }, { "epoch": 3.1076555023923444, "grad_norm": 0.5904040731943784, "learning_rate": 2.131256803974086e-06, "loss": 0.4935, "num_tokens": 264969358.0, "step": 1299 }, { "epoch": 3.110047846889952, "grad_norm": 0.5626029492726947, "learning_rate": 2.1254808405241183e-06, "loss": 0.4896, "num_tokens": 265201219.0, "step": 1300 }, { "epoch": 3.11244019138756, "grad_norm": 0.6848742442647878, "learning_rate": 2.1197175520014806e-06, "loss": 0.5598, "num_tokens": 265406095.0, "step": 1301 }, { "epoch": 3.1148325358851676, "grad_norm": 0.675172611025597, "learning_rate": 2.1139669600534825e-06, "loss": 0.5134, "num_tokens": 265634192.0, "step": 1302 }, { "epoch": 3.117224880382775, "grad_norm": 0.5877000187862749, "learning_rate": 2.1082290862797464e-06, "loss": 0.5396, "num_tokens": 265876633.0, "step": 1303 }, { "epoch": 3.1196172248803826, "grad_norm": 0.5662213441307106, "learning_rate": 2.102503952232121e-06, "loss": 0.5159, "num_tokens": 266111430.0, "step": 1304 }, { "epoch": 3.1220095693779903, "grad_norm": 0.5985612354820727, "learning_rate": 2.0967915794146066e-06, "loss": 0.4746, "num_tokens": 266329504.0, "step": 1305 }, { "epoch": 3.124401913875598, "grad_norm": 0.6527589826039932, "learning_rate": 2.0910919892832697e-06, "loss": 0.3463, "num_tokens": 266471386.0, "step": 1306 }, { "epoch": 3.126794258373206, "grad_norm": 0.47596331322234914, "learning_rate": 2.0854052032461667e-06, "loss": 0.3295, "num_tokens": 266684099.0, "step": 1307 }, { "epoch": 3.1291866028708135, "grad_norm": 0.6108469881663725, "learning_rate": 2.0797312426632584e-06, "loss": 0.6519, "num_tokens": 266933331.0, "step": 1308 }, { "epoch": 3.1315789473684212, "grad_norm": 0.6646226322119491, "learning_rate": 2.0740701288463327e-06, "loss": 0.5259, "num_tokens": 267130637.0, "step": 1309 }, { "epoch": 3.1339712918660285, "grad_norm": 0.5508619091258276, "learning_rate": 2.068421883058925e-06, "loss": 0.3927, "num_tokens": 267299054.0, "step": 1310 }, { "epoch": 3.1363636363636362, "grad_norm": 0.6293553226109324, "learning_rate": 2.0627865265162374e-06, "loss": 0.433, "num_tokens": 267510099.0, "step": 1311 }, { "epoch": 3.138755980861244, "grad_norm": 0.7440115581163417, "learning_rate": 2.057164080385058e-06, "loss": 0.5481, "num_tokens": 267726029.0, "step": 1312 }, { "epoch": 3.1411483253588517, "grad_norm": 0.6236043688054738, "learning_rate": 2.051554565783684e-06, "loss": 0.4325, "num_tokens": 267925976.0, "step": 1313 }, { "epoch": 3.1435406698564594, "grad_norm": 0.671585593115855, "learning_rate": 2.04595800378184e-06, "loss": 0.4412, "num_tokens": 268098208.0, "step": 1314 }, { "epoch": 3.145933014354067, "grad_norm": 0.6949266221749684, "learning_rate": 2.0403744154005988e-06, "loss": 0.4432, "num_tokens": 268308006.0, "step": 1315 }, { "epoch": 3.1483253588516744, "grad_norm": 0.6473691472293852, "learning_rate": 2.034803821612308e-06, "loss": 0.5865, "num_tokens": 268514308.0, "step": 1316 }, { "epoch": 3.150717703349282, "grad_norm": 0.5536706707358635, "learning_rate": 2.0292462433404976e-06, "loss": 0.4216, "num_tokens": 268725648.0, "step": 1317 }, { "epoch": 3.15311004784689, "grad_norm": 0.6221734201066783, "learning_rate": 2.0237017014598215e-06, "loss": 0.4751, "num_tokens": 268940579.0, "step": 1318 }, { "epoch": 3.1555023923444976, "grad_norm": 0.6392719861236702, "learning_rate": 2.018170216795959e-06, "loss": 0.4724, "num_tokens": 269130369.0, "step": 1319 }, { "epoch": 3.1578947368421053, "grad_norm": 0.7162837149510706, "learning_rate": 2.0126518101255495e-06, "loss": 0.5957, "num_tokens": 269316540.0, "step": 1320 }, { "epoch": 3.160287081339713, "grad_norm": 0.5580788330841133, "learning_rate": 2.0071465021761106e-06, "loss": 0.4358, "num_tokens": 269506411.0, "step": 1321 }, { "epoch": 3.1626794258373208, "grad_norm": 0.6598206261361217, "learning_rate": 2.0016543136259595e-06, "loss": 0.5602, "num_tokens": 269701502.0, "step": 1322 }, { "epoch": 3.165071770334928, "grad_norm": 0.6581789669102812, "learning_rate": 1.996175265104136e-06, "loss": 0.529, "num_tokens": 269900469.0, "step": 1323 }, { "epoch": 3.1674641148325358, "grad_norm": 0.6059113124115182, "learning_rate": 1.9907093771903265e-06, "loss": 0.4354, "num_tokens": 270096044.0, "step": 1324 }, { "epoch": 3.1698564593301435, "grad_norm": 0.8188363685332233, "learning_rate": 1.9852566704147823e-06, "loss": 0.6595, "num_tokens": 270254500.0, "step": 1325 }, { "epoch": 3.172248803827751, "grad_norm": 0.6548140241893287, "learning_rate": 1.979817165258248e-06, "loss": 0.5036, "num_tokens": 270443820.0, "step": 1326 }, { "epoch": 3.174641148325359, "grad_norm": 0.5207871010872199, "learning_rate": 1.9743908821518813e-06, "loss": 0.4581, "num_tokens": 270676004.0, "step": 1327 }, { "epoch": 3.1770334928229667, "grad_norm": 0.5652418638260854, "learning_rate": 1.968977841477176e-06, "loss": 0.3529, "num_tokens": 270893341.0, "step": 1328 }, { "epoch": 3.179425837320574, "grad_norm": 0.596366758247755, "learning_rate": 1.9635780635658886e-06, "loss": 0.5376, "num_tokens": 271106483.0, "step": 1329 }, { "epoch": 3.1818181818181817, "grad_norm": 0.645025175672765, "learning_rate": 1.9581915686999563e-06, "loss": 0.4806, "num_tokens": 271302176.0, "step": 1330 }, { "epoch": 3.1842105263157894, "grad_norm": 0.7117928160952178, "learning_rate": 1.9528183771114295e-06, "loss": 0.4697, "num_tokens": 271514293.0, "step": 1331 }, { "epoch": 3.186602870813397, "grad_norm": 0.6221638039635599, "learning_rate": 1.9474585089823827e-06, "loss": 0.4587, "num_tokens": 271732762.0, "step": 1332 }, { "epoch": 3.188995215311005, "grad_norm": 0.7078194657393896, "learning_rate": 1.9421119844448565e-06, "loss": 0.5736, "num_tokens": 271912358.0, "step": 1333 }, { "epoch": 3.1913875598086126, "grad_norm": 0.6440523574621861, "learning_rate": 1.9367788235807632e-06, "loss": 0.507, "num_tokens": 272100294.0, "step": 1334 }, { "epoch": 3.1937799043062203, "grad_norm": 0.5978274244815054, "learning_rate": 1.931459046421825e-06, "loss": 0.5187, "num_tokens": 272301185.0, "step": 1335 }, { "epoch": 3.1961722488038276, "grad_norm": 0.5143790337279445, "learning_rate": 1.9261526729494923e-06, "loss": 0.3528, "num_tokens": 272529698.0, "step": 1336 }, { "epoch": 3.1985645933014353, "grad_norm": 0.6269390757214239, "learning_rate": 1.9208597230948717e-06, "loss": 0.4604, "num_tokens": 272750346.0, "step": 1337 }, { "epoch": 3.200956937799043, "grad_norm": 0.6020267987482506, "learning_rate": 1.9155802167386485e-06, "loss": 0.4629, "num_tokens": 272972518.0, "step": 1338 }, { "epoch": 3.2033492822966507, "grad_norm": 0.6342687191296817, "learning_rate": 1.9103141737110132e-06, "loss": 0.6085, "num_tokens": 273188653.0, "step": 1339 }, { "epoch": 3.2057416267942584, "grad_norm": 0.6471878633504836, "learning_rate": 1.9050616137915876e-06, "loss": 0.4324, "num_tokens": 273376449.0, "step": 1340 }, { "epoch": 3.208133971291866, "grad_norm": 0.6475159896178175, "learning_rate": 1.8998225567093506e-06, "loss": 0.4442, "num_tokens": 273542078.0, "step": 1341 }, { "epoch": 3.2105263157894735, "grad_norm": 0.6651285488992867, "learning_rate": 1.894597022142563e-06, "loss": 0.4446, "num_tokens": 273740563.0, "step": 1342 }, { "epoch": 3.212918660287081, "grad_norm": 0.7333478802010834, "learning_rate": 1.8893850297186938e-06, "loss": 0.4411, "num_tokens": 273911486.0, "step": 1343 }, { "epoch": 3.215311004784689, "grad_norm": 0.7195592143270082, "learning_rate": 1.8841865990143476e-06, "loss": 0.4759, "num_tokens": 274101029.0, "step": 1344 }, { "epoch": 3.2177033492822966, "grad_norm": 0.6670742198623395, "learning_rate": 1.8790017495551871e-06, "loss": 0.4744, "num_tokens": 274258804.0, "step": 1345 }, { "epoch": 3.2200956937799043, "grad_norm": 0.594034860458389, "learning_rate": 1.8738305008158692e-06, "loss": 0.5643, "num_tokens": 274480204.0, "step": 1346 }, { "epoch": 3.222488038277512, "grad_norm": 0.5442037919610911, "learning_rate": 1.8686728722199576e-06, "loss": 0.4505, "num_tokens": 274743758.0, "step": 1347 }, { "epoch": 3.22488038277512, "grad_norm": 0.6333611210516911, "learning_rate": 1.8635288831398666e-06, "loss": 0.5109, "num_tokens": 274953215.0, "step": 1348 }, { "epoch": 3.227272727272727, "grad_norm": 0.5057598137437262, "learning_rate": 1.8583985528967715e-06, "loss": 0.5229, "num_tokens": 275221291.0, "step": 1349 }, { "epoch": 3.229665071770335, "grad_norm": 0.7028594855989811, "learning_rate": 1.8532819007605482e-06, "loss": 0.5113, "num_tokens": 275395942.0, "step": 1350 }, { "epoch": 3.2320574162679425, "grad_norm": 0.6257706921052196, "learning_rate": 1.8481789459496962e-06, "loss": 0.5542, "num_tokens": 275628282.0, "step": 1351 }, { "epoch": 3.2344497607655502, "grad_norm": 0.6279792884558518, "learning_rate": 1.843089707631267e-06, "loss": 0.4097, "num_tokens": 275800544.0, "step": 1352 }, { "epoch": 3.236842105263158, "grad_norm": 0.6539270252609328, "learning_rate": 1.8380142049207916e-06, "loss": 0.5232, "num_tokens": 275973348.0, "step": 1353 }, { "epoch": 3.2392344497607657, "grad_norm": 0.6739143040433802, "learning_rate": 1.8329524568822087e-06, "loss": 0.5698, "num_tokens": 276166783.0, "step": 1354 }, { "epoch": 3.2416267942583734, "grad_norm": 0.674231860453957, "learning_rate": 1.8279044825277941e-06, "loss": 0.5229, "num_tokens": 276344559.0, "step": 1355 }, { "epoch": 3.2440191387559807, "grad_norm": 0.7948154225401068, "learning_rate": 1.8228703008180892e-06, "loss": 0.6472, "num_tokens": 276514471.0, "step": 1356 }, { "epoch": 3.2464114832535884, "grad_norm": 0.5373150672084231, "learning_rate": 1.817849930661829e-06, "loss": 0.4819, "num_tokens": 276767291.0, "step": 1357 }, { "epoch": 3.248803827751196, "grad_norm": 0.5376958767027991, "learning_rate": 1.8128433909158689e-06, "loss": 0.4425, "num_tokens": 276976997.0, "step": 1358 }, { "epoch": 3.251196172248804, "grad_norm": 0.587439310427447, "learning_rate": 1.8078507003851209e-06, "loss": 0.4103, "num_tokens": 277181334.0, "step": 1359 }, { "epoch": 3.2535885167464116, "grad_norm": 0.5554877966420847, "learning_rate": 1.8028718778224727e-06, "loss": 0.4608, "num_tokens": 277438461.0, "step": 1360 }, { "epoch": 3.2559808612440193, "grad_norm": 0.7349112175964104, "learning_rate": 1.7979069419287304e-06, "loss": 0.5519, "num_tokens": 277603370.0, "step": 1361 }, { "epoch": 3.258373205741627, "grad_norm": 0.5577771336621236, "learning_rate": 1.7929559113525322e-06, "loss": 0.5025, "num_tokens": 277816222.0, "step": 1362 }, { "epoch": 3.2607655502392343, "grad_norm": 0.6032365353365063, "learning_rate": 1.7880188046902957e-06, "loss": 0.4066, "num_tokens": 278008701.0, "step": 1363 }, { "epoch": 3.263157894736842, "grad_norm": 0.6015228385731863, "learning_rate": 1.783095640486131e-06, "loss": 0.5296, "num_tokens": 278240487.0, "step": 1364 }, { "epoch": 3.2655502392344498, "grad_norm": 0.5754578129860267, "learning_rate": 1.778186437231786e-06, "loss": 0.6699, "num_tokens": 278485002.0, "step": 1365 }, { "epoch": 3.2679425837320575, "grad_norm": 0.5289445323732264, "learning_rate": 1.7732912133665658e-06, "loss": 0.202, "num_tokens": 278625829.0, "step": 1366 }, { "epoch": 3.270334928229665, "grad_norm": 0.5179852711199288, "learning_rate": 1.7684099872772714e-06, "loss": 0.4492, "num_tokens": 278887561.0, "step": 1367 }, { "epoch": 3.2727272727272725, "grad_norm": 0.656713592521766, "learning_rate": 1.7635427772981245e-06, "loss": 0.5281, "num_tokens": 279094325.0, "step": 1368 }, { "epoch": 3.27511961722488, "grad_norm": 0.6915296583387971, "learning_rate": 1.7586896017107024e-06, "loss": 0.4877, "num_tokens": 279256505.0, "step": 1369 }, { "epoch": 3.277511961722488, "grad_norm": 0.5676187090907497, "learning_rate": 1.7538504787438698e-06, "loss": 0.4877, "num_tokens": 279501359.0, "step": 1370 }, { "epoch": 3.2799043062200957, "grad_norm": 0.5059203117134556, "learning_rate": 1.749025426573704e-06, "loss": 0.4648, "num_tokens": 279742742.0, "step": 1371 }, { "epoch": 3.2822966507177034, "grad_norm": 0.679642013673189, "learning_rate": 1.7442144633234385e-06, "loss": 0.5408, "num_tokens": 279936081.0, "step": 1372 }, { "epoch": 3.284688995215311, "grad_norm": 0.5728793788182743, "learning_rate": 1.7394176070633801e-06, "loss": 0.5537, "num_tokens": 280175806.0, "step": 1373 }, { "epoch": 3.287081339712919, "grad_norm": 0.5101464525299488, "learning_rate": 1.7346348758108558e-06, "loss": 0.3603, "num_tokens": 280390741.0, "step": 1374 }, { "epoch": 3.2894736842105265, "grad_norm": 0.6298630050286675, "learning_rate": 1.729866287530132e-06, "loss": 0.6027, "num_tokens": 280626762.0, "step": 1375 }, { "epoch": 3.291866028708134, "grad_norm": 0.610266116505584, "learning_rate": 1.7251118601323606e-06, "loss": 0.3322, "num_tokens": 280826020.0, "step": 1376 }, { "epoch": 3.2942583732057416, "grad_norm": 0.6684549544777105, "learning_rate": 1.7203716114754937e-06, "loss": 0.4599, "num_tokens": 281012635.0, "step": 1377 }, { "epoch": 3.2966507177033493, "grad_norm": 0.5036480775036766, "learning_rate": 1.7156455593642387e-06, "loss": 0.3533, "num_tokens": 281221620.0, "step": 1378 }, { "epoch": 3.299043062200957, "grad_norm": 0.7343363629973206, "learning_rate": 1.7109337215499711e-06, "loss": 0.5138, "num_tokens": 281411026.0, "step": 1379 }, { "epoch": 3.3014354066985647, "grad_norm": 0.7136580742476932, "learning_rate": 1.7062361157306795e-06, "loss": 0.6031, "num_tokens": 281635408.0, "step": 1380 }, { "epoch": 3.303827751196172, "grad_norm": 0.6779875945232949, "learning_rate": 1.7015527595508973e-06, "loss": 0.435, "num_tokens": 281823429.0, "step": 1381 }, { "epoch": 3.3062200956937797, "grad_norm": 0.6165374543507615, "learning_rate": 1.6968836706016334e-06, "loss": 0.5343, "num_tokens": 282042936.0, "step": 1382 }, { "epoch": 3.3086124401913874, "grad_norm": 0.6671429016147503, "learning_rate": 1.6922288664203106e-06, "loss": 0.6049, "num_tokens": 282244064.0, "step": 1383 }, { "epoch": 3.311004784688995, "grad_norm": 0.6971528883690388, "learning_rate": 1.6875883644906916e-06, "loss": 0.4373, "num_tokens": 282421524.0, "step": 1384 }, { "epoch": 3.313397129186603, "grad_norm": 0.6347144039685346, "learning_rate": 1.6829621822428273e-06, "loss": 0.4571, "num_tokens": 282591812.0, "step": 1385 }, { "epoch": 3.3157894736842106, "grad_norm": 0.596451326446268, "learning_rate": 1.678350337052973e-06, "loss": 0.5014, "num_tokens": 282841560.0, "step": 1386 }, { "epoch": 3.3181818181818183, "grad_norm": 0.5504535023805781, "learning_rate": 1.6737528462435432e-06, "loss": 0.4702, "num_tokens": 283075035.0, "step": 1387 }, { "epoch": 3.320574162679426, "grad_norm": 0.6372384711768105, "learning_rate": 1.6691697270830264e-06, "loss": 0.4023, "num_tokens": 283262952.0, "step": 1388 }, { "epoch": 3.3229665071770333, "grad_norm": 0.44016337227270674, "learning_rate": 1.6646009967859406e-06, "loss": 0.3404, "num_tokens": 283507482.0, "step": 1389 }, { "epoch": 3.325358851674641, "grad_norm": 0.6430682309007208, "learning_rate": 1.660046672512747e-06, "loss": 0.6049, "num_tokens": 283741511.0, "step": 1390 }, { "epoch": 3.327751196172249, "grad_norm": 0.6662078531560585, "learning_rate": 1.6555067713698078e-06, "loss": 0.4163, "num_tokens": 283910656.0, "step": 1391 }, { "epoch": 3.3301435406698565, "grad_norm": 0.5236951135874593, "learning_rate": 1.6509813104093025e-06, "loss": 0.433, "num_tokens": 284141437.0, "step": 1392 }, { "epoch": 3.3325358851674642, "grad_norm": 0.5891915715828933, "learning_rate": 1.646470306629176e-06, "loss": 0.4364, "num_tokens": 284302071.0, "step": 1393 }, { "epoch": 3.334928229665072, "grad_norm": 0.5659081142660974, "learning_rate": 1.64197377697307e-06, "loss": 0.3732, "num_tokens": 284465884.0, "step": 1394 }, { "epoch": 3.3373205741626792, "grad_norm": 0.581006107542102, "learning_rate": 1.6374917383302616e-06, "loss": 0.3888, "num_tokens": 284650977.0, "step": 1395 }, { "epoch": 3.339712918660287, "grad_norm": 0.6364252639645998, "learning_rate": 1.6330242075355996e-06, "loss": 0.4256, "num_tokens": 284823907.0, "step": 1396 }, { "epoch": 3.3421052631578947, "grad_norm": 0.6938031314089937, "learning_rate": 1.6285712013694338e-06, "loss": 0.4122, "num_tokens": 284984075.0, "step": 1397 }, { "epoch": 3.3444976076555024, "grad_norm": 0.6895442975483198, "learning_rate": 1.6241327365575688e-06, "loss": 0.564, "num_tokens": 285203372.0, "step": 1398 }, { "epoch": 3.34688995215311, "grad_norm": 0.5894084030392284, "learning_rate": 1.6197088297711806e-06, "loss": 0.4046, "num_tokens": 285398044.0, "step": 1399 }, { "epoch": 3.349282296650718, "grad_norm": 0.72395557559556, "learning_rate": 1.6152994976267732e-06, "loss": 0.5514, "num_tokens": 285592197.0, "step": 1400 }, { "epoch": 3.3516746411483256, "grad_norm": 0.6491839671223758, "learning_rate": 1.6109047566860997e-06, "loss": 0.3798, "num_tokens": 285738915.0, "step": 1401 }, { "epoch": 3.354066985645933, "grad_norm": 0.7049883361490701, "learning_rate": 1.6065246234561137e-06, "loss": 0.6441, "num_tokens": 285928806.0, "step": 1402 }, { "epoch": 3.3564593301435406, "grad_norm": 0.6036528727339665, "learning_rate": 1.602159114388896e-06, "loss": 0.4551, "num_tokens": 286166116.0, "step": 1403 }, { "epoch": 3.3588516746411483, "grad_norm": 0.6062845296574271, "learning_rate": 1.5978082458816032e-06, "loss": 0.5571, "num_tokens": 286423925.0, "step": 1404 }, { "epoch": 3.361244019138756, "grad_norm": 0.6306014380075803, "learning_rate": 1.593472034276396e-06, "loss": 0.4419, "num_tokens": 286624999.0, "step": 1405 }, { "epoch": 3.3636363636363638, "grad_norm": 0.7035613097127984, "learning_rate": 1.5891504958603867e-06, "loss": 0.4917, "num_tokens": 286794117.0, "step": 1406 }, { "epoch": 3.3660287081339715, "grad_norm": 0.665456708111723, "learning_rate": 1.584843646865571e-06, "loss": 0.5867, "num_tokens": 287043237.0, "step": 1407 }, { "epoch": 3.3684210526315788, "grad_norm": 0.6523486514014031, "learning_rate": 1.5805515034687725e-06, "loss": 0.5675, "num_tokens": 287262346.0, "step": 1408 }, { "epoch": 3.3708133971291865, "grad_norm": 0.6800989600061507, "learning_rate": 1.5762740817915783e-06, "loss": 0.3928, "num_tokens": 287435821.0, "step": 1409 }, { "epoch": 3.373205741626794, "grad_norm": 0.6085689835831343, "learning_rate": 1.5720113979002805e-06, "loss": 0.3708, "num_tokens": 287621476.0, "step": 1410 }, { "epoch": 3.375598086124402, "grad_norm": 0.7554742958605225, "learning_rate": 1.5677634678058146e-06, "loss": 0.4084, "num_tokens": 287759069.0, "step": 1411 }, { "epoch": 3.3779904306220097, "grad_norm": 0.6596430075419027, "learning_rate": 1.5635303074636979e-06, "loss": 0.5614, "num_tokens": 287942413.0, "step": 1412 }, { "epoch": 3.3803827751196174, "grad_norm": 0.6869062135938377, "learning_rate": 1.5593119327739773e-06, "loss": 0.5973, "num_tokens": 288154785.0, "step": 1413 }, { "epoch": 3.382775119617225, "grad_norm": 0.6454421371119977, "learning_rate": 1.5551083595811551e-06, "loss": 0.5079, "num_tokens": 288343799.0, "step": 1414 }, { "epoch": 3.3851674641148324, "grad_norm": 0.5860523042624308, "learning_rate": 1.5509196036741466e-06, "loss": 0.3886, "num_tokens": 288535767.0, "step": 1415 }, { "epoch": 3.38755980861244, "grad_norm": 0.6020591732561443, "learning_rate": 1.5467456807862063e-06, "loss": 0.4034, "num_tokens": 288737701.0, "step": 1416 }, { "epoch": 3.389952153110048, "grad_norm": 0.6740634893238907, "learning_rate": 1.5425866065948797e-06, "loss": 0.5953, "num_tokens": 288922136.0, "step": 1417 }, { "epoch": 3.3923444976076556, "grad_norm": 0.6082129324791387, "learning_rate": 1.5384423967219347e-06, "loss": 0.397, "num_tokens": 289111549.0, "step": 1418 }, { "epoch": 3.3947368421052633, "grad_norm": 0.6587824796591607, "learning_rate": 1.5343130667333116e-06, "loss": 0.5051, "num_tokens": 289319059.0, "step": 1419 }, { "epoch": 3.397129186602871, "grad_norm": 0.6464490795588403, "learning_rate": 1.5301986321390588e-06, "loss": 0.4862, "num_tokens": 289517343.0, "step": 1420 }, { "epoch": 3.3995215311004783, "grad_norm": 0.6097300879288532, "learning_rate": 1.5260991083932772e-06, "loss": 0.4842, "num_tokens": 289715665.0, "step": 1421 }, { "epoch": 3.401913875598086, "grad_norm": 0.6548853735664123, "learning_rate": 1.5220145108940614e-06, "loss": 0.4464, "num_tokens": 289876828.0, "step": 1422 }, { "epoch": 3.4043062200956937, "grad_norm": 0.5754235894763253, "learning_rate": 1.5179448549834427e-06, "loss": 0.3191, "num_tokens": 290072817.0, "step": 1423 }, { "epoch": 3.4066985645933014, "grad_norm": 0.6774434347134051, "learning_rate": 1.5138901559473295e-06, "loss": 0.5724, "num_tokens": 290318208.0, "step": 1424 }, { "epoch": 3.409090909090909, "grad_norm": 0.6309179267047327, "learning_rate": 1.5098504290154522e-06, "loss": 0.5446, "num_tokens": 290561443.0, "step": 1425 }, { "epoch": 3.411483253588517, "grad_norm": 0.7245178380869103, "learning_rate": 1.5058256893613052e-06, "loss": 0.529, "num_tokens": 290728218.0, "step": 1426 }, { "epoch": 3.4138755980861246, "grad_norm": 0.6793714067868013, "learning_rate": 1.501815952102086e-06, "loss": 0.4026, "num_tokens": 290879223.0, "step": 1427 }, { "epoch": 3.416267942583732, "grad_norm": 0.677049622272289, "learning_rate": 1.4978212322986494e-06, "loss": 0.4234, "num_tokens": 291044967.0, "step": 1428 }, { "epoch": 3.4186602870813396, "grad_norm": 0.5891529086179643, "learning_rate": 1.4938415449554347e-06, "loss": 0.3909, "num_tokens": 291244108.0, "step": 1429 }, { "epoch": 3.4210526315789473, "grad_norm": 0.7680086416040134, "learning_rate": 1.489876905020427e-06, "loss": 0.5968, "num_tokens": 291422309.0, "step": 1430 }, { "epoch": 3.423444976076555, "grad_norm": 0.60541747237417, "learning_rate": 1.4859273273850853e-06, "loss": 0.4644, "num_tokens": 291607211.0, "step": 1431 }, { "epoch": 3.425837320574163, "grad_norm": 0.5129408106770269, "learning_rate": 1.481992826884297e-06, "loss": 0.4663, "num_tokens": 291847315.0, "step": 1432 }, { "epoch": 3.4282296650717705, "grad_norm": 0.6418602073158258, "learning_rate": 1.4780734182963186e-06, "loss": 0.3039, "num_tokens": 292003446.0, "step": 1433 }, { "epoch": 3.430622009569378, "grad_norm": 0.5414698213649558, "learning_rate": 1.474169116342719e-06, "loss": 0.4259, "num_tokens": 292221228.0, "step": 1434 }, { "epoch": 3.4330143540669855, "grad_norm": 0.5861235141143475, "learning_rate": 1.4702799356883276e-06, "loss": 0.4508, "num_tokens": 292392595.0, "step": 1435 }, { "epoch": 3.4354066985645932, "grad_norm": 0.6492147392943652, "learning_rate": 1.4664058909411743e-06, "loss": 0.5558, "num_tokens": 292616491.0, "step": 1436 }, { "epoch": 3.437799043062201, "grad_norm": 0.5867621281367356, "learning_rate": 1.4625469966524402e-06, "loss": 0.3956, "num_tokens": 292811254.0, "step": 1437 }, { "epoch": 3.4401913875598087, "grad_norm": 0.6082196218062246, "learning_rate": 1.4587032673163986e-06, "loss": 0.3441, "num_tokens": 293001343.0, "step": 1438 }, { "epoch": 3.4425837320574164, "grad_norm": 0.6414375078998078, "learning_rate": 1.454874717370363e-06, "loss": 0.5404, "num_tokens": 293221621.0, "step": 1439 }, { "epoch": 3.444976076555024, "grad_norm": 0.613280918544221, "learning_rate": 1.4510613611946317e-06, "loss": 0.5266, "num_tokens": 293420371.0, "step": 1440 }, { "epoch": 3.4473684210526314, "grad_norm": 0.687184407411569, "learning_rate": 1.4472632131124343e-06, "loss": 0.5471, "num_tokens": 293613114.0, "step": 1441 }, { "epoch": 3.449760765550239, "grad_norm": 0.7132696555661605, "learning_rate": 1.4434802873898774e-06, "loss": 0.6045, "num_tokens": 293828237.0, "step": 1442 }, { "epoch": 3.452153110047847, "grad_norm": 0.663988889436373, "learning_rate": 1.4397125982358924e-06, "loss": 0.3951, "num_tokens": 294030397.0, "step": 1443 }, { "epoch": 3.4545454545454546, "grad_norm": 0.6111538223995584, "learning_rate": 1.435960159802179e-06, "loss": 0.5118, "num_tokens": 294266833.0, "step": 1444 }, { "epoch": 3.4569377990430623, "grad_norm": 0.7058152631528316, "learning_rate": 1.4322229861831558e-06, "loss": 0.5049, "num_tokens": 294451025.0, "step": 1445 }, { "epoch": 3.45933014354067, "grad_norm": 0.5699602285006586, "learning_rate": 1.4285010914159065e-06, "loss": 0.5684, "num_tokens": 294672913.0, "step": 1446 }, { "epoch": 3.4617224880382773, "grad_norm": 0.5623025715207065, "learning_rate": 1.4247944894801236e-06, "loss": 0.5162, "num_tokens": 294903672.0, "step": 1447 }, { "epoch": 3.464114832535885, "grad_norm": 0.712647341968248, "learning_rate": 1.4211031942980613e-06, "loss": 0.5999, "num_tokens": 295087849.0, "step": 1448 }, { "epoch": 3.4665071770334928, "grad_norm": 0.6172051693085459, "learning_rate": 1.4174272197344793e-06, "loss": 0.4612, "num_tokens": 295287798.0, "step": 1449 }, { "epoch": 3.4688995215311005, "grad_norm": 0.6311358310811482, "learning_rate": 1.4137665795965927e-06, "loss": 0.5761, "num_tokens": 295492370.0, "step": 1450 }, { "epoch": 3.471291866028708, "grad_norm": 0.7251245358729828, "learning_rate": 1.4101212876340187e-06, "loss": 0.5706, "num_tokens": 295672740.0, "step": 1451 }, { "epoch": 3.473684210526316, "grad_norm": 0.560049433607437, "learning_rate": 1.4064913575387251e-06, "loss": 0.4304, "num_tokens": 295869423.0, "step": 1452 }, { "epoch": 3.4760765550239237, "grad_norm": 0.6974882287107438, "learning_rate": 1.4028768029449824e-06, "loss": 0.5892, "num_tokens": 296062015.0, "step": 1453 }, { "epoch": 3.478468899521531, "grad_norm": 0.6545967283254652, "learning_rate": 1.399277637429306e-06, "loss": 0.5022, "num_tokens": 296274972.0, "step": 1454 }, { "epoch": 3.4808612440191387, "grad_norm": 0.8022540603329968, "learning_rate": 1.3956938745104109e-06, "loss": 0.5512, "num_tokens": 296429164.0, "step": 1455 }, { "epoch": 3.4832535885167464, "grad_norm": 0.7082878808081615, "learning_rate": 1.3921255276491586e-06, "loss": 0.5427, "num_tokens": 296614072.0, "step": 1456 }, { "epoch": 3.485645933014354, "grad_norm": 0.7641750032011092, "learning_rate": 1.3885726102485073e-06, "loss": 0.6203, "num_tokens": 296769349.0, "step": 1457 }, { "epoch": 3.488038277511962, "grad_norm": 0.7398101140526424, "learning_rate": 1.3850351356534592e-06, "loss": 0.5137, "num_tokens": 296956478.0, "step": 1458 }, { "epoch": 3.4904306220095696, "grad_norm": 0.6930674531073416, "learning_rate": 1.3815131171510138e-06, "loss": 0.7111, "num_tokens": 297202360.0, "step": 1459 }, { "epoch": 3.492822966507177, "grad_norm": 0.7104167967522225, "learning_rate": 1.3780065679701163e-06, "loss": 0.4507, "num_tokens": 297368352.0, "step": 1460 }, { "epoch": 3.4952153110047846, "grad_norm": 0.6783264118382551, "learning_rate": 1.3745155012816083e-06, "loss": 0.591, "num_tokens": 297571077.0, "step": 1461 }, { "epoch": 3.4976076555023923, "grad_norm": 0.6499466582199835, "learning_rate": 1.3710399301981774e-06, "loss": 0.3173, "num_tokens": 297713503.0, "step": 1462 }, { "epoch": 3.5, "grad_norm": 0.6482663233430564, "learning_rate": 1.3675798677743094e-06, "loss": 0.425, "num_tokens": 297877390.0, "step": 1463 }, { "epoch": 3.5023923444976077, "grad_norm": 0.7096180478132679, "learning_rate": 1.364135327006237e-06, "loss": 0.6449, "num_tokens": 298080783.0, "step": 1464 }, { "epoch": 3.5047846889952154, "grad_norm": 0.820714681833925, "learning_rate": 1.3607063208318944e-06, "loss": 0.4437, "num_tokens": 298294526.0, "step": 1465 }, { "epoch": 3.507177033492823, "grad_norm": 0.5435495568523284, "learning_rate": 1.3572928621308662e-06, "loss": 0.4597, "num_tokens": 298534343.0, "step": 1466 }, { "epoch": 3.509569377990431, "grad_norm": 0.7369001406669367, "learning_rate": 1.3538949637243382e-06, "loss": 0.5622, "num_tokens": 298709480.0, "step": 1467 }, { "epoch": 3.511961722488038, "grad_norm": 0.6616177825442939, "learning_rate": 1.3505126383750528e-06, "loss": 0.5391, "num_tokens": 298918347.0, "step": 1468 }, { "epoch": 3.514354066985646, "grad_norm": 0.5800915812997612, "learning_rate": 1.3471458987872574e-06, "loss": 0.5072, "num_tokens": 299167577.0, "step": 1469 }, { "epoch": 3.5167464114832536, "grad_norm": 0.5776807179015478, "learning_rate": 1.3437947576066593e-06, "loss": 0.3321, "num_tokens": 299349965.0, "step": 1470 }, { "epoch": 3.5191387559808613, "grad_norm": 0.5576868780501636, "learning_rate": 1.3404592274203752e-06, "loss": 0.5429, "num_tokens": 299589593.0, "step": 1471 }, { "epoch": 3.5215311004784686, "grad_norm": 0.7277519126409697, "learning_rate": 1.3371393207568894e-06, "loss": 0.5165, "num_tokens": 299747218.0, "step": 1472 }, { "epoch": 3.5239234449760763, "grad_norm": 0.6846297716168203, "learning_rate": 1.3338350500859988e-06, "loss": 0.4876, "num_tokens": 299950882.0, "step": 1473 }, { "epoch": 3.526315789473684, "grad_norm": 0.5504630676093619, "learning_rate": 1.330546427818773e-06, "loss": 0.5165, "num_tokens": 300172705.0, "step": 1474 }, { "epoch": 3.528708133971292, "grad_norm": 0.6077413153293856, "learning_rate": 1.3272734663075054e-06, "loss": 0.5361, "num_tokens": 300408847.0, "step": 1475 }, { "epoch": 3.5311004784688995, "grad_norm": 0.6151817478419886, "learning_rate": 1.324016177845665e-06, "loss": 0.5336, "num_tokens": 300627309.0, "step": 1476 }, { "epoch": 3.5334928229665072, "grad_norm": 0.6657148941500219, "learning_rate": 1.3207745746678533e-06, "loss": 0.6072, "num_tokens": 300893676.0, "step": 1477 }, { "epoch": 3.535885167464115, "grad_norm": 0.5864974741212591, "learning_rate": 1.3175486689497558e-06, "loss": 0.4759, "num_tokens": 301119831.0, "step": 1478 }, { "epoch": 3.5382775119617227, "grad_norm": 0.669995482447796, "learning_rate": 1.3143384728080966e-06, "loss": 0.397, "num_tokens": 301288802.0, "step": 1479 }, { "epoch": 3.5406698564593304, "grad_norm": 0.6229464873082609, "learning_rate": 1.3111439983005952e-06, "loss": 0.6313, "num_tokens": 301518507.0, "step": 1480 }, { "epoch": 3.5430622009569377, "grad_norm": 0.6175779598804242, "learning_rate": 1.3079652574259187e-06, "loss": 0.4716, "num_tokens": 301730836.0, "step": 1481 }, { "epoch": 3.5454545454545454, "grad_norm": 0.6557616851250915, "learning_rate": 1.3048022621236382e-06, "loss": 0.5501, "num_tokens": 301944446.0, "step": 1482 }, { "epoch": 3.547846889952153, "grad_norm": 0.6431228077639217, "learning_rate": 1.301655024274182e-06, "loss": 0.4078, "num_tokens": 302125230.0, "step": 1483 }, { "epoch": 3.550239234449761, "grad_norm": 0.6165365783638824, "learning_rate": 1.2985235556987926e-06, "loss": 0.4014, "num_tokens": 302326057.0, "step": 1484 }, { "epoch": 3.5526315789473686, "grad_norm": 0.6022448407744763, "learning_rate": 1.2954078681594845e-06, "loss": 0.372, "num_tokens": 302506769.0, "step": 1485 }, { "epoch": 3.555023923444976, "grad_norm": 0.5938105239263732, "learning_rate": 1.2923079733589928e-06, "loss": 0.5537, "num_tokens": 302720143.0, "step": 1486 }, { "epoch": 3.5574162679425836, "grad_norm": 0.6873490122710798, "learning_rate": 1.2892238829407395e-06, "loss": 0.4151, "num_tokens": 302877685.0, "step": 1487 }, { "epoch": 3.5598086124401913, "grad_norm": 0.5681553037684327, "learning_rate": 1.2861556084887797e-06, "loss": 0.355, "num_tokens": 303118737.0, "step": 1488 }, { "epoch": 3.562200956937799, "grad_norm": 0.6948437599501222, "learning_rate": 1.2831031615277648e-06, "loss": 0.5261, "num_tokens": 303302096.0, "step": 1489 }, { "epoch": 3.5645933014354068, "grad_norm": 0.6419899066739717, "learning_rate": 1.2800665535228967e-06, "loss": 0.3954, "num_tokens": 303492631.0, "step": 1490 }, { "epoch": 3.5669856459330145, "grad_norm": 0.6901526120395478, "learning_rate": 1.2770457958798858e-06, "loss": 0.4241, "num_tokens": 303694151.0, "step": 1491 }, { "epoch": 3.569377990430622, "grad_norm": 0.6328577003617731, "learning_rate": 1.2740408999449055e-06, "loss": 0.6208, "num_tokens": 303941252.0, "step": 1492 }, { "epoch": 3.57177033492823, "grad_norm": 0.621056467226815, "learning_rate": 1.2710518770045535e-06, "loss": 0.367, "num_tokens": 304122539.0, "step": 1493 }, { "epoch": 3.574162679425837, "grad_norm": 0.6484720088529732, "learning_rate": 1.2680787382858068e-06, "loss": 0.5649, "num_tokens": 304340159.0, "step": 1494 }, { "epoch": 3.576555023923445, "grad_norm": 0.6765968336575626, "learning_rate": 1.2651214949559796e-06, "loss": 0.5138, "num_tokens": 304564017.0, "step": 1495 }, { "epoch": 3.5789473684210527, "grad_norm": 0.6151693720513433, "learning_rate": 1.2621801581226832e-06, "loss": 0.477, "num_tokens": 304782069.0, "step": 1496 }, { "epoch": 3.5813397129186604, "grad_norm": 0.6951947406065941, "learning_rate": 1.2592547388337797e-06, "loss": 0.5591, "num_tokens": 304964038.0, "step": 1497 }, { "epoch": 3.583732057416268, "grad_norm": 0.7379327952571767, "learning_rate": 1.25634524807735e-06, "loss": 0.5745, "num_tokens": 305144017.0, "step": 1498 }, { "epoch": 3.5861244019138754, "grad_norm": 0.6546876395903558, "learning_rate": 1.253451696781639e-06, "loss": 0.5772, "num_tokens": 305325548.0, "step": 1499 }, { "epoch": 3.588516746411483, "grad_norm": 0.5931090788351342, "learning_rate": 1.2505740958150287e-06, "loss": 0.5098, "num_tokens": 305545495.0, "step": 1500 }, { "epoch": 3.590909090909091, "grad_norm": 0.6700019687712389, "learning_rate": 1.2477124559859837e-06, "loss": 0.6396, "num_tokens": 305763102.0, "step": 1501 }, { "epoch": 3.5933014354066986, "grad_norm": 0.528208222358471, "learning_rate": 1.2448667880430252e-06, "loss": 0.4368, "num_tokens": 306006879.0, "step": 1502 }, { "epoch": 3.5956937799043063, "grad_norm": 0.5929577605264226, "learning_rate": 1.242037102674675e-06, "loss": 0.4719, "num_tokens": 306202979.0, "step": 1503 }, { "epoch": 3.598086124401914, "grad_norm": 0.5935852989033888, "learning_rate": 1.2392234105094301e-06, "loss": 0.5019, "num_tokens": 306423290.0, "step": 1504 }, { "epoch": 3.6004784688995217, "grad_norm": 0.6272016449415893, "learning_rate": 1.236425722115711e-06, "loss": 0.6179, "num_tokens": 306674700.0, "step": 1505 }, { "epoch": 3.6028708133971294, "grad_norm": 0.6795819406741945, "learning_rate": 1.2336440480018292e-06, "loss": 0.4432, "num_tokens": 306864232.0, "step": 1506 }, { "epoch": 3.6052631578947367, "grad_norm": 0.6346248295282361, "learning_rate": 1.2308783986159451e-06, "loss": 0.6093, "num_tokens": 307116714.0, "step": 1507 }, { "epoch": 3.6076555023923444, "grad_norm": 0.5839403967153011, "learning_rate": 1.228128784346029e-06, "loss": 0.3204, "num_tokens": 307290174.0, "step": 1508 }, { "epoch": 3.610047846889952, "grad_norm": 0.5847207999870764, "learning_rate": 1.225395215519823e-06, "loss": 0.47, "num_tokens": 307496199.0, "step": 1509 }, { "epoch": 3.61244019138756, "grad_norm": 0.6125888173508722, "learning_rate": 1.2226777024047985e-06, "loss": 0.393, "num_tokens": 307673185.0, "step": 1510 }, { "epoch": 3.6148325358851676, "grad_norm": 0.6713056636085881, "learning_rate": 1.2199762552081254e-06, "loss": 0.5381, "num_tokens": 307901253.0, "step": 1511 }, { "epoch": 3.617224880382775, "grad_norm": 0.6509525755677749, "learning_rate": 1.2172908840766242e-06, "loss": 0.2484, "num_tokens": 308052165.0, "step": 1512 }, { "epoch": 3.6196172248803826, "grad_norm": 0.6804638610041888, "learning_rate": 1.2146215990967363e-06, "loss": 0.6382, "num_tokens": 308286709.0, "step": 1513 }, { "epoch": 3.6220095693779903, "grad_norm": 0.5726101216910606, "learning_rate": 1.2119684102944782e-06, "loss": 0.5203, "num_tokens": 308518482.0, "step": 1514 }, { "epoch": 3.624401913875598, "grad_norm": 0.7457351396502082, "learning_rate": 1.2093313276354139e-06, "loss": 0.5756, "num_tokens": 308701930.0, "step": 1515 }, { "epoch": 3.626794258373206, "grad_norm": 0.618254544803711, "learning_rate": 1.2067103610246046e-06, "loss": 0.5782, "num_tokens": 308920981.0, "step": 1516 }, { "epoch": 3.6291866028708135, "grad_norm": 0.6140635887344684, "learning_rate": 1.2041055203065852e-06, "loss": 0.5024, "num_tokens": 309109111.0, "step": 1517 }, { "epoch": 3.6315789473684212, "grad_norm": 0.5547321147206474, "learning_rate": 1.2015168152653159e-06, "loss": 0.3661, "num_tokens": 309314804.0, "step": 1518 }, { "epoch": 3.633971291866029, "grad_norm": 0.7611178589551898, "learning_rate": 1.198944255624152e-06, "loss": 0.622, "num_tokens": 309496156.0, "step": 1519 }, { "epoch": 3.6363636363636362, "grad_norm": 0.6207298025870296, "learning_rate": 1.1963878510458058e-06, "loss": 0.5098, "num_tokens": 309713174.0, "step": 1520 }, { "epoch": 3.638755980861244, "grad_norm": 0.5547121537829205, "learning_rate": 1.1938476111323094e-06, "loss": 0.4402, "num_tokens": 309921616.0, "step": 1521 }, { "epoch": 3.6411483253588517, "grad_norm": 0.7199324671101245, "learning_rate": 1.1913235454249807e-06, "loss": 0.3728, "num_tokens": 310061923.0, "step": 1522 }, { "epoch": 3.6435406698564594, "grad_norm": 0.5715673360964941, "learning_rate": 1.1888156634043825e-06, "loss": 0.5396, "num_tokens": 310282339.0, "step": 1523 }, { "epoch": 3.645933014354067, "grad_norm": 0.4864991883545776, "learning_rate": 1.1863239744902962e-06, "loss": 0.3361, "num_tokens": 310516447.0, "step": 1524 }, { "epoch": 3.6483253588516744, "grad_norm": 0.7242060944351417, "learning_rate": 1.1838484880416746e-06, "loss": 0.6207, "num_tokens": 310711662.0, "step": 1525 }, { "epoch": 3.650717703349282, "grad_norm": 0.6919148397543631, "learning_rate": 1.181389213356619e-06, "loss": 0.5954, "num_tokens": 310902943.0, "step": 1526 }, { "epoch": 3.65311004784689, "grad_norm": 0.7059381091150269, "learning_rate": 1.1789461596723325e-06, "loss": 0.4793, "num_tokens": 311096238.0, "step": 1527 }, { "epoch": 3.6555023923444976, "grad_norm": 0.7806938857436164, "learning_rate": 1.1765193361650964e-06, "loss": 0.6368, "num_tokens": 311260692.0, "step": 1528 }, { "epoch": 3.6578947368421053, "grad_norm": 0.621657260828602, "learning_rate": 1.1741087519502246e-06, "loss": 0.328, "num_tokens": 311444390.0, "step": 1529 }, { "epoch": 3.660287081339713, "grad_norm": 0.6199793090742324, "learning_rate": 1.171714416082041e-06, "loss": 0.4227, "num_tokens": 311632736.0, "step": 1530 }, { "epoch": 3.6626794258373208, "grad_norm": 0.7466730988608571, "learning_rate": 1.1693363375538352e-06, "loss": 0.4312, "num_tokens": 311787085.0, "step": 1531 }, { "epoch": 3.6650717703349285, "grad_norm": 0.706574319299492, "learning_rate": 1.166974525297834e-06, "loss": 0.4932, "num_tokens": 311960841.0, "step": 1532 }, { "epoch": 3.6674641148325358, "grad_norm": 0.6489663598451897, "learning_rate": 1.1646289881851686e-06, "loss": 0.4557, "num_tokens": 312140508.0, "step": 1533 }, { "epoch": 3.6698564593301435, "grad_norm": 0.5629965502266332, "learning_rate": 1.1622997350258386e-06, "loss": 0.3826, "num_tokens": 312368815.0, "step": 1534 }, { "epoch": 3.672248803827751, "grad_norm": 0.7502937179319551, "learning_rate": 1.1599867745686797e-06, "loss": 0.4459, "num_tokens": 312510042.0, "step": 1535 }, { "epoch": 3.674641148325359, "grad_norm": 0.6450025468017979, "learning_rate": 1.157690115501331e-06, "loss": 0.5155, "num_tokens": 312753628.0, "step": 1536 }, { "epoch": 3.6770334928229667, "grad_norm": 0.6516806356437004, "learning_rate": 1.1554097664502042e-06, "loss": 0.6561, "num_tokens": 313017066.0, "step": 1537 }, { "epoch": 3.679425837320574, "grad_norm": 0.5838091230332593, "learning_rate": 1.1531457359804453e-06, "loss": 0.5613, "num_tokens": 313258011.0, "step": 1538 }, { "epoch": 3.6818181818181817, "grad_norm": 0.6930340123423862, "learning_rate": 1.1508980325959124e-06, "loss": 0.5582, "num_tokens": 313462213.0, "step": 1539 }, { "epoch": 3.6842105263157894, "grad_norm": 0.6056783363354274, "learning_rate": 1.1486666647391325e-06, "loss": 0.4607, "num_tokens": 313715622.0, "step": 1540 }, { "epoch": 3.686602870813397, "grad_norm": 0.5370198813826438, "learning_rate": 1.1464516407912792e-06, "loss": 0.4582, "num_tokens": 313970687.0, "step": 1541 }, { "epoch": 3.688995215311005, "grad_norm": 0.6121865084573093, "learning_rate": 1.1442529690721341e-06, "loss": 0.6653, "num_tokens": 314228905.0, "step": 1542 }, { "epoch": 3.6913875598086126, "grad_norm": 0.6261892366272805, "learning_rate": 1.142070657840062e-06, "loss": 0.4181, "num_tokens": 314417597.0, "step": 1543 }, { "epoch": 3.6937799043062203, "grad_norm": 0.5999298045009195, "learning_rate": 1.1399047152919733e-06, "loss": 0.4485, "num_tokens": 314612517.0, "step": 1544 }, { "epoch": 3.696172248803828, "grad_norm": 0.6510228815523412, "learning_rate": 1.1377551495632985e-06, "loss": 0.4576, "num_tokens": 314810459.0, "step": 1545 }, { "epoch": 3.6985645933014353, "grad_norm": 0.7103858911919039, "learning_rate": 1.1356219687279554e-06, "loss": 0.497, "num_tokens": 314975980.0, "step": 1546 }, { "epoch": 3.700956937799043, "grad_norm": 0.6483471228722542, "learning_rate": 1.1335051807983188e-06, "loss": 0.3976, "num_tokens": 315153202.0, "step": 1547 }, { "epoch": 3.7033492822966507, "grad_norm": 0.5767066645524299, "learning_rate": 1.1314047937251894e-06, "loss": 0.4727, "num_tokens": 315378174.0, "step": 1548 }, { "epoch": 3.7057416267942584, "grad_norm": 0.6178224515003014, "learning_rate": 1.1293208153977677e-06, "loss": 0.4304, "num_tokens": 315577510.0, "step": 1549 }, { "epoch": 3.708133971291866, "grad_norm": 0.6414850819255512, "learning_rate": 1.1272532536436205e-06, "loss": 0.3087, "num_tokens": 315722493.0, "step": 1550 }, { "epoch": 3.7105263157894735, "grad_norm": 0.5916056531201479, "learning_rate": 1.1252021162286513e-06, "loss": 0.4199, "num_tokens": 315915892.0, "step": 1551 }, { "epoch": 3.712918660287081, "grad_norm": 0.638094656338588, "learning_rate": 1.1231674108570756e-06, "loss": 0.5788, "num_tokens": 316148867.0, "step": 1552 }, { "epoch": 3.715311004784689, "grad_norm": 0.7083497582287557, "learning_rate": 1.121149145171386e-06, "loss": 0.583, "num_tokens": 316320260.0, "step": 1553 }, { "epoch": 3.7177033492822966, "grad_norm": 0.650630130615892, "learning_rate": 1.1191473267523297e-06, "loss": 0.5426, "num_tokens": 316501062.0, "step": 1554 }, { "epoch": 3.7200956937799043, "grad_norm": 0.6113182049236018, "learning_rate": 1.1171619631188733e-06, "loss": 0.5281, "num_tokens": 316760839.0, "step": 1555 }, { "epoch": 3.722488038277512, "grad_norm": 0.6124521857276314, "learning_rate": 1.1151930617281814e-06, "loss": 0.4915, "num_tokens": 316978471.0, "step": 1556 }, { "epoch": 3.72488038277512, "grad_norm": 0.5855301464032153, "learning_rate": 1.1132406299755828e-06, "loss": 0.3729, "num_tokens": 317175826.0, "step": 1557 }, { "epoch": 3.7272727272727275, "grad_norm": 0.4900955113250577, "learning_rate": 1.111304675194547e-06, "loss": 0.3394, "num_tokens": 317410638.0, "step": 1558 }, { "epoch": 3.729665071770335, "grad_norm": 0.6800914502081618, "learning_rate": 1.1093852046566534e-06, "loss": 0.5626, "num_tokens": 317596615.0, "step": 1559 }, { "epoch": 3.7320574162679425, "grad_norm": 0.580145555361245, "learning_rate": 1.1074822255715664e-06, "loss": 0.3454, "num_tokens": 317798272.0, "step": 1560 }, { "epoch": 3.7344497607655502, "grad_norm": 0.586689431375617, "learning_rate": 1.1055957450870066e-06, "loss": 0.4209, "num_tokens": 318011177.0, "step": 1561 }, { "epoch": 3.736842105263158, "grad_norm": 0.5508088873557061, "learning_rate": 1.103725770288726e-06, "loss": 0.3773, "num_tokens": 318216392.0, "step": 1562 }, { "epoch": 3.7392344497607657, "grad_norm": 0.6811545208859248, "learning_rate": 1.1018723082004782e-06, "loss": 0.5659, "num_tokens": 318416875.0, "step": 1563 }, { "epoch": 3.741626794258373, "grad_norm": 0.72071013278369, "learning_rate": 1.1000353657839962e-06, "loss": 0.5061, "num_tokens": 318594496.0, "step": 1564 }, { "epoch": 3.7440191387559807, "grad_norm": 0.7606517545104856, "learning_rate": 1.0982149499389617e-06, "loss": 0.6404, "num_tokens": 318787094.0, "step": 1565 }, { "epoch": 3.7464114832535884, "grad_norm": 0.5877892564705959, "learning_rate": 1.0964110675029828e-06, "loss": 0.4668, "num_tokens": 318981017.0, "step": 1566 }, { "epoch": 3.748803827751196, "grad_norm": 0.5670748324028697, "learning_rate": 1.0946237252515671e-06, "loss": 0.4134, "num_tokens": 319161348.0, "step": 1567 }, { "epoch": 3.751196172248804, "grad_norm": 0.6047984846668623, "learning_rate": 1.092852929898095e-06, "loss": 0.5675, "num_tokens": 319404004.0, "step": 1568 }, { "epoch": 3.7535885167464116, "grad_norm": 0.774410044947522, "learning_rate": 1.0910986880937972e-06, "loss": 0.6418, "num_tokens": 319566164.0, "step": 1569 }, { "epoch": 3.7559808612440193, "grad_norm": 0.6901217036526389, "learning_rate": 1.0893610064277268e-06, "loss": 0.5677, "num_tokens": 319784161.0, "step": 1570 }, { "epoch": 3.758373205741627, "grad_norm": 0.601635981880848, "learning_rate": 1.0876398914267358e-06, "loss": 0.497, "num_tokens": 320014206.0, "step": 1571 }, { "epoch": 3.7607655502392343, "grad_norm": 0.5909062617346802, "learning_rate": 1.0859353495554517e-06, "loss": 0.5804, "num_tokens": 320272986.0, "step": 1572 }, { "epoch": 3.763157894736842, "grad_norm": 0.6020618990919157, "learning_rate": 1.0842473872162517e-06, "loss": 0.2992, "num_tokens": 320400622.0, "step": 1573 }, { "epoch": 3.7655502392344498, "grad_norm": 0.5821329545334476, "learning_rate": 1.0825760107492384e-06, "loss": 0.5617, "num_tokens": 320646872.0, "step": 1574 }, { "epoch": 3.7679425837320575, "grad_norm": 0.491139430544993, "learning_rate": 1.0809212264322185e-06, "loss": 0.4149, "num_tokens": 320920941.0, "step": 1575 }, { "epoch": 3.770334928229665, "grad_norm": 0.6017226268390595, "learning_rate": 1.0792830404806756e-06, "loss": 0.4497, "num_tokens": 321121661.0, "step": 1576 }, { "epoch": 3.7727272727272725, "grad_norm": 0.6654757497149889, "learning_rate": 1.0776614590477493e-06, "loss": 0.3742, "num_tokens": 321281938.0, "step": 1577 }, { "epoch": 3.77511961722488, "grad_norm": 0.5210291647393411, "learning_rate": 1.0760564882242122e-06, "loss": 0.3463, "num_tokens": 321483427.0, "step": 1578 }, { "epoch": 3.777511961722488, "grad_norm": 0.7436218210948605, "learning_rate": 1.0744681340384452e-06, "loss": 0.4782, "num_tokens": 321632939.0, "step": 1579 }, { "epoch": 3.7799043062200957, "grad_norm": 0.6545805936948292, "learning_rate": 1.0728964024564165e-06, "loss": 0.565, "num_tokens": 321812453.0, "step": 1580 }, { "epoch": 3.7822966507177034, "grad_norm": 0.4607908620988967, "learning_rate": 1.071341299381659e-06, "loss": 0.3077, "num_tokens": 321996465.0, "step": 1581 }, { "epoch": 3.784688995215311, "grad_norm": 0.5033378978914038, "learning_rate": 1.069802830655248e-06, "loss": 0.4661, "num_tokens": 322267207.0, "step": 1582 }, { "epoch": 3.787081339712919, "grad_norm": 0.6235361901036778, "learning_rate": 1.0682810020557762e-06, "loss": 0.5563, "num_tokens": 322476579.0, "step": 1583 }, { "epoch": 3.7894736842105265, "grad_norm": 0.5893893559866515, "learning_rate": 1.0667758192993393e-06, "loss": 0.4772, "num_tokens": 322683747.0, "step": 1584 }, { "epoch": 3.791866028708134, "grad_norm": 0.5758781713440533, "learning_rate": 1.0652872880395065e-06, "loss": 0.3445, "num_tokens": 322872058.0, "step": 1585 }, { "epoch": 3.7942583732057416, "grad_norm": 0.5335020907001772, "learning_rate": 1.063815413867305e-06, "loss": 0.3899, "num_tokens": 323127466.0, "step": 1586 }, { "epoch": 3.7966507177033493, "grad_norm": 0.6289334799509361, "learning_rate": 1.0623602023111953e-06, "loss": 0.5584, "num_tokens": 323326737.0, "step": 1587 }, { "epoch": 3.799043062200957, "grad_norm": 0.6211221471702003, "learning_rate": 1.0609216588370533e-06, "loss": 0.563, "num_tokens": 323569246.0, "step": 1588 }, { "epoch": 3.8014354066985647, "grad_norm": 0.5346284527419118, "learning_rate": 1.0594997888481473e-06, "loss": 0.4427, "num_tokens": 323829962.0, "step": 1589 }, { "epoch": 3.803827751196172, "grad_norm": 0.6105884009837914, "learning_rate": 1.058094597685119e-06, "loss": 0.4559, "num_tokens": 324028942.0, "step": 1590 }, { "epoch": 3.8062200956937797, "grad_norm": 0.5468954320316094, "learning_rate": 1.0567060906259646e-06, "loss": 0.4562, "num_tokens": 324256000.0, "step": 1591 }, { "epoch": 3.8086124401913874, "grad_norm": 0.684261812822514, "learning_rate": 1.055334272886012e-06, "loss": 0.5445, "num_tokens": 324434459.0, "step": 1592 }, { "epoch": 3.811004784688995, "grad_norm": 0.6526354730563445, "learning_rate": 1.0539791496179034e-06, "loss": 0.5026, "num_tokens": 324612716.0, "step": 1593 }, { "epoch": 3.813397129186603, "grad_norm": 0.808016517252592, "learning_rate": 1.052640725911575e-06, "loss": 0.4559, "num_tokens": 324755406.0, "step": 1594 }, { "epoch": 3.8157894736842106, "grad_norm": 0.6847153846248437, "learning_rate": 1.0513190067942405e-06, "loss": 0.5469, "num_tokens": 324945448.0, "step": 1595 }, { "epoch": 3.8181818181818183, "grad_norm": 0.5032406865472859, "learning_rate": 1.0500139972303667e-06, "loss": 0.3701, "num_tokens": 325177483.0, "step": 1596 }, { "epoch": 3.820574162679426, "grad_norm": 0.6356968965119758, "learning_rate": 1.048725702121661e-06, "loss": 0.6085, "num_tokens": 325386402.0, "step": 1597 }, { "epoch": 3.8229665071770333, "grad_norm": 0.5246325645094293, "learning_rate": 1.047454126307048e-06, "loss": 0.3515, "num_tokens": 325610116.0, "step": 1598 }, { "epoch": 3.825358851674641, "grad_norm": 0.5578264875928621, "learning_rate": 1.0461992745626539e-06, "loss": 0.385, "num_tokens": 325809437.0, "step": 1599 }, { "epoch": 3.827751196172249, "grad_norm": 0.5877031056698316, "learning_rate": 1.04496115160179e-06, "loss": 0.4319, "num_tokens": 326016149.0, "step": 1600 }, { "epoch": 3.8301435406698565, "grad_norm": 0.7046844882013413, "learning_rate": 1.0437397620749312e-06, "loss": 0.6778, "num_tokens": 326230889.0, "step": 1601 }, { "epoch": 3.8325358851674642, "grad_norm": 0.6223264724390841, "learning_rate": 1.0425351105697009e-06, "loss": 0.4787, "num_tokens": 326414427.0, "step": 1602 }, { "epoch": 3.8349282296650715, "grad_norm": 0.5851815349219407, "learning_rate": 1.0413472016108534e-06, "loss": 0.3812, "num_tokens": 326602846.0, "step": 1603 }, { "epoch": 3.8373205741626792, "grad_norm": 0.7261081213176126, "learning_rate": 1.040176039660257e-06, "loss": 0.6145, "num_tokens": 326792181.0, "step": 1604 }, { "epoch": 3.839712918660287, "grad_norm": 0.7167146631031324, "learning_rate": 1.0390216291168773e-06, "loss": 0.6231, "num_tokens": 326996430.0, "step": 1605 }, { "epoch": 3.8421052631578947, "grad_norm": 0.69933271543258, "learning_rate": 1.0378839743167598e-06, "loss": 0.4485, "num_tokens": 327146520.0, "step": 1606 }, { "epoch": 3.8444976076555024, "grad_norm": 0.6096694228023282, "learning_rate": 1.0367630795330143e-06, "loss": 0.4307, "num_tokens": 327368158.0, "step": 1607 }, { "epoch": 3.84688995215311, "grad_norm": 0.6284560072949098, "learning_rate": 1.0356589489757998e-06, "loss": 0.4751, "num_tokens": 327581854.0, "step": 1608 }, { "epoch": 3.849282296650718, "grad_norm": 0.6986455090501255, "learning_rate": 1.0345715867923069e-06, "loss": 0.5876, "num_tokens": 327758594.0, "step": 1609 }, { "epoch": 3.8516746411483256, "grad_norm": 0.6279890443357188, "learning_rate": 1.0335009970667424e-06, "loss": 0.452, "num_tokens": 328005472.0, "step": 1610 }, { "epoch": 3.854066985645933, "grad_norm": 0.5038022028163734, "learning_rate": 1.0324471838203165e-06, "loss": 0.3595, "num_tokens": 328237457.0, "step": 1611 }, { "epoch": 3.8564593301435406, "grad_norm": 0.6174001923472133, "learning_rate": 1.031410151011224e-06, "loss": 0.5795, "num_tokens": 328452327.0, "step": 1612 }, { "epoch": 3.8588516746411483, "grad_norm": 0.6609200384872597, "learning_rate": 1.0303899025346317e-06, "loss": 0.4913, "num_tokens": 328640854.0, "step": 1613 }, { "epoch": 3.861244019138756, "grad_norm": 0.6438960049381821, "learning_rate": 1.0293864422226636e-06, "loss": 0.4904, "num_tokens": 328846549.0, "step": 1614 }, { "epoch": 3.8636363636363638, "grad_norm": 0.6478362817028315, "learning_rate": 1.028399773844386e-06, "loss": 0.5504, "num_tokens": 329126737.0, "step": 1615 }, { "epoch": 3.866028708133971, "grad_norm": 0.5035472376476936, "learning_rate": 1.0274299011057936e-06, "loss": 0.4404, "num_tokens": 329338983.0, "step": 1616 }, { "epoch": 3.8684210526315788, "grad_norm": 0.7034481106903326, "learning_rate": 1.0264768276497958e-06, "loss": 0.5546, "num_tokens": 329546794.0, "step": 1617 }, { "epoch": 3.8708133971291865, "grad_norm": 0.631611590849145, "learning_rate": 1.0255405570562023e-06, "loss": 0.4272, "num_tokens": 329731632.0, "step": 1618 }, { "epoch": 3.873205741626794, "grad_norm": 0.6622179754285995, "learning_rate": 1.024621092841709e-06, "loss": 0.397, "num_tokens": 329926478.0, "step": 1619 }, { "epoch": 3.875598086124402, "grad_norm": 0.533233637209565, "learning_rate": 1.023718438459889e-06, "loss": 0.3988, "num_tokens": 330152207.0, "step": 1620 }, { "epoch": 3.8779904306220097, "grad_norm": 0.7196831072594635, "learning_rate": 1.0228325973011735e-06, "loss": 0.685, "num_tokens": 330344567.0, "step": 1621 }, { "epoch": 3.8803827751196174, "grad_norm": 0.6799866497463672, "learning_rate": 1.0219635726928432e-06, "loss": 0.5472, "num_tokens": 330553058.0, "step": 1622 }, { "epoch": 3.882775119617225, "grad_norm": 0.6452372012120255, "learning_rate": 1.0211113678990146e-06, "loss": 0.586, "num_tokens": 330770601.0, "step": 1623 }, { "epoch": 3.8851674641148324, "grad_norm": 0.7358669796048254, "learning_rate": 1.0202759861206288e-06, "loss": 0.5635, "num_tokens": 330949102.0, "step": 1624 }, { "epoch": 3.88755980861244, "grad_norm": 0.6914992060273657, "learning_rate": 1.0194574304954356e-06, "loss": 0.5109, "num_tokens": 331138604.0, "step": 1625 }, { "epoch": 3.889952153110048, "grad_norm": 0.6049799861182562, "learning_rate": 1.0186557040979883e-06, "loss": 0.4627, "num_tokens": 331323330.0, "step": 1626 }, { "epoch": 3.8923444976076556, "grad_norm": 0.5709437841289162, "learning_rate": 1.0178708099396263e-06, "loss": 0.5568, "num_tokens": 331588980.0, "step": 1627 }, { "epoch": 3.8947368421052633, "grad_norm": 0.6835800952526491, "learning_rate": 1.0171027509684653e-06, "loss": 0.4222, "num_tokens": 331795142.0, "step": 1628 }, { "epoch": 3.8971291866028706, "grad_norm": 0.6336779564251682, "learning_rate": 1.016351530069389e-06, "loss": 0.6262, "num_tokens": 332018389.0, "step": 1629 }, { "epoch": 3.8995215311004783, "grad_norm": 0.6138106246476915, "learning_rate": 1.0156171500640349e-06, "loss": 0.5274, "num_tokens": 332265698.0, "step": 1630 }, { "epoch": 3.901913875598086, "grad_norm": 0.6367531709575587, "learning_rate": 1.0148996137107847e-06, "loss": 0.4006, "num_tokens": 332416740.0, "step": 1631 }, { "epoch": 3.9043062200956937, "grad_norm": 0.6968162515814991, "learning_rate": 1.0141989237047547e-06, "loss": 0.7096, "num_tokens": 332651113.0, "step": 1632 }, { "epoch": 3.9066985645933014, "grad_norm": 0.6117743697368456, "learning_rate": 1.013515082677786e-06, "loss": 0.5262, "num_tokens": 332879068.0, "step": 1633 }, { "epoch": 3.909090909090909, "grad_norm": 0.6863788690669168, "learning_rate": 1.0128480931984307e-06, "loss": 0.5488, "num_tokens": 333095663.0, "step": 1634 }, { "epoch": 3.911483253588517, "grad_norm": 0.692371309851935, "learning_rate": 1.0121979577719493e-06, "loss": 0.6368, "num_tokens": 333279591.0, "step": 1635 }, { "epoch": 3.9138755980861246, "grad_norm": 0.5543815742967236, "learning_rate": 1.0115646788402948e-06, "loss": 0.4403, "num_tokens": 333515081.0, "step": 1636 }, { "epoch": 3.916267942583732, "grad_norm": 0.42712256054935754, "learning_rate": 1.010948258782106e-06, "loss": 0.2922, "num_tokens": 333771678.0, "step": 1637 }, { "epoch": 3.9186602870813396, "grad_norm": 0.6438096039933913, "learning_rate": 1.0103486999127003e-06, "loss": 0.4995, "num_tokens": 333951246.0, "step": 1638 }, { "epoch": 3.9210526315789473, "grad_norm": 0.7556968161493737, "learning_rate": 1.0097660044840622e-06, "loss": 0.5535, "num_tokens": 334119009.0, "step": 1639 }, { "epoch": 3.923444976076555, "grad_norm": 0.681944719510437, "learning_rate": 1.009200174684835e-06, "loss": 0.4946, "num_tokens": 334355751.0, "step": 1640 }, { "epoch": 3.925837320574163, "grad_norm": 0.6353703842610504, "learning_rate": 1.0086512126403156e-06, "loss": 0.4986, "num_tokens": 334568267.0, "step": 1641 }, { "epoch": 3.92822966507177, "grad_norm": 0.659594484115981, "learning_rate": 1.0081191204124426e-06, "loss": 0.5352, "num_tokens": 334796021.0, "step": 1642 }, { "epoch": 3.930622009569378, "grad_norm": 0.6454639384667783, "learning_rate": 1.0076038999997921e-06, "loss": 0.5735, "num_tokens": 335010213.0, "step": 1643 }, { "epoch": 3.9330143540669855, "grad_norm": 0.6662553736968893, "learning_rate": 1.007105553337567e-06, "loss": 0.5289, "num_tokens": 335231088.0, "step": 1644 }, { "epoch": 3.9354066985645932, "grad_norm": 0.7481341829695102, "learning_rate": 1.006624082297592e-06, "loss": 0.4365, "num_tokens": 335413772.0, "step": 1645 }, { "epoch": 3.937799043062201, "grad_norm": 0.5629931719433228, "learning_rate": 1.0061594886883055e-06, "loss": 0.2927, "num_tokens": 335607882.0, "step": 1646 }, { "epoch": 3.9401913875598087, "grad_norm": 0.634796826062313, "learning_rate": 1.005711774254753e-06, "loss": 0.4054, "num_tokens": 335767032.0, "step": 1647 }, { "epoch": 3.9425837320574164, "grad_norm": 0.6001806697380168, "learning_rate": 1.0052809406785818e-06, "loss": 0.4927, "num_tokens": 335966589.0, "step": 1648 }, { "epoch": 3.944976076555024, "grad_norm": 0.6228056915445697, "learning_rate": 1.0048669895780313e-06, "loss": 0.4874, "num_tokens": 336173798.0, "step": 1649 }, { "epoch": 3.9473684210526314, "grad_norm": 0.6936880601170302, "learning_rate": 1.0044699225079318e-06, "loss": 0.2793, "num_tokens": 336328091.0, "step": 1650 }, { "epoch": 3.949760765550239, "grad_norm": 0.7405485802310904, "learning_rate": 1.0040897409596937e-06, "loss": 0.5016, "num_tokens": 336476142.0, "step": 1651 }, { "epoch": 3.952153110047847, "grad_norm": 0.6722368227288898, "learning_rate": 1.0037264463613057e-06, "loss": 0.558, "num_tokens": 336689820.0, "step": 1652 }, { "epoch": 3.9545454545454546, "grad_norm": 0.6758549587597688, "learning_rate": 1.0033800400773267e-06, "loss": 0.509, "num_tokens": 336926238.0, "step": 1653 }, { "epoch": 3.9569377990430623, "grad_norm": 0.5580208536403065, "learning_rate": 1.003050523408884e-06, "loss": 0.4084, "num_tokens": 337164449.0, "step": 1654 }, { "epoch": 3.9593301435406696, "grad_norm": 0.628066217543992, "learning_rate": 1.0027378975936637e-06, "loss": 0.38, "num_tokens": 337359565.0, "step": 1655 }, { "epoch": 3.9617224880382773, "grad_norm": 0.6580546118467593, "learning_rate": 1.0024421638059106e-06, "loss": 0.5665, "num_tokens": 337600405.0, "step": 1656 }, { "epoch": 3.964114832535885, "grad_norm": 0.6799976508185673, "learning_rate": 1.0021633231564209e-06, "loss": 0.5376, "num_tokens": 337851152.0, "step": 1657 }, { "epoch": 3.9665071770334928, "grad_norm": 0.6386070227564089, "learning_rate": 1.00190137669254e-06, "loss": 0.4064, "num_tokens": 338007553.0, "step": 1658 }, { "epoch": 3.9688995215311005, "grad_norm": 0.671364821882183, "learning_rate": 1.0016563253981562e-06, "loss": 0.5927, "num_tokens": 338245439.0, "step": 1659 }, { "epoch": 3.971291866028708, "grad_norm": 0.6417827038307478, "learning_rate": 1.0014281701936995e-06, "loss": 0.5757, "num_tokens": 338450116.0, "step": 1660 }, { "epoch": 3.973684210526316, "grad_norm": 0.6704182296093534, "learning_rate": 1.001216911936137e-06, "loss": 0.601, "num_tokens": 338673938.0, "step": 1661 }, { "epoch": 3.9760765550239237, "grad_norm": 0.7305158742473202, "learning_rate": 1.0010225514189683e-06, "loss": 0.599, "num_tokens": 338866942.0, "step": 1662 }, { "epoch": 3.9784688995215314, "grad_norm": 0.6042213634249859, "learning_rate": 1.0008450893722257e-06, "loss": 0.4336, "num_tokens": 339041961.0, "step": 1663 }, { "epoch": 3.9808612440191387, "grad_norm": 0.7572508371907327, "learning_rate": 1.000684526462469e-06, "loss": 0.6808, "num_tokens": 339208183.0, "step": 1664 }, { "epoch": 3.9832535885167464, "grad_norm": 0.5604458795361834, "learning_rate": 1.0005408632927834e-06, "loss": 0.3913, "num_tokens": 339420138.0, "step": 1665 }, { "epoch": 3.985645933014354, "grad_norm": 0.5880986784981255, "learning_rate": 1.0004141004027772e-06, "loss": 0.6053, "num_tokens": 339646260.0, "step": 1666 }, { "epoch": 3.988038277511962, "grad_norm": 0.6648043812580797, "learning_rate": 1.0003042382685818e-06, "loss": 0.5238, "num_tokens": 339835650.0, "step": 1667 }, { "epoch": 3.990430622009569, "grad_norm": 0.6181701791737245, "learning_rate": 1.0002112773028455e-06, "loss": 0.506, "num_tokens": 340078350.0, "step": 1668 }, { "epoch": 3.992822966507177, "grad_norm": 0.5772600494577129, "learning_rate": 1.0001352178547368e-06, "loss": 0.4216, "num_tokens": 340306707.0, "step": 1669 }, { "epoch": 3.9952153110047846, "grad_norm": 0.735348445818812, "learning_rate": 1.0000760602099404e-06, "loss": 0.5963, "num_tokens": 340485329.0, "step": 1670 }, { "epoch": 3.9976076555023923, "grad_norm": 0.5903693208196457, "learning_rate": 1.0000338045906564e-06, "loss": 0.5158, "num_tokens": 340697021.0, "step": 1671 }, { "epoch": 4.0, "grad_norm": 0.4716699799293331, "learning_rate": 1.0000084511556e-06, "loss": 0.3813, "num_tokens": 340953374.0, "step": 1672 }, { "epoch": 4.0, "eval_loss": 0.4249289631843567, "eval_num_tokens": 340953374.0, "eval_runtime": 109.3301, "eval_samples_per_second": 27.184, "eval_steps_per_second": 3.403, "step": 1672 }, { "epoch": 4.0, "step": 1672, "total_flos": 1933127778631680.0, "train_loss": 0.6745891945267265, "train_runtime": 14540.5824, "train_samples_per_second": 7.357, "train_steps_per_second": 0.115 } ], "logging_steps": 1, "max_steps": 1672, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1933127778631680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }