{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2814258911819887, "eval_steps": 800, "global_step": 30400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004803001876172608, "grad_norm": 3.330345630645752, "learning_rate": 0.0001113525, "loss": 4.3152, "step": 64 }, { "epoch": 0.009606003752345216, "grad_norm": 2.5987207889556885, "learning_rate": 0.00022447249999999998, "loss": 3.5908, "step": 128 }, { "epoch": 0.014409005628517824, "grad_norm": 10.176867485046387, "learning_rate": 0.00033759249999999996, "loss": 3.3927, "step": 192 }, { "epoch": 0.01921200750469043, "grad_norm": 6.534875869750977, "learning_rate": 0.00045071249999999993, "loss": 3.3333, "step": 256 }, { "epoch": 0.02401500938086304, "grad_norm": 6.088456630706787, "learning_rate": 0.0005638325, "loss": 3.2928, "step": 320 }, { "epoch": 0.028818011257035647, "grad_norm": 6.937580108642578, "learning_rate": 0.0006769524999999999, "loss": 3.2901, "step": 384 }, { "epoch": 0.033621013133208255, "grad_norm": 6.744969844818115, "learning_rate": 0.0007900724999999999, "loss": 3.2589, "step": 448 }, { "epoch": 0.03842401500938086, "grad_norm": 2.2261719703674316, "learning_rate": 0.0009031925, "loss": 3.231, "step": 512 }, { "epoch": 0.04322701688555347, "grad_norm": 1.030404806137085, "learning_rate": 0.0010163124999999999, "loss": 3.2278, "step": 576 }, { "epoch": 0.04803001876172608, "grad_norm": 1.036293387413025, "learning_rate": 0.0011294324999999998, "loss": 3.272, "step": 640 }, { "epoch": 0.05283302063789869, "grad_norm": 1.1835274696350098, "learning_rate": 0.0012425525, "loss": 3.256, "step": 704 }, { "epoch": 0.057636022514071295, "grad_norm": 0.8378634452819824, "learning_rate": 0.0013556724999999998, "loss": 3.27, "step": 768 }, { "epoch": 0.0624390243902439, "grad_norm": 0.7602612972259521, "learning_rate": 0.0014687925, "loss": 3.2261, "step": 832 }, { "epoch": 0.06724202626641651, "grad_norm": 0.6387987732887268, "learning_rate": 0.0015819124999999997, "loss": 3.2153, "step": 896 }, { "epoch": 0.07204502814258912, "grad_norm": 0.4422095715999603, "learning_rate": 0.0016950325, "loss": 3.1975, "step": 960 }, { "epoch": 0.07684803001876173, "grad_norm": 0.39002183079719543, "learning_rate": 0.0018081524999999999, "loss": 3.1983, "step": 1024 }, { "epoch": 0.08165103189493433, "grad_norm": 5.926162242889404, "learning_rate": 0.0019212724999999996, "loss": 3.1763, "step": 1088 }, { "epoch": 0.08645403377110694, "grad_norm": 0.4173193871974945, "learning_rate": 0.0020343924999999996, "loss": 3.1833, "step": 1152 }, { "epoch": 0.09125703564727955, "grad_norm": 0.4136042594909668, "learning_rate": 0.0021475125, "loss": 3.1846, "step": 1216 }, { "epoch": 0.09606003752345216, "grad_norm": 0.39301183819770813, "learning_rate": 0.0022606324999999996, "loss": 3.1739, "step": 1280 }, { "epoch": 0.10086303939962477, "grad_norm": 0.4910842776298523, "learning_rate": 0.0023737525, "loss": 3.1614, "step": 1344 }, { "epoch": 0.10566604127579737, "grad_norm": 0.4039038121700287, "learning_rate": 0.0024868725, "loss": 3.1577, "step": 1408 }, { "epoch": 0.11046904315196998, "grad_norm": 0.3286585211753845, "learning_rate": 0.0025999925, "loss": 3.1458, "step": 1472 }, { "epoch": 0.11527204502814259, "grad_norm": 0.44095373153686523, "learning_rate": 0.0027131125, "loss": 3.155, "step": 1536 }, { "epoch": 0.1200750469043152, "grad_norm": 0.40613290667533875, "learning_rate": 0.0028262325, "loss": 3.1469, "step": 1600 }, { "epoch": 0.1248780487804878, "grad_norm": 0.4613141417503357, "learning_rate": 0.002828, "loss": 3.1392, "step": 1664 }, { "epoch": 0.1296810506566604, "grad_norm": 0.3758493661880493, "learning_rate": 0.002828, "loss": 3.1298, "step": 1728 }, { "epoch": 0.13448405253283302, "grad_norm": 0.32609787583351135, "learning_rate": 0.002828, "loss": 3.123, "step": 1792 }, { "epoch": 0.13928705440900563, "grad_norm": 0.4221761226654053, "learning_rate": 0.002828, "loss": 3.1076, "step": 1856 }, { "epoch": 0.14409005628517824, "grad_norm": 0.4372267425060272, "learning_rate": 0.002828, "loss": 3.098, "step": 1920 }, { "epoch": 0.14889305816135084, "grad_norm": 0.36804404854774475, "learning_rate": 0.002828, "loss": 3.0952, "step": 1984 }, { "epoch": 0.15369606003752345, "grad_norm": 0.314120888710022, "learning_rate": 0.002828, "loss": 3.0751, "step": 2048 }, { "epoch": 0.15849906191369606, "grad_norm": 0.3158409297466278, "learning_rate": 0.002828, "loss": 3.0574, "step": 2112 }, { "epoch": 0.16330206378986867, "grad_norm": 0.35668376088142395, "learning_rate": 0.002828, "loss": 3.0598, "step": 2176 }, { "epoch": 0.16810506566604128, "grad_norm": 0.3429064452648163, "learning_rate": 0.002828, "loss": 3.0554, "step": 2240 }, { "epoch": 0.17290806754221388, "grad_norm": 0.37981563806533813, "learning_rate": 0.002828, "loss": 3.0439, "step": 2304 }, { "epoch": 0.1777110694183865, "grad_norm": 0.45046043395996094, "learning_rate": 0.002828, "loss": 3.034, "step": 2368 }, { "epoch": 0.1825140712945591, "grad_norm": 0.30424681305885315, "learning_rate": 0.002828, "loss": 3.0408, "step": 2432 }, { "epoch": 0.1873170731707317, "grad_norm": 0.4374525845050812, "learning_rate": 0.002828, "loss": 3.0289, "step": 2496 }, { "epoch": 0.19212007504690432, "grad_norm": 0.4312361776828766, "learning_rate": 0.002828, "loss": 3.0252, "step": 2560 }, { "epoch": 0.19692307692307692, "grad_norm": 0.33109021186828613, "learning_rate": 0.002828, "loss": 3.0094, "step": 2624 }, { "epoch": 0.20172607879924953, "grad_norm": 0.4393901228904724, "learning_rate": 0.002828, "loss": 3.0021, "step": 2688 }, { "epoch": 0.20652908067542214, "grad_norm": 0.44241341948509216, "learning_rate": 0.002828, "loss": 3.0005, "step": 2752 }, { "epoch": 0.21133208255159475, "grad_norm": 0.36241745948791504, "learning_rate": 0.002828, "loss": 2.9939, "step": 2816 }, { "epoch": 0.21613508442776735, "grad_norm": 0.40780672430992126, "learning_rate": 0.002828, "loss": 2.9788, "step": 2880 }, { "epoch": 0.22093808630393996, "grad_norm": 0.3944590389728546, "learning_rate": 0.002828, "loss": 2.9854, "step": 2944 }, { "epoch": 0.22574108818011257, "grad_norm": 0.40449267625808716, "learning_rate": 0.002828, "loss": 2.9819, "step": 3008 }, { "epoch": 0.23054409005628518, "grad_norm": 0.37247487902641296, "learning_rate": 0.002828, "loss": 2.9827, "step": 3072 }, { "epoch": 0.23534709193245779, "grad_norm": 0.3732891082763672, "learning_rate": 0.002828, "loss": 2.9714, "step": 3136 }, { "epoch": 0.2401500938086304, "grad_norm": 0.3168690800666809, "learning_rate": 0.002828, "loss": 2.9649, "step": 3200 }, { "epoch": 0.244953095684803, "grad_norm": 0.32185083627700806, "learning_rate": 0.002828, "loss": 2.9607, "step": 3264 }, { "epoch": 0.2497560975609756, "grad_norm": 0.3293335437774658, "learning_rate": 0.002828, "loss": 2.9464, "step": 3328 }, { "epoch": 0.2545590994371482, "grad_norm": 0.39153945446014404, "learning_rate": 0.002828, "loss": 2.9513, "step": 3392 }, { "epoch": 0.2593621013133208, "grad_norm": 0.36884990334510803, "learning_rate": 0.002828, "loss": 2.9418, "step": 3456 }, { "epoch": 0.26416510318949343, "grad_norm": 0.39196011424064636, "learning_rate": 0.002828, "loss": 2.9407, "step": 3520 }, { "epoch": 0.26896810506566604, "grad_norm": 0.36011603474617004, "learning_rate": 0.002828, "loss": 2.9461, "step": 3584 }, { "epoch": 0.27377110694183865, "grad_norm": 0.3608081638813019, "learning_rate": 0.002828, "loss": 2.937, "step": 3648 }, { "epoch": 0.27857410881801126, "grad_norm": 0.3833774924278259, "learning_rate": 0.002828, "loss": 2.9254, "step": 3712 }, { "epoch": 0.28337711069418386, "grad_norm": 0.35225459933280945, "learning_rate": 0.002828, "loss": 2.9165, "step": 3776 }, { "epoch": 0.2881801125703565, "grad_norm": 0.39832860231399536, "learning_rate": 0.002828, "loss": 2.9259, "step": 3840 }, { "epoch": 0.2929831144465291, "grad_norm": 0.36834558844566345, "learning_rate": 0.002828, "loss": 2.9186, "step": 3904 }, { "epoch": 0.2977861163227017, "grad_norm": 0.3877101540565491, "learning_rate": 0.002828, "loss": 2.9107, "step": 3968 }, { "epoch": 0.3025891181988743, "grad_norm": 0.40037983655929565, "learning_rate": 0.002828, "loss": 2.9086, "step": 4032 }, { "epoch": 0.3073921200750469, "grad_norm": 0.35432353615760803, "learning_rate": 0.002828, "loss": 2.9039, "step": 4096 }, { "epoch": 0.3121951219512195, "grad_norm": 0.3740752935409546, "learning_rate": 0.002828, "loss": 2.8973, "step": 4160 }, { "epoch": 0.3169981238273921, "grad_norm": 0.3972289264202118, "learning_rate": 0.002828, "loss": 2.8868, "step": 4224 }, { "epoch": 0.3218011257035647, "grad_norm": 0.3818065822124481, "learning_rate": 0.002828, "loss": 2.8916, "step": 4288 }, { "epoch": 0.32660412757973734, "grad_norm": 0.31802886724472046, "learning_rate": 0.002828, "loss": 2.895, "step": 4352 }, { "epoch": 0.33140712945590994, "grad_norm": 0.3920498192310333, "learning_rate": 0.002828, "loss": 2.896, "step": 4416 }, { "epoch": 0.33621013133208255, "grad_norm": 0.42001602053642273, "learning_rate": 0.002828, "loss": 2.8757, "step": 4480 }, { "epoch": 0.34101313320825516, "grad_norm": 0.38037222623825073, "learning_rate": 0.002828, "loss": 2.8812, "step": 4544 }, { "epoch": 0.34581613508442777, "grad_norm": 0.6402748823165894, "learning_rate": 0.002828, "loss": 2.8741, "step": 4608 }, { "epoch": 0.3506191369606004, "grad_norm": 0.3265625536441803, "learning_rate": 0.002828, "loss": 2.8659, "step": 4672 }, { "epoch": 0.355422138836773, "grad_norm": 0.3389698565006256, "learning_rate": 0.002828, "loss": 2.863, "step": 4736 }, { "epoch": 0.3602251407129456, "grad_norm": 0.34922096133232117, "learning_rate": 0.002828, "loss": 2.8555, "step": 4800 }, { "epoch": 0.3650281425891182, "grad_norm": 0.370980441570282, "learning_rate": 0.002828, "loss": 2.8624, "step": 4864 }, { "epoch": 0.3698311444652908, "grad_norm": 0.3553221821784973, "learning_rate": 0.002828, "loss": 2.8573, "step": 4928 }, { "epoch": 0.3746341463414634, "grad_norm": 0.36796537041664124, "learning_rate": 0.002828, "loss": 2.8567, "step": 4992 }, { "epoch": 0.379437148217636, "grad_norm": 0.3615240752696991, "learning_rate": 0.002828, "loss": 2.8444, "step": 5056 }, { "epoch": 0.38424015009380863, "grad_norm": 0.4196101427078247, "learning_rate": 0.002828, "loss": 2.845, "step": 5120 }, { "epoch": 0.38904315196998124, "grad_norm": 0.334185928106308, "learning_rate": 0.002828, "loss": 2.8376, "step": 5184 }, { "epoch": 0.39384615384615385, "grad_norm": 0.30093756318092346, "learning_rate": 0.002828, "loss": 2.8302, "step": 5248 }, { "epoch": 0.39864915572232645, "grad_norm": 0.41615140438079834, "learning_rate": 0.002828, "loss": 2.8365, "step": 5312 }, { "epoch": 0.40345215759849906, "grad_norm": 0.38547712564468384, "learning_rate": 0.002828, "loss": 2.833, "step": 5376 }, { "epoch": 0.40825515947467167, "grad_norm": 0.336453378200531, "learning_rate": 0.002828, "loss": 2.8289, "step": 5440 }, { "epoch": 0.4130581613508443, "grad_norm": 0.33043336868286133, "learning_rate": 0.002828, "loss": 2.8154, "step": 5504 }, { "epoch": 0.4178611632270169, "grad_norm": 0.33151519298553467, "learning_rate": 0.002828, "loss": 2.8267, "step": 5568 }, { "epoch": 0.4226641651031895, "grad_norm": 0.29418498277664185, "learning_rate": 0.002828, "loss": 2.8167, "step": 5632 }, { "epoch": 0.4274671669793621, "grad_norm": 0.3507523536682129, "learning_rate": 0.002828, "loss": 2.8227, "step": 5696 }, { "epoch": 0.4322701688555347, "grad_norm": 0.36976736783981323, "learning_rate": 0.002828, "loss": 2.8087, "step": 5760 }, { "epoch": 0.4370731707317073, "grad_norm": 0.4142448604106903, "learning_rate": 0.002828, "loss": 2.8191, "step": 5824 }, { "epoch": 0.4418761726078799, "grad_norm": 0.3893688917160034, "learning_rate": 0.002828, "loss": 2.8032, "step": 5888 }, { "epoch": 0.44667917448405253, "grad_norm": 0.3025995194911957, "learning_rate": 0.002828, "loss": 2.8049, "step": 5952 }, { "epoch": 0.45148217636022514, "grad_norm": 0.3676198422908783, "learning_rate": 0.002828, "loss": 2.7976, "step": 6016 }, { "epoch": 0.45628517823639775, "grad_norm": 0.39022454619407654, "learning_rate": 0.002828, "loss": 2.796, "step": 6080 }, { "epoch": 0.46108818011257036, "grad_norm": 0.38986560702323914, "learning_rate": 0.002828, "loss": 2.791, "step": 6144 }, { "epoch": 0.46589118198874296, "grad_norm": 0.35879769921302795, "learning_rate": 0.002828, "loss": 2.7949, "step": 6208 }, { "epoch": 0.47069418386491557, "grad_norm": 0.44419315457344055, "learning_rate": 0.002828, "loss": 2.7862, "step": 6272 }, { "epoch": 0.4754971857410882, "grad_norm": 0.30884304642677307, "learning_rate": 0.002828, "loss": 2.7864, "step": 6336 }, { "epoch": 0.4803001876172608, "grad_norm": 0.542960524559021, "learning_rate": 0.002828, "loss": 2.7842, "step": 6400 }, { "epoch": 0.4851031894934334, "grad_norm": 0.39032405614852905, "learning_rate": 0.002828, "loss": 2.7798, "step": 6464 }, { "epoch": 0.489906191369606, "grad_norm": 0.3760650157928467, "learning_rate": 0.002828, "loss": 2.78, "step": 6528 }, { "epoch": 0.4947091932457786, "grad_norm": 0.33309632539749146, "learning_rate": 0.002828, "loss": 2.7741, "step": 6592 }, { "epoch": 0.4995121951219512, "grad_norm": 0.37640711665153503, "learning_rate": 0.002828, "loss": 2.7795, "step": 6656 }, { "epoch": 0.5043151969981239, "grad_norm": 0.36830273270606995, "learning_rate": 0.002828, "loss": 2.7596, "step": 6720 }, { "epoch": 0.5091181988742964, "grad_norm": 0.3751394748687744, "learning_rate": 0.002828, "loss": 2.761, "step": 6784 }, { "epoch": 0.5139212007504691, "grad_norm": 0.3472868800163269, "learning_rate": 0.002828, "loss": 2.7567, "step": 6848 }, { "epoch": 0.5187242026266417, "grad_norm": 0.3749905526638031, "learning_rate": 0.002828, "loss": 2.7654, "step": 6912 }, { "epoch": 0.5235272045028143, "grad_norm": 0.4672335982322693, "learning_rate": 0.002828, "loss": 2.7467, "step": 6976 }, { "epoch": 0.5283302063789869, "grad_norm": 0.30083194375038147, "learning_rate": 0.002828, "loss": 2.7596, "step": 7040 }, { "epoch": 0.5331332082551595, "grad_norm": 0.34232673048973083, "learning_rate": 0.002828, "loss": 2.7425, "step": 7104 }, { "epoch": 0.5379362101313321, "grad_norm": 0.42222973704338074, "learning_rate": 0.002828, "loss": 2.7486, "step": 7168 }, { "epoch": 0.5427392120075047, "grad_norm": 0.36008650064468384, "learning_rate": 0.002828, "loss": 2.7451, "step": 7232 }, { "epoch": 0.5475422138836773, "grad_norm": 0.34359127283096313, "learning_rate": 0.002828, "loss": 2.734, "step": 7296 }, { "epoch": 0.55234521575985, "grad_norm": 0.3953745663166046, "learning_rate": 0.002828, "loss": 2.7397, "step": 7360 }, { "epoch": 0.5571482176360225, "grad_norm": 0.36703094840049744, "learning_rate": 0.002828, "loss": 2.7313, "step": 7424 }, { "epoch": 0.5619512195121952, "grad_norm": 0.31787919998168945, "learning_rate": 0.002828, "loss": 2.7363, "step": 7488 }, { "epoch": 0.5667542213883677, "grad_norm": 0.31179967522621155, "learning_rate": 0.002828, "loss": 2.7236, "step": 7552 }, { "epoch": 0.5715572232645404, "grad_norm": 0.3990299105644226, "learning_rate": 0.002828, "loss": 2.7191, "step": 7616 }, { "epoch": 0.576360225140713, "grad_norm": 0.3776848018169403, "learning_rate": 0.002828, "loss": 2.7244, "step": 7680 }, { "epoch": 0.5811632270168856, "grad_norm": 0.36117562651634216, "learning_rate": 0.002828, "loss": 2.7131, "step": 7744 }, { "epoch": 0.5859662288930582, "grad_norm": 0.3219313323497772, "learning_rate": 0.002828, "loss": 2.7202, "step": 7808 }, { "epoch": 0.5907692307692308, "grad_norm": 0.4501495957374573, "learning_rate": 0.002828, "loss": 2.7115, "step": 7872 }, { "epoch": 0.5955722326454034, "grad_norm": 0.3939913809299469, "learning_rate": 0.002828, "loss": 2.7076, "step": 7936 }, { "epoch": 0.600375234521576, "grad_norm": 0.3244933485984802, "learning_rate": 0.002828, "loss": 2.7047, "step": 8000 }, { "epoch": 0.6051782363977486, "grad_norm": 0.3094891607761383, "learning_rate": 0.002828, "loss": 2.698, "step": 8064 }, { "epoch": 0.6099812382739213, "grad_norm": 0.3525580167770386, "learning_rate": 0.002828, "loss": 2.7056, "step": 8128 }, { "epoch": 0.6147842401500938, "grad_norm": 0.3058718144893646, "learning_rate": 0.002828, "loss": 2.6937, "step": 8192 }, { "epoch": 0.6195872420262665, "grad_norm": 0.31864726543426514, "learning_rate": 0.002828, "loss": 2.6935, "step": 8256 }, { "epoch": 0.624390243902439, "grad_norm": 0.3197256028652191, "learning_rate": 0.002828, "loss": 2.6981, "step": 8320 }, { "epoch": 0.6291932457786117, "grad_norm": 0.30954182147979736, "learning_rate": 0.002828, "loss": 2.705, "step": 8384 }, { "epoch": 0.6339962476547842, "grad_norm": 0.4144911468029022, "learning_rate": 0.002828, "loss": 2.6832, "step": 8448 }, { "epoch": 0.6387992495309569, "grad_norm": 0.34720951318740845, "learning_rate": 0.002828, "loss": 2.6858, "step": 8512 }, { "epoch": 0.6436022514071295, "grad_norm": 0.30545172095298767, "learning_rate": 0.002828, "loss": 2.6758, "step": 8576 }, { "epoch": 0.6484052532833021, "grad_norm": 0.3341416120529175, "learning_rate": 0.002828, "loss": 2.6673, "step": 8640 }, { "epoch": 0.6532082551594747, "grad_norm": 0.5191973447799683, "learning_rate": 0.002828, "loss": 2.6798, "step": 8704 }, { "epoch": 0.6580112570356473, "grad_norm": 0.44382575154304504, "learning_rate": 0.002828, "loss": 2.683, "step": 8768 }, { "epoch": 0.6628142589118199, "grad_norm": 0.45676809549331665, "learning_rate": 0.002828, "loss": 2.6731, "step": 8832 }, { "epoch": 0.6676172607879926, "grad_norm": 0.3542475700378418, "learning_rate": 0.002828, "loss": 2.6813, "step": 8896 }, { "epoch": 0.6724202626641651, "grad_norm": 0.3976110517978668, "learning_rate": 0.002828, "loss": 2.6714, "step": 8960 }, { "epoch": 0.6772232645403378, "grad_norm": 0.37194061279296875, "learning_rate": 0.002828, "loss": 2.6646, "step": 9024 }, { "epoch": 0.6820262664165103, "grad_norm": 0.4080849289894104, "learning_rate": 0.002828, "loss": 2.6638, "step": 9088 }, { "epoch": 0.686829268292683, "grad_norm": 0.3275296986103058, "learning_rate": 0.002828, "loss": 2.6643, "step": 9152 }, { "epoch": 0.6916322701688555, "grad_norm": 0.4300732910633087, "learning_rate": 0.002828, "loss": 2.6545, "step": 9216 }, { "epoch": 0.6964352720450282, "grad_norm": 0.528816282749176, "learning_rate": 0.002828, "loss": 2.6639, "step": 9280 }, { "epoch": 0.7012382739212007, "grad_norm": 0.39729437232017517, "learning_rate": 0.002828, "loss": 2.6669, "step": 9344 }, { "epoch": 0.7060412757973734, "grad_norm": 0.36177024245262146, "learning_rate": 0.002828, "loss": 2.6429, "step": 9408 }, { "epoch": 0.710844277673546, "grad_norm": 0.3488599956035614, "learning_rate": 0.002828, "loss": 2.6409, "step": 9472 }, { "epoch": 0.7156472795497186, "grad_norm": 0.361208438873291, "learning_rate": 0.002828, "loss": 2.6354, "step": 9536 }, { "epoch": 0.7204502814258912, "grad_norm": 0.3307696282863617, "learning_rate": 0.002828, "loss": 2.6398, "step": 9600 }, { "epoch": 0.7252532833020638, "grad_norm": 0.47409588098526, "learning_rate": 0.002828, "loss": 2.6899, "step": 9664 }, { "epoch": 0.7300562851782364, "grad_norm": 0.43482983112335205, "learning_rate": 0.002828, "loss": 2.6675, "step": 9728 }, { "epoch": 0.7348592870544091, "grad_norm": 0.43177512288093567, "learning_rate": 0.002828, "loss": 2.6703, "step": 9792 }, { "epoch": 0.7396622889305816, "grad_norm": 0.5830815434455872, "learning_rate": 0.002828, "loss": 2.6698, "step": 9856 }, { "epoch": 0.7444652908067543, "grad_norm": 0.42559024691581726, "learning_rate": 0.002828, "loss": 2.6687, "step": 9920 }, { "epoch": 0.7492682926829268, "grad_norm": 0.36572182178497314, "learning_rate": 0.002828, "loss": 2.6602, "step": 9984 }, { "epoch": 0.7540712945590995, "grad_norm": 0.42863738536834717, "learning_rate": 0.002828, "loss": 2.6684, "step": 10048 }, { "epoch": 0.758874296435272, "grad_norm": 0.34681934118270874, "learning_rate": 0.002828, "loss": 2.6618, "step": 10112 }, { "epoch": 0.7636772983114447, "grad_norm": 0.40332967042922974, "learning_rate": 0.002828, "loss": 2.6523, "step": 10176 }, { "epoch": 0.7684803001876173, "grad_norm": 0.47137463092803955, "learning_rate": 0.002828, "loss": 2.6543, "step": 10240 }, { "epoch": 0.7732833020637899, "grad_norm": 0.3324384093284607, "learning_rate": 0.002828, "loss": 2.6444, "step": 10304 }, { "epoch": 0.7780863039399625, "grad_norm": 0.3714103698730469, "learning_rate": 0.002828, "loss": 2.6466, "step": 10368 }, { "epoch": 0.7828893058161351, "grad_norm": 0.3684547543525696, "learning_rate": 0.002828, "loss": 2.6497, "step": 10432 }, { "epoch": 0.7876923076923077, "grad_norm": 0.3580617606639862, "learning_rate": 0.002828, "loss": 2.6428, "step": 10496 }, { "epoch": 0.7924953095684804, "grad_norm": 0.4132176339626312, "learning_rate": 0.002828, "loss": 2.6407, "step": 10560 }, { "epoch": 0.7972983114446529, "grad_norm": 0.4079800546169281, "learning_rate": 0.002828, "loss": 2.6374, "step": 10624 }, { "epoch": 0.8021013133208256, "grad_norm": 0.40170854330062866, "learning_rate": 0.002828, "loss": 2.6319, "step": 10688 }, { "epoch": 0.8069043151969981, "grad_norm": 0.4748755097389221, "learning_rate": 0.002828, "loss": 2.6489, "step": 10752 }, { "epoch": 0.8117073170731708, "grad_norm": 0.3806183338165283, "learning_rate": 0.002828, "loss": 2.6363, "step": 10816 }, { "epoch": 0.8165103189493433, "grad_norm": 0.32777532935142517, "learning_rate": 0.002828, "loss": 2.6386, "step": 10880 }, { "epoch": 0.821313320825516, "grad_norm": 0.4884773790836334, "learning_rate": 0.002828, "loss": 2.6293, "step": 10944 }, { "epoch": 0.8261163227016886, "grad_norm": 0.43175649642944336, "learning_rate": 0.002828, "loss": 2.6351, "step": 11008 }, { "epoch": 0.8309193245778612, "grad_norm": 0.44375500082969666, "learning_rate": 0.002828, "loss": 2.6272, "step": 11072 }, { "epoch": 0.8357223264540338, "grad_norm": 0.36503469944000244, "learning_rate": 0.002828, "loss": 2.628, "step": 11136 }, { "epoch": 0.8405253283302064, "grad_norm": 0.3493196368217468, "learning_rate": 0.002828, "loss": 2.6238, "step": 11200 }, { "epoch": 0.845328330206379, "grad_norm": 0.3593812584877014, "learning_rate": 0.002828, "loss": 2.6161, "step": 11264 }, { "epoch": 0.8501313320825516, "grad_norm": 0.4043927788734436, "learning_rate": 0.002828, "loss": 2.6248, "step": 11328 }, { "epoch": 0.8549343339587242, "grad_norm": 0.3805730938911438, "learning_rate": 0.002828, "loss": 2.619, "step": 11392 }, { "epoch": 0.8597373358348969, "grad_norm": 0.40822461247444153, "learning_rate": 0.002828, "loss": 2.619, "step": 11456 }, { "epoch": 0.8645403377110694, "grad_norm": 0.3430253565311432, "learning_rate": 0.002828, "loss": 2.6162, "step": 11520 }, { "epoch": 0.8693433395872421, "grad_norm": 0.3665921688079834, "learning_rate": 0.002828, "loss": 2.6083, "step": 11584 }, { "epoch": 0.8741463414634146, "grad_norm": 0.3768637776374817, "learning_rate": 0.002828, "loss": 2.6085, "step": 11648 }, { "epoch": 0.8789493433395873, "grad_norm": 0.6709098219871521, "learning_rate": 0.002828, "loss": 2.6067, "step": 11712 }, { "epoch": 0.8837523452157598, "grad_norm": 0.37109729647636414, "learning_rate": 0.002828, "loss": 2.5975, "step": 11776 }, { "epoch": 0.8885553470919325, "grad_norm": 0.35545358061790466, "learning_rate": 0.002828, "loss": 2.6086, "step": 11840 }, { "epoch": 0.8933583489681051, "grad_norm": 0.34493309259414673, "learning_rate": 0.002828, "loss": 2.6009, "step": 11904 }, { "epoch": 0.8981613508442777, "grad_norm": 0.35226738452911377, "learning_rate": 0.002828, "loss": 2.5909, "step": 11968 }, { "epoch": 0.9029643527204503, "grad_norm": 0.3626823425292969, "learning_rate": 0.002828, "loss": 2.5954, "step": 12032 }, { "epoch": 0.9077673545966229, "grad_norm": 0.4639281630516052, "learning_rate": 0.002828, "loss": 2.5976, "step": 12096 }, { "epoch": 0.9125703564727955, "grad_norm": 0.425073504447937, "learning_rate": 0.002828, "loss": 2.5846, "step": 12160 }, { "epoch": 0.9173733583489682, "grad_norm": 0.4849206507205963, "learning_rate": 0.002828, "loss": 2.5851, "step": 12224 }, { "epoch": 0.9221763602251407, "grad_norm": 0.3517647385597229, "learning_rate": 0.002828, "loss": 2.5832, "step": 12288 }, { "epoch": 0.9269793621013134, "grad_norm": 0.4217440187931061, "learning_rate": 0.002828, "loss": 2.5777, "step": 12352 }, { "epoch": 0.9317823639774859, "grad_norm": 0.3862438499927521, "learning_rate": 0.002828, "loss": 2.5769, "step": 12416 }, { "epoch": 0.9365853658536586, "grad_norm": 0.4026007056236267, "learning_rate": 0.002828, "loss": 2.5802, "step": 12480 }, { "epoch": 0.9413883677298311, "grad_norm": 0.3353049159049988, "learning_rate": 0.002828, "loss": 2.5741, "step": 12544 }, { "epoch": 0.9461913696060038, "grad_norm": 0.35357797145843506, "learning_rate": 0.002828, "loss": 2.5723, "step": 12608 }, { "epoch": 0.9509943714821764, "grad_norm": 0.35685861110687256, "learning_rate": 0.002828, "loss": 2.5801, "step": 12672 }, { "epoch": 0.955797373358349, "grad_norm": 0.36265361309051514, "learning_rate": 0.002828, "loss": 2.5784, "step": 12736 }, { "epoch": 0.9606003752345216, "grad_norm": 0.4119773805141449, "learning_rate": 0.002828, "loss": 2.5646, "step": 12800 }, { "epoch": 0.9654033771106942, "grad_norm": 0.3662680387496948, "learning_rate": 0.002828, "loss": 2.5693, "step": 12864 }, { "epoch": 0.9702063789868668, "grad_norm": 0.3822716772556305, "learning_rate": 0.002828, "loss": 2.5643, "step": 12928 }, { "epoch": 0.9750093808630395, "grad_norm": 0.3412950038909912, "learning_rate": 0.002828, "loss": 2.5646, "step": 12992 }, { "epoch": 0.979812382739212, "grad_norm": 0.373353511095047, "learning_rate": 0.002828, "loss": 2.5614, "step": 13056 }, { "epoch": 0.9846153846153847, "grad_norm": 10.112526893615723, "learning_rate": 0.002828, "loss": 2.5578, "step": 13120 }, { "epoch": 0.9894183864915572, "grad_norm": 0.36393383145332336, "learning_rate": 0.002828, "loss": 2.5696, "step": 13184 }, { "epoch": 0.9942213883677299, "grad_norm": 0.4176023006439209, "learning_rate": 0.002828, "loss": 2.5533, "step": 13248 }, { "epoch": 0.9990243902439024, "grad_norm": 0.4248984456062317, "learning_rate": 0.002828, "loss": 2.5569, "step": 13312 }, { "epoch": 1.003827392120075, "grad_norm": 0.3931824564933777, "learning_rate": 0.002828, "loss": 2.5246, "step": 13376 }, { "epoch": 1.0086303939962478, "grad_norm": 0.3742982745170593, "learning_rate": 0.002828, "loss": 2.5104, "step": 13440 }, { "epoch": 1.0134333958724202, "grad_norm": 0.4388613998889923, "learning_rate": 0.002828, "loss": 2.519, "step": 13504 }, { "epoch": 1.0182363977485929, "grad_norm": 0.41458427906036377, "learning_rate": 0.002828, "loss": 2.5162, "step": 13568 }, { "epoch": 1.0230393996247655, "grad_norm": 0.3841855227947235, "learning_rate": 0.002828, "loss": 2.5129, "step": 13632 }, { "epoch": 1.027842401500938, "grad_norm": 0.43930500745773315, "learning_rate": 0.002828, "loss": 2.5179, "step": 13696 }, { "epoch": 1.0326454033771106, "grad_norm": 0.3687760531902313, "learning_rate": 0.002828, "loss": 2.5006, "step": 13760 }, { "epoch": 1.0374484052532833, "grad_norm": 0.3823833465576172, "learning_rate": 0.002828, "loss": 2.5039, "step": 13824 }, { "epoch": 1.042251407129456, "grad_norm": 0.40025222301483154, "learning_rate": 0.002828, "loss": 2.5155, "step": 13888 }, { "epoch": 1.0470544090056286, "grad_norm": 0.40790122747421265, "learning_rate": 0.002828, "loss": 2.5064, "step": 13952 }, { "epoch": 1.051857410881801, "grad_norm": 0.42718634009361267, "learning_rate": 0.002828, "loss": 2.5095, "step": 14016 }, { "epoch": 1.0566604127579737, "grad_norm": 0.3305782079696655, "learning_rate": 0.002828, "loss": 2.5, "step": 14080 }, { "epoch": 1.0614634146341464, "grad_norm": 0.37126559019088745, "learning_rate": 0.002828, "loss": 2.5099, "step": 14144 }, { "epoch": 1.0662664165103188, "grad_norm": 0.414987176656723, "learning_rate": 0.002828, "loss": 2.501, "step": 14208 }, { "epoch": 1.0710694183864915, "grad_norm": 0.45917075872421265, "learning_rate": 0.002828, "loss": 2.5062, "step": 14272 }, { "epoch": 1.0758724202626642, "grad_norm": 0.4362465739250183, "learning_rate": 0.002828, "loss": 2.4949, "step": 14336 }, { "epoch": 1.0806754221388368, "grad_norm": 0.40015289187431335, "learning_rate": 0.002828, "loss": 2.4957, "step": 14400 }, { "epoch": 1.0854784240150095, "grad_norm": 0.3781159818172455, "learning_rate": 0.002828, "loss": 2.4979, "step": 14464 }, { "epoch": 1.090281425891182, "grad_norm": 0.4165579676628113, "learning_rate": 0.002828, "loss": 2.4913, "step": 14528 }, { "epoch": 1.0950844277673546, "grad_norm": 0.4100767970085144, "learning_rate": 0.002828, "loss": 2.4978, "step": 14592 }, { "epoch": 1.0998874296435273, "grad_norm": 0.4211256504058838, "learning_rate": 0.002828, "loss": 2.4871, "step": 14656 }, { "epoch": 1.1046904315196997, "grad_norm": 0.390396386384964, "learning_rate": 0.002828, "loss": 2.4933, "step": 14720 }, { "epoch": 1.1094934333958724, "grad_norm": 0.3585355281829834, "learning_rate": 0.002828, "loss": 2.4811, "step": 14784 }, { "epoch": 1.114296435272045, "grad_norm": 0.5148431062698364, "learning_rate": 0.002828, "loss": 2.4888, "step": 14848 }, { "epoch": 1.1190994371482177, "grad_norm": 0.44254639744758606, "learning_rate": 0.002828, "loss": 2.4821, "step": 14912 }, { "epoch": 1.1239024390243904, "grad_norm": 0.3710468113422394, "learning_rate": 0.002828, "loss": 2.4819, "step": 14976 }, { "epoch": 1.1287054409005628, "grad_norm": 0.41197285056114197, "learning_rate": 0.002828, "loss": 2.4842, "step": 15040 }, { "epoch": 1.1335084427767355, "grad_norm": 0.37512508034706116, "learning_rate": 0.002828, "loss": 2.4776, "step": 15104 }, { "epoch": 1.1383114446529081, "grad_norm": 0.4286038279533386, "learning_rate": 0.002828, "loss": 2.4748, "step": 15168 }, { "epoch": 1.1431144465290806, "grad_norm": 0.37446776032447815, "learning_rate": 0.002828, "loss": 2.4727, "step": 15232 }, { "epoch": 1.1479174484052532, "grad_norm": 0.4537597894668579, "learning_rate": 0.002828, "loss": 2.4663, "step": 15296 }, { "epoch": 1.152720450281426, "grad_norm": 0.36247050762176514, "learning_rate": 0.002828, "loss": 2.4699, "step": 15360 }, { "epoch": 1.1575234521575986, "grad_norm": 0.3772297501564026, "learning_rate": 0.002828, "loss": 2.4734, "step": 15424 }, { "epoch": 1.1623264540337712, "grad_norm": 0.3789200186729431, "learning_rate": 0.002828, "loss": 2.4696, "step": 15488 }, { "epoch": 1.1671294559099437, "grad_norm": 0.36870113015174866, "learning_rate": 0.002828, "loss": 2.4671, "step": 15552 }, { "epoch": 1.1719324577861163, "grad_norm": 0.37448298931121826, "learning_rate": 0.002828, "loss": 2.462, "step": 15616 }, { "epoch": 1.176735459662289, "grad_norm": 0.4384878873825073, "learning_rate": 0.002828, "loss": 2.4648, "step": 15680 }, { "epoch": 1.1815384615384614, "grad_norm": 0.37811148166656494, "learning_rate": 0.002828, "loss": 2.4598, "step": 15744 }, { "epoch": 1.186341463414634, "grad_norm": 0.4190385341644287, "learning_rate": 0.002828, "loss": 2.4643, "step": 15808 }, { "epoch": 1.1911444652908068, "grad_norm": 0.48885485529899597, "learning_rate": 0.002828, "loss": 2.4564, "step": 15872 }, { "epoch": 1.1959474671669794, "grad_norm": 0.42267611622810364, "learning_rate": 0.002828, "loss": 2.4671, "step": 15936 }, { "epoch": 1.200750469043152, "grad_norm": 0.3886626064777374, "learning_rate": 0.002828, "loss": 2.4715, "step": 16000 }, { "epoch": 1.2055534709193245, "grad_norm": 0.40871456265449524, "learning_rate": 0.002828, "loss": 2.4558, "step": 16064 }, { "epoch": 1.2103564727954972, "grad_norm": 0.46952739357948303, "learning_rate": 0.002828, "loss": 2.4497, "step": 16128 }, { "epoch": 1.2151594746716698, "grad_norm": 0.41340023279190063, "learning_rate": 0.002828, "loss": 2.4402, "step": 16192 }, { "epoch": 1.2199624765478423, "grad_norm": 0.36176440119743347, "learning_rate": 0.002828, "loss": 2.4473, "step": 16256 }, { "epoch": 1.224765478424015, "grad_norm": 0.4117899239063263, "learning_rate": 0.002828, "loss": 2.443, "step": 16320 }, { "epoch": 1.2295684803001876, "grad_norm": 0.5039286613464355, "learning_rate": 0.002828, "loss": 2.4557, "step": 16384 }, { "epoch": 1.2343714821763603, "grad_norm": 0.3716677129268646, "learning_rate": 0.002828, "loss": 2.4522, "step": 16448 }, { "epoch": 1.239174484052533, "grad_norm": 0.42316168546676636, "learning_rate": 0.002828, "loss": 2.4424, "step": 16512 }, { "epoch": 1.2439774859287054, "grad_norm": 0.5081620216369629, "learning_rate": 0.002828, "loss": 2.4325, "step": 16576 }, { "epoch": 1.248780487804878, "grad_norm": 0.39409589767456055, "learning_rate": 0.002828, "loss": 2.435, "step": 16640 }, { "epoch": 1.2535834896810507, "grad_norm": 0.38638824224472046, "learning_rate": 0.002828, "loss": 2.4363, "step": 16704 }, { "epoch": 1.2583864915572232, "grad_norm": 0.41918718814849854, "learning_rate": 0.002828, "loss": 2.4404, "step": 16768 }, { "epoch": 1.2631894934333958, "grad_norm": 0.3932395279407501, "learning_rate": 0.002828, "loss": 2.4403, "step": 16832 }, { "epoch": 1.2679924953095685, "grad_norm": 0.3787371814250946, "learning_rate": 0.002828, "loss": 2.4386, "step": 16896 }, { "epoch": 1.2727954971857411, "grad_norm": 0.40612953901290894, "learning_rate": 0.002828, "loss": 2.4219, "step": 16960 }, { "epoch": 1.2775984990619138, "grad_norm": 0.4243071675300598, "learning_rate": 0.002828, "loss": 2.4261, "step": 17024 }, { "epoch": 1.2824015009380862, "grad_norm": 0.4240303039550781, "learning_rate": 0.002828, "loss": 2.444, "step": 17088 }, { "epoch": 1.287204502814259, "grad_norm": 0.4888259470462799, "learning_rate": 0.002828, "loss": 2.4344, "step": 17152 }, { "epoch": 1.2920075046904316, "grad_norm": 0.4678399860858917, "learning_rate": 0.002828, "loss": 2.4306, "step": 17216 }, { "epoch": 1.296810506566604, "grad_norm": 0.38733649253845215, "learning_rate": 0.002828, "loss": 2.431, "step": 17280 }, { "epoch": 1.3016135084427767, "grad_norm": 0.38587358593940735, "learning_rate": 0.002828, "loss": 2.4205, "step": 17344 }, { "epoch": 1.3064165103189493, "grad_norm": 0.39998751878738403, "learning_rate": 0.002828, "loss": 2.4336, "step": 17408 }, { "epoch": 1.311219512195122, "grad_norm": 0.36294978857040405, "learning_rate": 0.002828, "loss": 2.4238, "step": 17472 }, { "epoch": 1.3160225140712947, "grad_norm": 0.3924562633037567, "learning_rate": 0.002828, "loss": 2.4197, "step": 17536 }, { "epoch": 1.320825515947467, "grad_norm": 0.3837553560733795, "learning_rate": 0.002828, "loss": 2.4243, "step": 17600 }, { "epoch": 1.3256285178236398, "grad_norm": 0.38875913619995117, "learning_rate": 0.002828, "loss": 2.4172, "step": 17664 }, { "epoch": 1.3304315196998124, "grad_norm": 0.41738125681877136, "learning_rate": 0.002828, "loss": 2.4225, "step": 17728 }, { "epoch": 1.3352345215759849, "grad_norm": 0.3645491898059845, "learning_rate": 0.002828, "loss": 2.4151, "step": 17792 }, { "epoch": 1.3400375234521575, "grad_norm": 0.43829870223999023, "learning_rate": 0.002828, "loss": 2.4099, "step": 17856 }, { "epoch": 1.3448405253283302, "grad_norm": 0.3851640820503235, "learning_rate": 0.002828, "loss": 2.4168, "step": 17920 }, { "epoch": 1.3496435272045029, "grad_norm": 0.36147060990333557, "learning_rate": 0.002828, "loss": 2.4085, "step": 17984 }, { "epoch": 1.3544465290806755, "grad_norm": 0.42050638794898987, "learning_rate": 0.002828, "loss": 2.4121, "step": 18048 }, { "epoch": 1.359249530956848, "grad_norm": 0.3830699920654297, "learning_rate": 0.002828, "loss": 2.4095, "step": 18112 }, { "epoch": 1.3640525328330206, "grad_norm": 0.3830968737602234, "learning_rate": 0.002828, "loss": 2.4077, "step": 18176 }, { "epoch": 1.3688555347091933, "grad_norm": 0.3880060017108917, "learning_rate": 0.002828, "loss": 2.4124, "step": 18240 }, { "epoch": 1.3736585365853657, "grad_norm": 0.45445796847343445, "learning_rate": 0.002828, "loss": 2.4014, "step": 18304 }, { "epoch": 1.3784615384615384, "grad_norm": 0.3750540316104889, "learning_rate": 0.002828, "loss": 2.4003, "step": 18368 }, { "epoch": 1.383264540337711, "grad_norm": 0.3783455193042755, "learning_rate": 0.002828, "loss": 2.3983, "step": 18432 }, { "epoch": 1.3880675422138837, "grad_norm": 0.40336528420448303, "learning_rate": 0.002828, "loss": 2.4105, "step": 18496 }, { "epoch": 1.3928705440900564, "grad_norm": 0.43220385909080505, "learning_rate": 0.002828, "loss": 2.4018, "step": 18560 }, { "epoch": 1.3976735459662288, "grad_norm": 0.4069630205631256, "learning_rate": 0.002828, "loss": 2.4049, "step": 18624 }, { "epoch": 1.4024765478424015, "grad_norm": 0.3866819441318512, "learning_rate": 0.002828, "loss": 2.3917, "step": 18688 }, { "epoch": 1.4072795497185742, "grad_norm": 0.3699668347835541, "learning_rate": 0.002828, "loss": 2.3908, "step": 18752 }, { "epoch": 1.4120825515947466, "grad_norm": 0.377645879983902, "learning_rate": 0.002828, "loss": 2.3957, "step": 18816 }, { "epoch": 1.4168855534709193, "grad_norm": 0.36612892150878906, "learning_rate": 0.002828, "loss": 2.3973, "step": 18880 }, { "epoch": 1.421688555347092, "grad_norm": 0.385735422372818, "learning_rate": 0.002828, "loss": 2.3952, "step": 18944 }, { "epoch": 1.4264915572232646, "grad_norm": 0.4026818871498108, "learning_rate": 0.002828, "loss": 2.3908, "step": 19008 }, { "epoch": 1.4312945590994373, "grad_norm": 0.39212891459465027, "learning_rate": 0.002828, "loss": 2.3923, "step": 19072 }, { "epoch": 1.4360975609756097, "grad_norm": 0.43533411622047424, "learning_rate": 0.002828, "loss": 2.3918, "step": 19136 }, { "epoch": 1.4409005628517824, "grad_norm": 0.4136466383934021, "learning_rate": 0.002828, "loss": 2.3885, "step": 19200 }, { "epoch": 1.445703564727955, "grad_norm": 0.38349345326423645, "learning_rate": 0.002828, "loss": 2.3891, "step": 19264 }, { "epoch": 1.4505065666041275, "grad_norm": 0.42666760087013245, "learning_rate": 0.002828, "loss": 2.3725, "step": 19328 }, { "epoch": 1.4553095684803001, "grad_norm": 0.3926577866077423, "learning_rate": 0.002828, "loss": 2.3885, "step": 19392 }, { "epoch": 1.4601125703564728, "grad_norm": 0.3736414611339569, "learning_rate": 0.002828, "loss": 2.3855, "step": 19456 }, { "epoch": 1.4649155722326455, "grad_norm": 0.36343908309936523, "learning_rate": 0.002828, "loss": 2.3773, "step": 19520 }, { "epoch": 1.4697185741088181, "grad_norm": 0.380211740732193, "learning_rate": 0.002828, "loss": 2.3809, "step": 19584 }, { "epoch": 1.4745215759849906, "grad_norm": 0.40481454133987427, "learning_rate": 0.002828, "loss": 2.375, "step": 19648 }, { "epoch": 1.4793245778611632, "grad_norm": 0.45368635654449463, "learning_rate": 0.002828, "loss": 2.3707, "step": 19712 }, { "epoch": 1.484127579737336, "grad_norm": 0.4029395580291748, "learning_rate": 0.002828, "loss": 2.3733, "step": 19776 }, { "epoch": 1.4889305816135083, "grad_norm": 0.3748946785926819, "learning_rate": 0.002828, "loss": 2.3739, "step": 19840 }, { "epoch": 1.493733583489681, "grad_norm": 0.36640551686286926, "learning_rate": 0.002828, "loss": 2.3652, "step": 19904 }, { "epoch": 1.4985365853658537, "grad_norm": 0.4150533676147461, "learning_rate": 0.002828, "loss": 2.3709, "step": 19968 }, { "epoch": 1.5033395872420263, "grad_norm": 0.49730879068374634, "learning_rate": 0.002828, "loss": 2.3668, "step": 20032 }, { "epoch": 1.508142589118199, "grad_norm": 0.37675461173057556, "learning_rate": 0.002828, "loss": 2.3695, "step": 20096 }, { "epoch": 1.5129455909943714, "grad_norm": 0.3647516965866089, "learning_rate": 0.002828, "loss": 2.3733, "step": 20160 }, { "epoch": 1.517748592870544, "grad_norm": 1.2981253862380981, "learning_rate": 0.002828, "loss": 2.369, "step": 20224 }, { "epoch": 1.5225515947467168, "grad_norm": 0.5044511556625366, "learning_rate": 0.002828, "loss": 2.3578, "step": 20288 }, { "epoch": 1.5273545966228892, "grad_norm": 0.3651883006095886, "learning_rate": 0.002828, "loss": 2.3601, "step": 20352 }, { "epoch": 1.532157598499062, "grad_norm": 0.4419403076171875, "learning_rate": 0.002828, "loss": 2.3607, "step": 20416 }, { "epoch": 1.5369606003752345, "grad_norm": 0.38631224632263184, "learning_rate": 0.002828, "loss": 2.3619, "step": 20480 }, { "epoch": 1.5417636022514072, "grad_norm": 0.34725359082221985, "learning_rate": 0.002828, "loss": 2.3573, "step": 20544 }, { "epoch": 1.5465666041275798, "grad_norm": 0.3991786241531372, "learning_rate": 0.002828, "loss": 2.357, "step": 20608 }, { "epoch": 1.5513696060037523, "grad_norm": 0.3595084846019745, "learning_rate": 0.002828, "loss": 2.357, "step": 20672 }, { "epoch": 1.556172607879925, "grad_norm": 0.4021853804588318, "learning_rate": 0.002828, "loss": 2.3537, "step": 20736 }, { "epoch": 1.5609756097560976, "grad_norm": 0.3939075767993927, "learning_rate": 0.002828, "loss": 2.3594, "step": 20800 }, { "epoch": 1.56577861163227, "grad_norm": 0.3889540135860443, "learning_rate": 0.002828, "loss": 2.3573, "step": 20864 }, { "epoch": 1.570581613508443, "grad_norm": 0.41366517543792725, "learning_rate": 0.002828, "loss": 2.3442, "step": 20928 }, { "epoch": 1.5753846153846154, "grad_norm": 0.37127187848091125, "learning_rate": 0.002828, "loss": 2.3457, "step": 20992 }, { "epoch": 1.580187617260788, "grad_norm": 0.4014946520328522, "learning_rate": 0.002828, "loss": 2.3457, "step": 21056 }, { "epoch": 1.5849906191369607, "grad_norm": 0.35794708132743835, "learning_rate": 0.002828, "loss": 2.3508, "step": 21120 }, { "epoch": 1.5897936210131332, "grad_norm": 0.3924767076969147, "learning_rate": 0.002828, "loss": 2.3451, "step": 21184 }, { "epoch": 1.5945966228893058, "grad_norm": 0.34789031744003296, "learning_rate": 0.002828, "loss": 2.3444, "step": 21248 }, { "epoch": 1.5993996247654785, "grad_norm": 0.37461933493614197, "learning_rate": 0.002828, "loss": 2.3385, "step": 21312 }, { "epoch": 1.604202626641651, "grad_norm": 0.40146076679229736, "learning_rate": 0.002828, "loss": 2.3406, "step": 21376 }, { "epoch": 1.6090056285178238, "grad_norm": 0.4080921411514282, "learning_rate": 0.002828, "loss": 2.3423, "step": 21440 }, { "epoch": 1.6138086303939962, "grad_norm": 0.40802744030952454, "learning_rate": 0.002828, "loss": 2.3377, "step": 21504 }, { "epoch": 1.618611632270169, "grad_norm": 0.420188307762146, "learning_rate": 0.002828, "loss": 2.3399, "step": 21568 }, { "epoch": 1.6234146341463416, "grad_norm": 0.40739214420318604, "learning_rate": 0.002828, "loss": 2.34, "step": 21632 }, { "epoch": 1.628217636022514, "grad_norm": 0.41674676537513733, "learning_rate": 0.002828, "loss": 2.3326, "step": 21696 }, { "epoch": 1.6330206378986867, "grad_norm": 0.41856762766838074, "learning_rate": 0.002828, "loss": 2.3366, "step": 21760 }, { "epoch": 1.6378236397748593, "grad_norm": 0.39763346314430237, "learning_rate": 0.002828, "loss": 2.3339, "step": 21824 }, { "epoch": 1.6426266416510318, "grad_norm": 0.3777034282684326, "learning_rate": 0.002828, "loss": 2.343, "step": 21888 }, { "epoch": 1.6474296435272047, "grad_norm": 0.3617188036441803, "learning_rate": 0.002828, "loss": 2.3341, "step": 21952 }, { "epoch": 1.652232645403377, "grad_norm": 0.4504718482494354, "learning_rate": 0.002828, "loss": 2.3295, "step": 22016 }, { "epoch": 1.6570356472795498, "grad_norm": 0.37388357520103455, "learning_rate": 0.002828, "loss": 2.3408, "step": 22080 }, { "epoch": 1.6618386491557224, "grad_norm": 0.3807313144207001, "learning_rate": 0.002828, "loss": 2.3249, "step": 22144 }, { "epoch": 1.6666416510318949, "grad_norm": 0.4428509771823883, "learning_rate": 0.002828, "loss": 2.3347, "step": 22208 }, { "epoch": 1.6714446529080675, "grad_norm": 0.39028382301330566, "learning_rate": 0.002828, "loss": 2.336, "step": 22272 }, { "epoch": 1.6762476547842402, "grad_norm": 0.482424259185791, "learning_rate": 0.002828, "loss": 2.3212, "step": 22336 }, { "epoch": 1.6810506566604126, "grad_norm": 0.39801299571990967, "learning_rate": 0.002828, "loss": 2.314, "step": 22400 }, { "epoch": 1.6858536585365855, "grad_norm": 0.4351527988910675, "learning_rate": 0.002828, "loss": 2.3223, "step": 22464 }, { "epoch": 1.690656660412758, "grad_norm": 0.4509490430355072, "learning_rate": 0.002828, "loss": 2.3246, "step": 22528 }, { "epoch": 1.6954596622889306, "grad_norm": 0.35885152220726013, "learning_rate": 0.002828, "loss": 2.319, "step": 22592 }, { "epoch": 1.7002626641651033, "grad_norm": 0.4146900177001953, "learning_rate": 0.002828, "loss": 2.3214, "step": 22656 }, { "epoch": 1.7050656660412757, "grad_norm": 0.40194573998451233, "learning_rate": 0.002828, "loss": 2.322, "step": 22720 }, { "epoch": 1.7098686679174484, "grad_norm": 0.43570390343666077, "learning_rate": 0.002828, "loss": 2.3241, "step": 22784 }, { "epoch": 1.714671669793621, "grad_norm": 0.35558512806892395, "learning_rate": 0.002828, "loss": 2.3193, "step": 22848 }, { "epoch": 1.7194746716697935, "grad_norm": 0.3700902760028839, "learning_rate": 0.002828, "loss": 2.3219, "step": 22912 }, { "epoch": 1.7242776735459664, "grad_norm": 0.4287453591823578, "learning_rate": 0.002828, "loss": 2.3078, "step": 22976 }, { "epoch": 1.7290806754221388, "grad_norm": 0.41843536496162415, "learning_rate": 0.002828, "loss": 2.3103, "step": 23040 }, { "epoch": 1.7338836772983115, "grad_norm": 0.3938317596912384, "learning_rate": 0.002828, "loss": 2.3176, "step": 23104 }, { "epoch": 1.7386866791744842, "grad_norm": 0.44625958800315857, "learning_rate": 0.002828, "loss": 2.307, "step": 23168 }, { "epoch": 1.7434896810506566, "grad_norm": 0.4598078727722168, "learning_rate": 0.002828, "loss": 2.2958, "step": 23232 }, { "epoch": 1.7482926829268293, "grad_norm": 0.4126788377761841, "learning_rate": 0.002828, "loss": 2.3094, "step": 23296 }, { "epoch": 1.753095684803002, "grad_norm": 0.3801914155483246, "learning_rate": 0.002828, "loss": 2.3048, "step": 23360 }, { "epoch": 1.7578986866791744, "grad_norm": 0.4619985818862915, "learning_rate": 0.002828, "loss": 2.3069, "step": 23424 }, { "epoch": 1.7627016885553473, "grad_norm": 0.4068593680858612, "learning_rate": 0.002828, "loss": 2.299, "step": 23488 }, { "epoch": 1.7675046904315197, "grad_norm": 0.36146870255470276, "learning_rate": 0.002828, "loss": 2.3042, "step": 23552 }, { "epoch": 1.7723076923076924, "grad_norm": 0.3995908200740814, "learning_rate": 0.002828, "loss": 2.3006, "step": 23616 }, { "epoch": 1.777110694183865, "grad_norm": 0.3970596492290497, "learning_rate": 0.002828, "loss": 2.313, "step": 23680 }, { "epoch": 1.7819136960600375, "grad_norm": 0.4287073612213135, "learning_rate": 0.002828, "loss": 2.2974, "step": 23744 }, { "epoch": 1.7867166979362101, "grad_norm": 0.41250482201576233, "learning_rate": 0.002828, "loss": 2.2937, "step": 23808 }, { "epoch": 1.7915196998123828, "grad_norm": 0.411668062210083, "learning_rate": 0.002828, "loss": 2.2994, "step": 23872 }, { "epoch": 1.7963227016885552, "grad_norm": 0.4834740459918976, "learning_rate": 0.002828, "loss": 2.2895, "step": 23936 }, { "epoch": 1.8011257035647281, "grad_norm": 0.3624022603034973, "learning_rate": 0.002828, "loss": 2.2888, "step": 24000 }, { "epoch": 1.8059287054409006, "grad_norm": 0.36700454354286194, "learning_rate": 0.002828, "loss": 2.2917, "step": 24064 }, { "epoch": 1.8107317073170732, "grad_norm": 0.3666454255580902, "learning_rate": 0.002828, "loss": 2.2896, "step": 24128 }, { "epoch": 1.8155347091932459, "grad_norm": 0.4110506474971771, "learning_rate": 0.002828, "loss": 2.2947, "step": 24192 }, { "epoch": 1.8203377110694183, "grad_norm": 0.3604464530944824, "learning_rate": 0.002828, "loss": 2.289, "step": 24256 }, { "epoch": 1.825140712945591, "grad_norm": 0.40807706117630005, "learning_rate": 0.002828, "loss": 2.288, "step": 24320 }, { "epoch": 1.8299437148217637, "grad_norm": 0.3632533848285675, "learning_rate": 0.002828, "loss": 2.29, "step": 24384 }, { "epoch": 1.834746716697936, "grad_norm": 0.38520562648773193, "learning_rate": 0.002828, "loss": 2.2805, "step": 24448 }, { "epoch": 1.839549718574109, "grad_norm": 0.4228810966014862, "learning_rate": 0.002828, "loss": 2.2842, "step": 24512 }, { "epoch": 1.8443527204502814, "grad_norm": 0.4542325735092163, "learning_rate": 0.002828, "loss": 2.2781, "step": 24576 }, { "epoch": 1.849155722326454, "grad_norm": 0.37316882610321045, "learning_rate": 0.002828, "loss": 2.2829, "step": 24640 }, { "epoch": 1.8539587242026268, "grad_norm": 0.5505624413490295, "learning_rate": 0.002828, "loss": 2.2942, "step": 24704 }, { "epoch": 1.8587617260787992, "grad_norm": 0.4269484281539917, "learning_rate": 0.002828, "loss": 2.2793, "step": 24768 }, { "epoch": 1.8635647279549719, "grad_norm": 0.407760888338089, "learning_rate": 0.002828, "loss": 2.2803, "step": 24832 }, { "epoch": 1.8683677298311445, "grad_norm": 0.4192192554473877, "learning_rate": 0.002828, "loss": 2.2818, "step": 24896 }, { "epoch": 1.873170731707317, "grad_norm": 0.3924838602542877, "learning_rate": 0.002828, "loss": 2.2757, "step": 24960 }, { "epoch": 1.8779737335834898, "grad_norm": 0.3799656629562378, "learning_rate": 0.002828, "loss": 2.2695, "step": 25024 }, { "epoch": 1.8827767354596623, "grad_norm": 0.40570494532585144, "learning_rate": 0.002828, "loss": 2.2801, "step": 25088 }, { "epoch": 1.887579737335835, "grad_norm": 0.3898228704929352, "learning_rate": 0.002828, "loss": 2.2709, "step": 25152 }, { "epoch": 1.8923827392120076, "grad_norm": 0.393216073513031, "learning_rate": 0.002828, "loss": 2.282, "step": 25216 }, { "epoch": 1.89718574108818, "grad_norm": 0.4247749149799347, "learning_rate": 0.002828, "loss": 2.2776, "step": 25280 }, { "epoch": 1.9019887429643527, "grad_norm": 0.4670035243034363, "learning_rate": 0.002828, "loss": 2.2754, "step": 25344 }, { "epoch": 1.9067917448405254, "grad_norm": 0.40336644649505615, "learning_rate": 0.002828, "loss": 2.2768, "step": 25408 }, { "epoch": 1.9115947467166978, "grad_norm": 0.48462921380996704, "learning_rate": 0.002828, "loss": 2.2634, "step": 25472 }, { "epoch": 1.9163977485928707, "grad_norm": 0.44047805666923523, "learning_rate": 0.002828, "loss": 2.2674, "step": 25536 }, { "epoch": 1.9212007504690432, "grad_norm": 0.4221409261226654, "learning_rate": 0.002828, "loss": 2.2724, "step": 25600 }, { "epoch": 1.9260037523452158, "grad_norm": 0.4272362291812897, "learning_rate": 0.002828, "loss": 2.2683, "step": 25664 }, { "epoch": 1.9308067542213885, "grad_norm": 0.4309645891189575, "learning_rate": 0.002828, "loss": 2.2612, "step": 25728 }, { "epoch": 1.935609756097561, "grad_norm": 0.4220867156982422, "learning_rate": 0.002828, "loss": 2.2665, "step": 25792 }, { "epoch": 1.9404127579737336, "grad_norm": 0.3765920102596283, "learning_rate": 0.002828, "loss": 2.2652, "step": 25856 }, { "epoch": 1.9452157598499062, "grad_norm": 0.44643986225128174, "learning_rate": 0.002828, "loss": 2.2627, "step": 25920 }, { "epoch": 1.9500187617260787, "grad_norm": 0.4022061824798584, "learning_rate": 0.002828, "loss": 2.2665, "step": 25984 }, { "epoch": 1.9548217636022516, "grad_norm": 0.3935778737068176, "learning_rate": 0.002828, "loss": 2.2585, "step": 26048 }, { "epoch": 1.959624765478424, "grad_norm": 0.3877500295639038, "learning_rate": 0.002828, "loss": 2.2629, "step": 26112 }, { "epoch": 1.9644277673545967, "grad_norm": 0.3891729712486267, "learning_rate": 0.002828, "loss": 2.2594, "step": 26176 }, { "epoch": 1.9692307692307693, "grad_norm": 0.3616099953651428, "learning_rate": 0.002828, "loss": 2.2601, "step": 26240 }, { "epoch": 1.9740337711069418, "grad_norm": 0.3855280578136444, "learning_rate": 0.002828, "loss": 2.2677, "step": 26304 }, { "epoch": 1.9788367729831144, "grad_norm": 0.44039493799209595, "learning_rate": 0.002828, "loss": 2.2641, "step": 26368 }, { "epoch": 1.983639774859287, "grad_norm": 0.37217262387275696, "learning_rate": 0.002828, "loss": 2.2603, "step": 26432 }, { "epoch": 1.9884427767354595, "grad_norm": 0.3942553997039795, "learning_rate": 0.002828, "loss": 2.2508, "step": 26496 }, { "epoch": 1.9932457786116324, "grad_norm": 0.3975297808647156, "learning_rate": 0.002828, "loss": 2.2466, "step": 26560 }, { "epoch": 1.9980487804878049, "grad_norm": 0.39197394251823425, "learning_rate": 0.002828, "loss": 2.2515, "step": 26624 }, { "epoch": 2.0028517823639773, "grad_norm": 0.38722801208496094, "learning_rate": 0.002828, "loss": 2.2354, "step": 26688 }, { "epoch": 2.00765478424015, "grad_norm": 0.38619640469551086, "learning_rate": 0.002828, "loss": 2.2152, "step": 26752 }, { "epoch": 2.0124577861163226, "grad_norm": 0.49529945850372314, "learning_rate": 0.002828, "loss": 2.2167, "step": 26816 }, { "epoch": 2.0172607879924955, "grad_norm": 0.4199656844139099, "learning_rate": 0.002828, "loss": 2.2092, "step": 26880 }, { "epoch": 2.022063789868668, "grad_norm": 0.45820868015289307, "learning_rate": 0.002828, "loss": 2.2249, "step": 26944 }, { "epoch": 2.0268667917448404, "grad_norm": 0.4006725251674652, "learning_rate": 0.002828, "loss": 2.2165, "step": 27008 }, { "epoch": 2.0316697936210133, "grad_norm": 0.4596467614173889, "learning_rate": 0.002828, "loss": 2.2154, "step": 27072 }, { "epoch": 2.0364727954971857, "grad_norm": 0.38660213351249695, "learning_rate": 0.002828, "loss": 2.2062, "step": 27136 }, { "epoch": 2.041275797373358, "grad_norm": 0.44082361459732056, "learning_rate": 0.002828, "loss": 2.2124, "step": 27200 }, { "epoch": 2.046078799249531, "grad_norm": 0.3886605203151703, "learning_rate": 0.002828, "loss": 2.2182, "step": 27264 }, { "epoch": 2.0508818011257035, "grad_norm": 0.41386017203330994, "learning_rate": 0.002828, "loss": 2.2168, "step": 27328 }, { "epoch": 2.055684803001876, "grad_norm": 0.411478191614151, "learning_rate": 0.002828, "loss": 2.2092, "step": 27392 }, { "epoch": 2.060487804878049, "grad_norm": 0.47288912534713745, "learning_rate": 0.002828, "loss": 2.21, "step": 27456 }, { "epoch": 2.0652908067542213, "grad_norm": 0.36384883522987366, "learning_rate": 0.002828, "loss": 2.2095, "step": 27520 }, { "epoch": 2.070093808630394, "grad_norm": 0.40636852383613586, "learning_rate": 0.002828, "loss": 2.2092, "step": 27584 }, { "epoch": 2.0748968105065666, "grad_norm": 0.4425170421600342, "learning_rate": 0.002828, "loss": 2.212, "step": 27648 }, { "epoch": 2.079699812382739, "grad_norm": 0.48468896746635437, "learning_rate": 0.002828, "loss": 2.2078, "step": 27712 }, { "epoch": 2.084502814258912, "grad_norm": 0.40420570969581604, "learning_rate": 0.002828, "loss": 2.2158, "step": 27776 }, { "epoch": 2.0893058161350844, "grad_norm": 0.44314709305763245, "learning_rate": 0.002828, "loss": 2.2121, "step": 27840 }, { "epoch": 2.0941088180112573, "grad_norm": 0.5187743306159973, "learning_rate": 0.002828, "loss": 2.2138, "step": 27904 }, { "epoch": 2.0989118198874297, "grad_norm": 0.4796048104763031, "learning_rate": 0.002828, "loss": 2.2027, "step": 27968 }, { "epoch": 2.103714821763602, "grad_norm": 0.43605130910873413, "learning_rate": 0.002828, "loss": 2.2029, "step": 28032 }, { "epoch": 2.108517823639775, "grad_norm": 0.4523628056049347, "learning_rate": 0.002828, "loss": 2.2038, "step": 28096 }, { "epoch": 2.1133208255159475, "grad_norm": 0.4183247983455658, "learning_rate": 0.002828, "loss": 2.2026, "step": 28160 }, { "epoch": 2.11812382739212, "grad_norm": 0.5113268494606018, "learning_rate": 0.002828, "loss": 2.1998, "step": 28224 }, { "epoch": 2.122926829268293, "grad_norm": 0.40837016701698303, "learning_rate": 0.002828, "loss": 2.2054, "step": 28288 }, { "epoch": 2.1277298311444652, "grad_norm": 0.40093889832496643, "learning_rate": 0.002828, "loss": 2.208, "step": 28352 }, { "epoch": 2.1325328330206377, "grad_norm": 0.3988894820213318, "learning_rate": 0.002828, "loss": 2.2028, "step": 28416 }, { "epoch": 2.1373358348968106, "grad_norm": 0.42024731636047363, "learning_rate": 0.002828, "loss": 2.1952, "step": 28480 }, { "epoch": 2.142138836772983, "grad_norm": 0.38691264390945435, "learning_rate": 0.002828, "loss": 2.2035, "step": 28544 }, { "epoch": 2.146941838649156, "grad_norm": 0.41956332325935364, "learning_rate": 0.002828, "loss": 2.196, "step": 28608 }, { "epoch": 2.1517448405253283, "grad_norm": 0.4035188555717468, "learning_rate": 0.002828, "loss": 2.2038, "step": 28672 }, { "epoch": 2.1565478424015008, "grad_norm": 0.35282230377197266, "learning_rate": 0.002828, "loss": 2.1835, "step": 28736 }, { "epoch": 2.1613508442776737, "grad_norm": 0.43618568778038025, "learning_rate": 0.002828, "loss": 2.1946, "step": 28800 }, { "epoch": 2.166153846153846, "grad_norm": 0.4310976564884186, "learning_rate": 0.002828, "loss": 2.1873, "step": 28864 }, { "epoch": 2.170956848030019, "grad_norm": 0.4475420415401459, "learning_rate": 0.002828, "loss": 2.1946, "step": 28928 }, { "epoch": 2.1757598499061914, "grad_norm": 0.4384845197200775, "learning_rate": 0.002828, "loss": 2.1935, "step": 28992 }, { "epoch": 2.180562851782364, "grad_norm": 0.40141811966896057, "learning_rate": 0.002828, "loss": 2.1925, "step": 29056 }, { "epoch": 2.1853658536585368, "grad_norm": 0.3754780888557434, "learning_rate": 0.002828, "loss": 2.19, "step": 29120 }, { "epoch": 2.190168855534709, "grad_norm": 0.40471306443214417, "learning_rate": 0.002828, "loss": 2.1915, "step": 29184 }, { "epoch": 2.1949718574108816, "grad_norm": 1.464024543762207, "learning_rate": 0.002828, "loss": 2.1909, "step": 29248 }, { "epoch": 2.1997748592870545, "grad_norm": 0.3818819522857666, "learning_rate": 0.002828, "loss": 2.1915, "step": 29312 }, { "epoch": 2.204577861163227, "grad_norm": 0.3688436448574066, "learning_rate": 0.002828, "loss": 2.184, "step": 29376 }, { "epoch": 2.2093808630393994, "grad_norm": 0.4367921054363251, "learning_rate": 0.002828, "loss": 2.185, "step": 29440 }, { "epoch": 2.2141838649155723, "grad_norm": 0.3566763401031494, "learning_rate": 0.002828, "loss": 2.1871, "step": 29504 }, { "epoch": 2.2189868667917447, "grad_norm": 0.4481133222579956, "learning_rate": 0.002828, "loss": 2.1826, "step": 29568 }, { "epoch": 2.2237898686679176, "grad_norm": 0.44622039794921875, "learning_rate": 0.002828, "loss": 2.1885, "step": 29632 }, { "epoch": 2.22859287054409, "grad_norm": 0.4857657253742218, "learning_rate": 0.002828, "loss": 2.1784, "step": 29696 }, { "epoch": 2.2333958724202625, "grad_norm": 0.41923773288726807, "learning_rate": 0.002828, "loss": 2.1807, "step": 29760 }, { "epoch": 2.2381988742964354, "grad_norm": 0.4176802933216095, "learning_rate": 0.002828, "loss": 2.1798, "step": 29824 }, { "epoch": 2.243001876172608, "grad_norm": 0.4086935520172119, "learning_rate": 0.002828, "loss": 2.1739, "step": 29888 }, { "epoch": 2.2478048780487807, "grad_norm": 0.40138566493988037, "learning_rate": 0.002828, "loss": 2.1857, "step": 29952 }, { "epoch": 2.252607879924953, "grad_norm": 0.393996000289917, "learning_rate": 0.002828, "loss": 2.1818, "step": 30016 }, { "epoch": 2.2574108818011256, "grad_norm": 0.3962005078792572, "learning_rate": 0.002828, "loss": 2.1724, "step": 30080 }, { "epoch": 2.2622138836772985, "grad_norm": 0.41648438572883606, "learning_rate": 0.002828, "loss": 2.1835, "step": 30144 }, { "epoch": 2.267016885553471, "grad_norm": 0.3810112774372101, "learning_rate": 0.002828, "loss": 2.1736, "step": 30208 }, { "epoch": 2.2718198874296434, "grad_norm": 0.4520975649356842, "learning_rate": 0.002828, "loss": 2.1793, "step": 30272 }, { "epoch": 2.2766228893058162, "grad_norm": 0.4406943917274475, "learning_rate": 0.002828, "loss": 2.1732, "step": 30336 }, { "epoch": 2.2814258911819887, "grad_norm": 0.4186633825302124, "learning_rate": 0.002828, "loss": 2.1774, "step": 30400 } ], "logging_steps": 64, "max_steps": 333125, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 320, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.993193875177472e+17, "train_batch_size": 200, "trial_name": null, "trial_params": null }