{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 10000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 12.391390800476074, "learning_rate": 3.920000000000001e-06, "loss": 1.8027, "step": 100 }, { "epoch": 0.01, "grad_norm": 7.110462665557861, "learning_rate": 7.92e-06, "loss": 1.6358, "step": 200 }, { "epoch": 0.01, "grad_norm": 10.526795387268066, "learning_rate": 1.1920000000000001e-05, "loss": 1.603, "step": 300 }, { "epoch": 0.02, "grad_norm": 9.175031661987305, "learning_rate": 1.5920000000000003e-05, "loss": 1.6249, "step": 400 }, { "epoch": 0.03, "grad_norm": 4.193933486938477, "learning_rate": 1.9920000000000002e-05, "loss": 1.6364, "step": 500 }, { "epoch": 0.03, "grad_norm": 8.9299955368042, "learning_rate": 1.998030150753769e-05, "loss": 1.6265, "step": 600 }, { "epoch": 0.04, "grad_norm": 11.564770698547363, "learning_rate": 1.996020100502513e-05, "loss": 1.5935, "step": 700 }, { "epoch": 0.04, "grad_norm": 9.529921531677246, "learning_rate": 1.9940100502512564e-05, "loss": 1.5959, "step": 800 }, { "epoch": 0.04, "grad_norm": 5.335429668426514, "learning_rate": 1.9920000000000002e-05, "loss": 1.6342, "step": 900 }, { "epoch": 0.05, "grad_norm": 8.102309226989746, "learning_rate": 1.9899899497487437e-05, "loss": 1.572, "step": 1000 }, { "epoch": 0.06, "grad_norm": 5.742166042327881, "learning_rate": 1.987979899497488e-05, "loss": 1.5645, "step": 1100 }, { "epoch": 0.06, "grad_norm": 5.3909735679626465, "learning_rate": 1.9859698492462313e-05, "loss": 1.547, "step": 1200 }, { "epoch": 0.07, "grad_norm": 6.765148639678955, "learning_rate": 1.983959798994975e-05, "loss": 1.5399, "step": 1300 }, { "epoch": 0.07, "grad_norm": 6.0268378257751465, "learning_rate": 1.9819497487437185e-05, "loss": 1.4985, "step": 1400 }, { "epoch": 0.07, "grad_norm": 7.305541515350342, "learning_rate": 1.9799396984924623e-05, "loss": 1.5076, "step": 1500 }, { "epoch": 0.08, "grad_norm": 8.56618595123291, "learning_rate": 1.977929648241206e-05, "loss": 1.52, "step": 1600 }, { "epoch": 0.09, "grad_norm": 5.847652435302734, "learning_rate": 1.97591959798995e-05, "loss": 1.4976, "step": 1700 }, { "epoch": 0.09, "grad_norm": 6.940663814544678, "learning_rate": 1.9739095477386937e-05, "loss": 1.4983, "step": 1800 }, { "epoch": 0.1, "grad_norm": 5.06433629989624, "learning_rate": 1.9718994974874372e-05, "loss": 1.4951, "step": 1900 }, { "epoch": 0.1, "grad_norm": 5.1144022941589355, "learning_rate": 1.969889447236181e-05, "loss": 1.5256, "step": 2000 }, { "epoch": 0.1, "grad_norm": 6.515092849731445, "learning_rate": 1.9678793969849248e-05, "loss": 1.4677, "step": 2100 }, { "epoch": 0.11, "grad_norm": 5.787613868713379, "learning_rate": 1.9658693467336686e-05, "loss": 1.4841, "step": 2200 }, { "epoch": 0.12, "grad_norm": 7.798993110656738, "learning_rate": 1.963859296482412e-05, "loss": 1.4941, "step": 2300 }, { "epoch": 0.12, "grad_norm": 4.808990955352783, "learning_rate": 1.9618492462311562e-05, "loss": 1.4775, "step": 2400 }, { "epoch": 0.12, "grad_norm": 6.113214015960693, "learning_rate": 1.9598391959798996e-05, "loss": 1.4757, "step": 2500 }, { "epoch": 0.13, "grad_norm": 6.038852214813232, "learning_rate": 1.9578291457286434e-05, "loss": 1.4413, "step": 2600 }, { "epoch": 0.14, "grad_norm": 7.736110687255859, "learning_rate": 1.955819095477387e-05, "loss": 1.5001, "step": 2700 }, { "epoch": 0.14, "grad_norm": 6.173422336578369, "learning_rate": 1.953809045226131e-05, "loss": 1.4183, "step": 2800 }, { "epoch": 0.14, "grad_norm": 5.368058681488037, "learning_rate": 1.9517989949748745e-05, "loss": 1.4877, "step": 2900 }, { "epoch": 0.15, "grad_norm": 5.35443639755249, "learning_rate": 1.9497889447236183e-05, "loss": 1.4079, "step": 3000 }, { "epoch": 0.15, "grad_norm": 8.716644287109375, "learning_rate": 1.9477788944723618e-05, "loss": 1.4386, "step": 3100 }, { "epoch": 0.16, "grad_norm": 5.639494895935059, "learning_rate": 1.945768844221106e-05, "loss": 1.4524, "step": 3200 }, { "epoch": 0.17, "grad_norm": 3.3629064559936523, "learning_rate": 1.9437587939698493e-05, "loss": 1.4218, "step": 3300 }, { "epoch": 0.17, "grad_norm": 4.7631402015686035, "learning_rate": 1.941748743718593e-05, "loss": 1.4357, "step": 3400 }, { "epoch": 0.17, "grad_norm": 6.286344528198242, "learning_rate": 1.939738693467337e-05, "loss": 1.4025, "step": 3500 }, { "epoch": 0.18, "grad_norm": 4.501611232757568, "learning_rate": 1.9377286432160804e-05, "loss": 1.4002, "step": 3600 }, { "epoch": 0.18, "grad_norm": 6.302520275115967, "learning_rate": 1.9357185929648242e-05, "loss": 1.4128, "step": 3700 }, { "epoch": 0.19, "grad_norm": 6.156075477600098, "learning_rate": 1.933708542713568e-05, "loss": 1.4136, "step": 3800 }, { "epoch": 0.2, "grad_norm": 5.4391913414001465, "learning_rate": 1.9316984924623118e-05, "loss": 1.4307, "step": 3900 }, { "epoch": 0.2, "grad_norm": 6.862305641174316, "learning_rate": 1.9296884422110552e-05, "loss": 1.3605, "step": 4000 }, { "epoch": 0.2, "grad_norm": 5.392678737640381, "learning_rate": 1.9276783919597994e-05, "loss": 1.4059, "step": 4100 }, { "epoch": 0.21, "grad_norm": 5.686226844787598, "learning_rate": 1.925668341708543e-05, "loss": 1.3474, "step": 4200 }, { "epoch": 0.21, "grad_norm": 4.506126403808594, "learning_rate": 1.9236582914572866e-05, "loss": 1.3708, "step": 4300 }, { "epoch": 0.22, "grad_norm": 7.255539894104004, "learning_rate": 1.92164824120603e-05, "loss": 1.3803, "step": 4400 }, { "epoch": 0.23, "grad_norm": 6.463212966918945, "learning_rate": 1.9196381909547742e-05, "loss": 1.3371, "step": 4500 }, { "epoch": 0.23, "grad_norm": 7.1397294998168945, "learning_rate": 1.9176281407035177e-05, "loss": 1.3787, "step": 4600 }, { "epoch": 0.23, "grad_norm": 7.188973426818848, "learning_rate": 1.9156180904522615e-05, "loss": 1.3699, "step": 4700 }, { "epoch": 0.24, "grad_norm": 4.161841869354248, "learning_rate": 1.913608040201005e-05, "loss": 1.3819, "step": 4800 }, { "epoch": 0.24, "grad_norm": 3.420564889907837, "learning_rate": 1.911597989949749e-05, "loss": 1.3719, "step": 4900 }, { "epoch": 0.25, "grad_norm": 5.769357681274414, "learning_rate": 1.9095879396984925e-05, "loss": 1.366, "step": 5000 }, { "epoch": 0.26, "grad_norm": 6.374185562133789, "learning_rate": 1.9075778894472363e-05, "loss": 1.3377, "step": 5100 }, { "epoch": 0.26, "grad_norm": 6.3521575927734375, "learning_rate": 1.90556783919598e-05, "loss": 1.3632, "step": 5200 }, { "epoch": 0.27, "grad_norm": 4.51761531829834, "learning_rate": 1.903557788944724e-05, "loss": 1.3505, "step": 5300 }, { "epoch": 0.27, "grad_norm": 6.074390411376953, "learning_rate": 1.9015477386934674e-05, "loss": 1.3644, "step": 5400 }, { "epoch": 0.28, "grad_norm": 4.369632244110107, "learning_rate": 1.8995376884422112e-05, "loss": 1.3807, "step": 5500 }, { "epoch": 0.28, "grad_norm": 7.657780170440674, "learning_rate": 1.897527638190955e-05, "loss": 1.3125, "step": 5600 }, { "epoch": 0.28, "grad_norm": 9.048200607299805, "learning_rate": 1.8955175879396988e-05, "loss": 1.3216, "step": 5700 }, { "epoch": 0.29, "grad_norm": 5.997036933898926, "learning_rate": 1.8935075376884426e-05, "loss": 1.3262, "step": 5800 }, { "epoch": 0.29, "grad_norm": 4.751107692718506, "learning_rate": 1.891497487437186e-05, "loss": 1.3566, "step": 5900 }, { "epoch": 0.3, "grad_norm": 5.662681579589844, "learning_rate": 1.88948743718593e-05, "loss": 1.3645, "step": 6000 }, { "epoch": 0.3, "grad_norm": 5.755290508270264, "learning_rate": 1.887497487437186e-05, "loss": 1.2714, "step": 6100 }, { "epoch": 0.31, "grad_norm": 5.199550151824951, "learning_rate": 1.88548743718593e-05, "loss": 1.3427, "step": 6200 }, { "epoch": 0.32, "grad_norm": 7.531371116638184, "learning_rate": 1.8834773869346733e-05, "loss": 1.3198, "step": 6300 }, { "epoch": 0.32, "grad_norm": 4.267923831939697, "learning_rate": 1.881467336683417e-05, "loss": 1.334, "step": 6400 }, { "epoch": 0.33, "grad_norm": 5.429295063018799, "learning_rate": 1.879457286432161e-05, "loss": 1.2949, "step": 6500 }, { "epoch": 0.33, "grad_norm": 4.842006206512451, "learning_rate": 1.8774472361809047e-05, "loss": 1.3123, "step": 6600 }, { "epoch": 0.34, "grad_norm": 4.693381309509277, "learning_rate": 1.8754371859296482e-05, "loss": 1.3218, "step": 6700 }, { "epoch": 0.34, "grad_norm": 3.555487632751465, "learning_rate": 1.8734271356783923e-05, "loss": 1.3077, "step": 6800 }, { "epoch": 0.34, "grad_norm": 7.314678192138672, "learning_rate": 1.8714170854271358e-05, "loss": 1.2855, "step": 6900 }, { "epoch": 0.35, "grad_norm": 6.160294532775879, "learning_rate": 1.8694070351758796e-05, "loss": 1.2901, "step": 7000 }, { "epoch": 0.35, "grad_norm": 7.399959087371826, "learning_rate": 1.867396984924623e-05, "loss": 1.264, "step": 7100 }, { "epoch": 0.36, "grad_norm": 4.204007625579834, "learning_rate": 1.8653869346733672e-05, "loss": 1.323, "step": 7200 }, { "epoch": 0.36, "grad_norm": 5.531479358673096, "learning_rate": 1.8633768844221106e-05, "loss": 1.3211, "step": 7300 }, { "epoch": 0.37, "grad_norm": 4.645538806915283, "learning_rate": 1.8613668341708544e-05, "loss": 1.2941, "step": 7400 }, { "epoch": 0.38, "grad_norm": 6.326472282409668, "learning_rate": 1.8593567839195982e-05, "loss": 1.3025, "step": 7500 }, { "epoch": 0.38, "grad_norm": 6.338307857513428, "learning_rate": 1.857346733668342e-05, "loss": 1.2924, "step": 7600 }, { "epoch": 0.39, "grad_norm": 7.802080154418945, "learning_rate": 1.8553366834170855e-05, "loss": 1.3061, "step": 7700 }, { "epoch": 0.39, "grad_norm": 4.98875093460083, "learning_rate": 1.8533266331658293e-05, "loss": 1.321, "step": 7800 }, { "epoch": 0.4, "grad_norm": 5.888318061828613, "learning_rate": 1.851316582914573e-05, "loss": 1.2746, "step": 7900 }, { "epoch": 0.4, "grad_norm": 6.636387825012207, "learning_rate": 1.849306532663317e-05, "loss": 1.2653, "step": 8000 }, { "epoch": 0.41, "grad_norm": 6.1142449378967285, "learning_rate": 1.8473165829145728e-05, "loss": 1.2347, "step": 8100 }, { "epoch": 0.41, "grad_norm": 5.41117525100708, "learning_rate": 1.845306532663317e-05, "loss": 1.3062, "step": 8200 }, { "epoch": 0.41, "grad_norm": 5.025302886962891, "learning_rate": 1.8432964824120604e-05, "loss": 1.3162, "step": 8300 }, { "epoch": 0.42, "grad_norm": 7.1088972091674805, "learning_rate": 1.8412864321608042e-05, "loss": 1.2573, "step": 8400 }, { "epoch": 0.42, "grad_norm": 5.86447811126709, "learning_rate": 1.839276381909548e-05, "loss": 1.2855, "step": 8500 }, { "epoch": 0.43, "grad_norm": 4.323820114135742, "learning_rate": 1.8372663316582918e-05, "loss": 1.2272, "step": 8600 }, { "epoch": 0.43, "grad_norm": 7.335355758666992, "learning_rate": 1.8352562814070352e-05, "loss": 1.2718, "step": 8700 }, { "epoch": 0.44, "grad_norm": 5.308874130249023, "learning_rate": 1.833246231155779e-05, "loss": 1.2727, "step": 8800 }, { "epoch": 0.45, "grad_norm": 3.919790506362915, "learning_rate": 1.8312361809045228e-05, "loss": 1.28, "step": 8900 }, { "epoch": 0.45, "grad_norm": 7.291688442230225, "learning_rate": 1.8292261306532663e-05, "loss": 1.2768, "step": 9000 }, { "epoch": 0.46, "grad_norm": 5.098793029785156, "learning_rate": 1.8272160804020104e-05, "loss": 1.2441, "step": 9100 }, { "epoch": 0.46, "grad_norm": 5.242636203765869, "learning_rate": 1.825206030150754e-05, "loss": 1.2534, "step": 9200 }, { "epoch": 0.47, "grad_norm": 5.310051918029785, "learning_rate": 1.8231959798994977e-05, "loss": 1.2878, "step": 9300 }, { "epoch": 0.47, "grad_norm": 6.058734893798828, "learning_rate": 1.821185929648241e-05, "loss": 1.2964, "step": 9400 }, { "epoch": 0.47, "grad_norm": 6.912698745727539, "learning_rate": 1.8191758793969853e-05, "loss": 1.2511, "step": 9500 }, { "epoch": 0.48, "grad_norm": 6.428102016448975, "learning_rate": 1.8171658291457287e-05, "loss": 1.2605, "step": 9600 }, { "epoch": 0.48, "grad_norm": 5.642975807189941, "learning_rate": 1.8151557788944725e-05, "loss": 1.264, "step": 9700 }, { "epoch": 0.49, "grad_norm": 6.23274040222168, "learning_rate": 1.813145728643216e-05, "loss": 1.2583, "step": 9800 }, { "epoch": 0.49, "grad_norm": 7.3280792236328125, "learning_rate": 1.81113567839196e-05, "loss": 1.2324, "step": 9900 }, { "epoch": 0.5, "grad_norm": 6.048460483551025, "learning_rate": 1.8091256281407036e-05, "loss": 1.2477, "step": 10000 }, { "epoch": 0.5, "eval_loss": 1.2569069862365723, "eval_runtime": 21.5797, "eval_samples_per_second": 46.34, "eval_steps_per_second": 5.792, "step": 10000 }, { "epoch": 0.51, "grad_norm": 5.294989109039307, "learning_rate": 1.80713567839196e-05, "loss": 1.3038, "step": 10100 }, { "epoch": 0.51, "grad_norm": 6.7187981605529785, "learning_rate": 1.8051256281407036e-05, "loss": 1.2584, "step": 10200 }, { "epoch": 0.52, "grad_norm": 7.11021089553833, "learning_rate": 1.8031155778894474e-05, "loss": 1.2612, "step": 10300 }, { "epoch": 0.52, "grad_norm": 6.111474990844727, "learning_rate": 1.801105527638191e-05, "loss": 1.2638, "step": 10400 }, { "epoch": 0.53, "grad_norm": 6.04983377456665, "learning_rate": 1.799095477386935e-05, "loss": 1.2381, "step": 10500 }, { "epoch": 0.53, "grad_norm": 5.682928562164307, "learning_rate": 1.7970854271356785e-05, "loss": 1.233, "step": 10600 }, { "epoch": 0.54, "grad_norm": 6.028292179107666, "learning_rate": 1.7950753768844223e-05, "loss": 1.2572, "step": 10700 }, { "epoch": 0.54, "grad_norm": 4.738650798797607, "learning_rate": 1.793065326633166e-05, "loss": 1.2125, "step": 10800 }, { "epoch": 0.55, "grad_norm": 5.227931976318359, "learning_rate": 1.7910753768844223e-05, "loss": 1.2862, "step": 10900 }, { "epoch": 0.55, "grad_norm": 6.476836204528809, "learning_rate": 1.7890653266331658e-05, "loss": 1.243, "step": 11000 }, { "epoch": 0.56, "grad_norm": 4.261963844299316, "learning_rate": 1.78705527638191e-05, "loss": 1.2118, "step": 11100 }, { "epoch": 0.56, "grad_norm": 6.414599418640137, "learning_rate": 1.7850452261306534e-05, "loss": 1.222, "step": 11200 }, { "epoch": 0.56, "grad_norm": 5.642942905426025, "learning_rate": 1.783035175879397e-05, "loss": 1.1809, "step": 11300 }, { "epoch": 0.57, "grad_norm": 4.094428539276123, "learning_rate": 1.781025125628141e-05, "loss": 1.2362, "step": 11400 }, { "epoch": 0.57, "grad_norm": 5.5772881507873535, "learning_rate": 1.7790150753768847e-05, "loss": 1.2005, "step": 11500 }, { "epoch": 0.58, "grad_norm": 4.420604705810547, "learning_rate": 1.7770050251256282e-05, "loss": 1.2138, "step": 11600 }, { "epoch": 0.58, "grad_norm": 5.298806667327881, "learning_rate": 1.774994974874372e-05, "loss": 1.1693, "step": 11700 }, { "epoch": 0.59, "grad_norm": 5.862612247467041, "learning_rate": 1.7729849246231158e-05, "loss": 1.1728, "step": 11800 }, { "epoch": 0.59, "grad_norm": 3.835301637649536, "learning_rate": 1.7709748743718593e-05, "loss": 1.2159, "step": 11900 }, { "epoch": 0.6, "grad_norm": 5.67401123046875, "learning_rate": 1.768964824120603e-05, "loss": 1.2393, "step": 12000 }, { "epoch": 0.6, "grad_norm": 5.424498558044434, "learning_rate": 1.766954773869347e-05, "loss": 1.2255, "step": 12100 }, { "epoch": 0.61, "grad_norm": 5.532503604888916, "learning_rate": 1.7649447236180907e-05, "loss": 1.2024, "step": 12200 }, { "epoch": 0.61, "grad_norm": 5.404232501983643, "learning_rate": 1.762934673366834e-05, "loss": 1.2202, "step": 12300 }, { "epoch": 0.62, "grad_norm": 3.9564428329467773, "learning_rate": 1.7609246231155782e-05, "loss": 1.1655, "step": 12400 }, { "epoch": 0.62, "grad_norm": 3.2090141773223877, "learning_rate": 1.7589145728643217e-05, "loss": 1.1563, "step": 12500 }, { "epoch": 0.63, "grad_norm": 6.341458320617676, "learning_rate": 1.7569045226130655e-05, "loss": 1.1982, "step": 12600 }, { "epoch": 0.64, "grad_norm": 7.190246105194092, "learning_rate": 1.754894472361809e-05, "loss": 1.1817, "step": 12700 }, { "epoch": 0.64, "grad_norm": 6.108299255371094, "learning_rate": 1.752884422110553e-05, "loss": 1.2123, "step": 12800 }, { "epoch": 0.65, "grad_norm": 5.769379615783691, "learning_rate": 1.7508743718592966e-05, "loss": 1.1964, "step": 12900 }, { "epoch": 0.65, "grad_norm": 5.177648067474365, "learning_rate": 1.7488643216080404e-05, "loss": 1.2103, "step": 13000 }, { "epoch": 0.66, "grad_norm": 5.531684875488281, "learning_rate": 1.7468542713567838e-05, "loss": 1.1801, "step": 13100 }, { "epoch": 0.66, "grad_norm": 5.700603008270264, "learning_rate": 1.744844221105528e-05, "loss": 1.1943, "step": 13200 }, { "epoch": 0.67, "grad_norm": 9.25114917755127, "learning_rate": 1.7428341708542714e-05, "loss": 1.2286, "step": 13300 }, { "epoch": 0.67, "grad_norm": 4.238541126251221, "learning_rate": 1.7408241206030152e-05, "loss": 1.1869, "step": 13400 }, { "epoch": 0.68, "grad_norm": 5.6147260665893555, "learning_rate": 1.738814070351759e-05, "loss": 1.1854, "step": 13500 }, { "epoch": 0.68, "grad_norm": 4.879734039306641, "learning_rate": 1.7368040201005028e-05, "loss": 1.1941, "step": 13600 }, { "epoch": 0.69, "grad_norm": 3.612379312515259, "learning_rate": 1.7347939698492463e-05, "loss": 1.1649, "step": 13700 }, { "epoch": 0.69, "grad_norm": 4.583663463592529, "learning_rate": 1.73278391959799e-05, "loss": 1.1796, "step": 13800 }, { "epoch": 0.69, "grad_norm": 4.3080339431762695, "learning_rate": 1.7307939698492463e-05, "loss": 1.2092, "step": 13900 }, { "epoch": 0.7, "grad_norm": 5.9151506423950195, "learning_rate": 1.72878391959799e-05, "loss": 1.1809, "step": 14000 }, { "epoch": 0.7, "grad_norm": 5.167910575866699, "learning_rate": 1.726773869346734e-05, "loss": 1.2063, "step": 14100 }, { "epoch": 0.71, "grad_norm": 7.372837543487549, "learning_rate": 1.7247638190954777e-05, "loss": 1.147, "step": 14200 }, { "epoch": 0.71, "grad_norm": 3.6992413997650146, "learning_rate": 1.722753768844221e-05, "loss": 1.2312, "step": 14300 }, { "epoch": 0.72, "grad_norm": 6.654348850250244, "learning_rate": 1.720743718592965e-05, "loss": 1.1956, "step": 14400 }, { "epoch": 0.72, "grad_norm": 4.683749675750732, "learning_rate": 1.7187336683417087e-05, "loss": 1.1598, "step": 14500 }, { "epoch": 0.73, "grad_norm": 5.769094467163086, "learning_rate": 1.7167236180904522e-05, "loss": 1.1387, "step": 14600 }, { "epoch": 0.73, "grad_norm": 7.586219310760498, "learning_rate": 1.7147135678391963e-05, "loss": 1.1994, "step": 14700 }, { "epoch": 0.74, "grad_norm": 5.573954105377197, "learning_rate": 1.7127035175879398e-05, "loss": 1.1887, "step": 14800 }, { "epoch": 0.74, "grad_norm": 6.4866251945495605, "learning_rate": 1.7106934673366836e-05, "loss": 1.1892, "step": 14900 }, { "epoch": 0.75, "grad_norm": 4.954825401306152, "learning_rate": 1.708683417085427e-05, "loss": 1.1742, "step": 15000 }, { "epoch": 0.76, "grad_norm": 3.952847480773926, "learning_rate": 1.7066733668341712e-05, "loss": 1.143, "step": 15100 }, { "epoch": 0.76, "grad_norm": 5.170006275177002, "learning_rate": 1.7046633165829146e-05, "loss": 1.1881, "step": 15200 }, { "epoch": 0.77, "grad_norm": 4.910400390625, "learning_rate": 1.7026532663316584e-05, "loss": 1.131, "step": 15300 }, { "epoch": 0.77, "grad_norm": 4.728166580200195, "learning_rate": 1.700643216080402e-05, "loss": 1.1854, "step": 15400 }, { "epoch": 0.78, "grad_norm": 6.516223430633545, "learning_rate": 1.698633165829146e-05, "loss": 1.2069, "step": 15500 }, { "epoch": 0.78, "grad_norm": 5.914300918579102, "learning_rate": 1.6966231155778895e-05, "loss": 1.1663, "step": 15600 }, { "epoch": 0.79, "grad_norm": 4.6894378662109375, "learning_rate": 1.6946130653266333e-05, "loss": 1.145, "step": 15700 }, { "epoch": 0.79, "grad_norm": 4.994019031524658, "learning_rate": 1.692603015075377e-05, "loss": 1.1156, "step": 15800 }, { "epoch": 0.8, "grad_norm": 5.994630813598633, "learning_rate": 1.690592964824121e-05, "loss": 1.1583, "step": 15900 }, { "epoch": 0.8, "grad_norm": 6.7444562911987305, "learning_rate": 1.6885829145728643e-05, "loss": 1.1821, "step": 16000 }, { "epoch": 0.81, "grad_norm": 5.461032867431641, "learning_rate": 1.686572864321608e-05, "loss": 1.1388, "step": 16100 }, { "epoch": 0.81, "grad_norm": 5.0775251388549805, "learning_rate": 1.684562814070352e-05, "loss": 1.1576, "step": 16200 }, { "epoch": 0.81, "grad_norm": 4.469027042388916, "learning_rate": 1.6825527638190957e-05, "loss": 1.1792, "step": 16300 }, { "epoch": 0.82, "grad_norm": 6.780773639678955, "learning_rate": 1.6805427135678395e-05, "loss": 1.1441, "step": 16400 }, { "epoch": 0.82, "grad_norm": 6.338268756866455, "learning_rate": 1.678532663316583e-05, "loss": 1.1087, "step": 16500 }, { "epoch": 0.83, "grad_norm": 4.28759241104126, "learning_rate": 1.6765226130653268e-05, "loss": 1.1616, "step": 16600 }, { "epoch": 0.83, "grad_norm": 4.656599998474121, "learning_rate": 1.6745125628140706e-05, "loss": 1.1086, "step": 16700 }, { "epoch": 0.84, "grad_norm": 4.506341457366943, "learning_rate": 1.6725025125628144e-05, "loss": 1.1821, "step": 16800 }, { "epoch": 0.84, "grad_norm": 5.074087142944336, "learning_rate": 1.670492462311558e-05, "loss": 1.1376, "step": 16900 }, { "epoch": 0.85, "grad_norm": 4.427557468414307, "learning_rate": 1.6684824120603016e-05, "loss": 1.1608, "step": 17000 }, { "epoch": 0.85, "grad_norm": 4.684313774108887, "learning_rate": 1.666472361809045e-05, "loss": 1.1374, "step": 17100 }, { "epoch": 0.86, "grad_norm": 4.981125354766846, "learning_rate": 1.6644623115577892e-05, "loss": 1.1157, "step": 17200 }, { "epoch": 0.86, "grad_norm": 6.36452579498291, "learning_rate": 1.6624522613065327e-05, "loss": 1.1547, "step": 17300 }, { "epoch": 0.87, "grad_norm": 4.986701011657715, "learning_rate": 1.6604422110552765e-05, "loss": 1.147, "step": 17400 }, { "epoch": 0.88, "grad_norm": 6.206230640411377, "learning_rate": 1.6584321608040203e-05, "loss": 1.1235, "step": 17500 }, { "epoch": 0.88, "grad_norm": 5.597214221954346, "learning_rate": 1.656422110552764e-05, "loss": 1.1472, "step": 17600 }, { "epoch": 0.89, "grad_norm": 5.753964424133301, "learning_rate": 1.6544120603015076e-05, "loss": 1.0838, "step": 17700 }, { "epoch": 0.89, "grad_norm": 5.263125896453857, "learning_rate": 1.6524020100502513e-05, "loss": 1.1149, "step": 17800 }, { "epoch": 0.9, "grad_norm": 2.9451704025268555, "learning_rate": 1.6504120603015076e-05, "loss": 1.1162, "step": 17900 }, { "epoch": 0.9, "grad_norm": 6.694633960723877, "learning_rate": 1.6484020100502514e-05, "loss": 1.1268, "step": 18000 }, { "epoch": 0.91, "grad_norm": 5.449553489685059, "learning_rate": 1.6463919597989952e-05, "loss": 1.1307, "step": 18100 }, { "epoch": 0.91, "grad_norm": 5.502272129058838, "learning_rate": 1.644381909547739e-05, "loss": 1.1031, "step": 18200 }, { "epoch": 0.92, "grad_norm": 6.899608612060547, "learning_rate": 1.6423718592964824e-05, "loss": 1.1389, "step": 18300 }, { "epoch": 0.92, "grad_norm": 4.675032615661621, "learning_rate": 1.6403618090452262e-05, "loss": 1.1541, "step": 18400 }, { "epoch": 0.93, "grad_norm": 7.353012561798096, "learning_rate": 1.63835175879397e-05, "loss": 1.1213, "step": 18500 }, { "epoch": 0.93, "grad_norm": 4.253681659698486, "learning_rate": 1.636341708542714e-05, "loss": 1.1672, "step": 18600 }, { "epoch": 0.94, "grad_norm": 6.5902018547058105, "learning_rate": 1.6343316582914573e-05, "loss": 1.1349, "step": 18700 }, { "epoch": 0.94, "grad_norm": 5.40578556060791, "learning_rate": 1.632321608040201e-05, "loss": 1.1283, "step": 18800 }, { "epoch": 0.94, "grad_norm": 3.9744160175323486, "learning_rate": 1.630311557788945e-05, "loss": 1.1463, "step": 18900 }, { "epoch": 0.95, "grad_norm": 6.115358352661133, "learning_rate": 1.6283015075376887e-05, "loss": 1.1443, "step": 19000 }, { "epoch": 0.95, "grad_norm": 2.9785940647125244, "learning_rate": 1.6262914572864325e-05, "loss": 1.1409, "step": 19100 }, { "epoch": 0.96, "grad_norm": 5.200758934020996, "learning_rate": 1.6243015075376887e-05, "loss": 1.1629, "step": 19200 }, { "epoch": 0.96, "grad_norm": 5.975739479064941, "learning_rate": 1.6222914572864322e-05, "loss": 1.083, "step": 19300 }, { "epoch": 0.97, "grad_norm": 6.220870018005371, "learning_rate": 1.620281407035176e-05, "loss": 1.1305, "step": 19400 }, { "epoch": 0.97, "grad_norm": 4.187997341156006, "learning_rate": 1.6182713567839198e-05, "loss": 1.1028, "step": 19500 }, { "epoch": 0.98, "grad_norm": 5.540648937225342, "learning_rate": 1.6162613065326636e-05, "loss": 1.1176, "step": 19600 }, { "epoch": 0.98, "grad_norm": 5.99765157699585, "learning_rate": 1.6142512562814074e-05, "loss": 1.0932, "step": 19700 }, { "epoch": 0.99, "grad_norm": 4.647700786590576, "learning_rate": 1.6122412060301508e-05, "loss": 1.1294, "step": 19800 }, { "epoch": 0.99, "grad_norm": 6.05048131942749, "learning_rate": 1.6102311557788946e-05, "loss": 1.0828, "step": 19900 }, { "epoch": 1.0, "grad_norm": 4.912966251373291, "learning_rate": 1.608221105527638e-05, "loss": 1.0975, "step": 20000 }, { "epoch": 1.0, "eval_loss": 1.132000207901001, "eval_runtime": 21.5853, "eval_samples_per_second": 46.328, "eval_steps_per_second": 5.791, "step": 20000 }, { "epoch": 1.0, "grad_norm": 5.5869340896606445, "learning_rate": 1.6062110552763822e-05, "loss": 1.1428, "step": 20100 }, { "epoch": 1.01, "grad_norm": 4.5555739402771, "learning_rate": 1.6042010050251257e-05, "loss": 1.0939, "step": 20200 }, { "epoch": 1.01, "grad_norm": 3.527172803878784, "learning_rate": 1.6021909547738695e-05, "loss": 1.1184, "step": 20300 }, { "epoch": 1.02, "grad_norm": 2.7429285049438477, "learning_rate": 1.600180904522613e-05, "loss": 1.1028, "step": 20400 }, { "epoch": 1.02, "grad_norm": 3.6536190509796143, "learning_rate": 1.598170854271357e-05, "loss": 1.0954, "step": 20500 }, { "epoch": 1.03, "grad_norm": 4.48521089553833, "learning_rate": 1.5961608040201005e-05, "loss": 1.1001, "step": 20600 }, { "epoch": 1.03, "grad_norm": 7.937503814697266, "learning_rate": 1.5941507537688443e-05, "loss": 1.0676, "step": 20700 }, { "epoch": 1.04, "grad_norm": 7.802252769470215, "learning_rate": 1.592140703517588e-05, "loss": 1.1007, "step": 20800 }, { "epoch": 1.04, "grad_norm": 6.141603469848633, "learning_rate": 1.590130653266332e-05, "loss": 1.0749, "step": 20900 }, { "epoch": 1.05, "grad_norm": 5.166286945343018, "learning_rate": 1.5881206030150754e-05, "loss": 1.0704, "step": 21000 }, { "epoch": 1.05, "grad_norm": 5.407045364379883, "learning_rate": 1.5861105527638192e-05, "loss": 1.0852, "step": 21100 }, { "epoch": 1.06, "grad_norm": 5.4536967277526855, "learning_rate": 1.584100502512563e-05, "loss": 1.1152, "step": 21200 }, { "epoch": 1.06, "grad_norm": 5.464064121246338, "learning_rate": 1.5820904522613068e-05, "loss": 1.0546, "step": 21300 }, { "epoch": 1.07, "grad_norm": 3.853875160217285, "learning_rate": 1.580100502512563e-05, "loss": 1.0858, "step": 21400 }, { "epoch": 1.07, "grad_norm": 4.8497443199157715, "learning_rate": 1.5780904522613068e-05, "loss": 1.0973, "step": 21500 }, { "epoch": 1.08, "grad_norm": 4.255434513092041, "learning_rate": 1.5760804020100503e-05, "loss": 1.0872, "step": 21600 }, { "epoch": 1.08, "grad_norm": 4.134657382965088, "learning_rate": 1.574070351758794e-05, "loss": 1.1127, "step": 21700 }, { "epoch": 1.09, "grad_norm": 9.425840377807617, "learning_rate": 1.572060301507538e-05, "loss": 1.1147, "step": 21800 }, { "epoch": 1.09, "grad_norm": 5.42075777053833, "learning_rate": 1.5700502512562817e-05, "loss": 1.0719, "step": 21900 }, { "epoch": 1.1, "grad_norm": 5.076992988586426, "learning_rate": 1.5680402010050255e-05, "loss": 1.095, "step": 22000 }, { "epoch": 1.1, "grad_norm": 5.668195724487305, "learning_rate": 1.566030150753769e-05, "loss": 1.0799, "step": 22100 }, { "epoch": 1.11, "grad_norm": 5.9342474937438965, "learning_rate": 1.5640201005025127e-05, "loss": 1.0965, "step": 22200 }, { "epoch": 1.11, "grad_norm": 5.112601280212402, "learning_rate": 1.5620100502512565e-05, "loss": 1.0951, "step": 22300 }, { "epoch": 1.12, "grad_norm": 10.206339836120605, "learning_rate": 1.5600000000000003e-05, "loss": 1.0837, "step": 22400 }, { "epoch": 1.12, "grad_norm": 3.8015480041503906, "learning_rate": 1.5579899497487438e-05, "loss": 1.0871, "step": 22500 }, { "epoch": 1.13, "grad_norm": 4.524369239807129, "learning_rate": 1.5559798994974876e-05, "loss": 1.1263, "step": 22600 }, { "epoch": 1.14, "grad_norm": 5.1671671867370605, "learning_rate": 1.553969849246231e-05, "loss": 1.085, "step": 22700 }, { "epoch": 1.14, "grad_norm": 4.96006965637207, "learning_rate": 1.551959798994975e-05, "loss": 1.0893, "step": 22800 }, { "epoch": 1.15, "grad_norm": 6.482675075531006, "learning_rate": 1.5499497487437186e-05, "loss": 1.0667, "step": 22900 }, { "epoch": 1.15, "grad_norm": 4.591585636138916, "learning_rate": 1.5479396984924624e-05, "loss": 1.0861, "step": 23000 }, { "epoch": 1.16, "grad_norm": 4.026520729064941, "learning_rate": 1.5459296482412062e-05, "loss": 1.0772, "step": 23100 }, { "epoch": 1.16, "grad_norm": 5.972117900848389, "learning_rate": 1.54391959798995e-05, "loss": 1.0818, "step": 23200 }, { "epoch": 1.17, "grad_norm": 4.737887382507324, "learning_rate": 1.5419095477386935e-05, "loss": 1.0752, "step": 23300 }, { "epoch": 1.17, "grad_norm": 4.748262882232666, "learning_rate": 1.5398994974874373e-05, "loss": 1.0803, "step": 23400 }, { "epoch": 1.18, "grad_norm": 4.94175386428833, "learning_rate": 1.537889447236181e-05, "loss": 1.0754, "step": 23500 }, { "epoch": 1.18, "grad_norm": 4.3259172439575195, "learning_rate": 1.535879396984925e-05, "loss": 1.0463, "step": 23600 }, { "epoch": 1.19, "grad_norm": 5.240546703338623, "learning_rate": 1.5338693467336687e-05, "loss": 1.0547, "step": 23700 }, { "epoch": 1.19, "grad_norm": 6.120886325836182, "learning_rate": 1.531859296482412e-05, "loss": 1.0861, "step": 23800 }, { "epoch": 1.2, "grad_norm": 5.634921550750732, "learning_rate": 1.529849246231156e-05, "loss": 1.0722, "step": 23900 }, { "epoch": 1.2, "grad_norm": 5.39201021194458, "learning_rate": 1.5278391959798997e-05, "loss": 1.07, "step": 24000 }, { "epoch": 1.21, "grad_norm": 6.85221004486084, "learning_rate": 1.5258291457286433e-05, "loss": 1.0578, "step": 24100 }, { "epoch": 1.21, "grad_norm": 4.522882461547852, "learning_rate": 1.523819095477387e-05, "loss": 1.0895, "step": 24200 }, { "epoch": 1.22, "grad_norm": 4.020057201385498, "learning_rate": 1.5218090452261308e-05, "loss": 1.0377, "step": 24300 }, { "epoch": 1.22, "grad_norm": 4.188474655151367, "learning_rate": 1.5197989949748746e-05, "loss": 1.0469, "step": 24400 }, { "epoch": 1.23, "grad_norm": 6.872804164886475, "learning_rate": 1.5177889447236182e-05, "loss": 1.0795, "step": 24500 }, { "epoch": 1.23, "grad_norm": 5.834617614746094, "learning_rate": 1.515778894472362e-05, "loss": 1.0827, "step": 24600 }, { "epoch": 1.23, "grad_norm": 4.008932590484619, "learning_rate": 1.5137688442211056e-05, "loss": 1.069, "step": 24700 }, { "epoch": 1.24, "grad_norm": 5.309475898742676, "learning_rate": 1.5117587939698494e-05, "loss": 1.0668, "step": 24800 }, { "epoch": 1.25, "grad_norm": 6.02021598815918, "learning_rate": 1.5097487437185932e-05, "loss": 1.0611, "step": 24900 }, { "epoch": 1.25, "grad_norm": 4.143280029296875, "learning_rate": 1.5077587939698495e-05, "loss": 1.0526, "step": 25000 }, { "epoch": 1.25, "grad_norm": 4.231622695922852, "learning_rate": 1.505748743718593e-05, "loss": 1.0706, "step": 25100 }, { "epoch": 1.26, "grad_norm": 4.0399322509765625, "learning_rate": 1.5037386934673369e-05, "loss": 1.0878, "step": 25200 }, { "epoch": 1.27, "grad_norm": 4.2283759117126465, "learning_rate": 1.5017286432160805e-05, "loss": 1.0903, "step": 25300 }, { "epoch": 1.27, "grad_norm": 6.159567356109619, "learning_rate": 1.4997185929648241e-05, "loss": 1.069, "step": 25400 }, { "epoch": 1.27, "grad_norm": 5.181605815887451, "learning_rate": 1.4977085427135681e-05, "loss": 1.0712, "step": 25500 }, { "epoch": 1.28, "grad_norm": 4.90966796875, "learning_rate": 1.4956984924623117e-05, "loss": 1.0672, "step": 25600 }, { "epoch": 1.28, "grad_norm": 4.765697479248047, "learning_rate": 1.4936884422110554e-05, "loss": 1.0338, "step": 25700 }, { "epoch": 1.29, "grad_norm": 4.3462018966674805, "learning_rate": 1.491678391959799e-05, "loss": 1.0408, "step": 25800 }, { "epoch": 1.29, "grad_norm": 5.249480247497559, "learning_rate": 1.489668341708543e-05, "loss": 1.0576, "step": 25900 }, { "epoch": 1.3, "grad_norm": 5.543900489807129, "learning_rate": 1.4876582914572866e-05, "loss": 1.0651, "step": 26000 }, { "epoch": 1.3, "grad_norm": 6.526113033294678, "learning_rate": 1.4856482412060302e-05, "loss": 1.0596, "step": 26100 }, { "epoch": 1.31, "grad_norm": 4.725895404815674, "learning_rate": 1.4836381909547738e-05, "loss": 1.0969, "step": 26200 }, { "epoch": 1.31, "grad_norm": 6.068490028381348, "learning_rate": 1.4816281407035178e-05, "loss": 1.0284, "step": 26300 }, { "epoch": 1.32, "grad_norm": 4.363389015197754, "learning_rate": 1.4796180904522614e-05, "loss": 1.0589, "step": 26400 }, { "epoch": 1.32, "grad_norm": 6.8659257888793945, "learning_rate": 1.477608040201005e-05, "loss": 1.0803, "step": 26500 }, { "epoch": 1.33, "grad_norm": 5.061355113983154, "learning_rate": 1.4755979899497489e-05, "loss": 1.066, "step": 26600 }, { "epoch": 1.33, "grad_norm": 4.511940956115723, "learning_rate": 1.4735879396984927e-05, "loss": 1.0447, "step": 26700 }, { "epoch": 1.34, "grad_norm": 4.449003219604492, "learning_rate": 1.4715778894472363e-05, "loss": 1.0532, "step": 26800 }, { "epoch": 1.34, "grad_norm": 5.1782307624816895, "learning_rate": 1.46956783919598e-05, "loss": 1.0608, "step": 26900 }, { "epoch": 1.35, "grad_norm": 5.087260723114014, "learning_rate": 1.4675577889447237e-05, "loss": 1.0371, "step": 27000 }, { "epoch": 1.35, "grad_norm": 4.387496471405029, "learning_rate": 1.4655477386934675e-05, "loss": 1.055, "step": 27100 }, { "epoch": 1.36, "grad_norm": 4.9253010749816895, "learning_rate": 1.4635376884422113e-05, "loss": 1.0385, "step": 27200 }, { "epoch": 1.36, "grad_norm": 4.611992835998535, "learning_rate": 1.461527638190955e-05, "loss": 1.0338, "step": 27300 }, { "epoch": 1.37, "grad_norm": 2.981304168701172, "learning_rate": 1.4595175879396986e-05, "loss": 1.0516, "step": 27400 }, { "epoch": 1.38, "grad_norm": 5.678966045379639, "learning_rate": 1.4575075376884422e-05, "loss": 1.0788, "step": 27500 }, { "epoch": 1.38, "grad_norm": 5.3079752922058105, "learning_rate": 1.4554974874371862e-05, "loss": 1.0853, "step": 27600 }, { "epoch": 1.39, "grad_norm": 5.990561485290527, "learning_rate": 1.4534874371859298e-05, "loss": 1.0187, "step": 27700 }, { "epoch": 1.39, "grad_norm": 7.396142482757568, "learning_rate": 1.4514773869346734e-05, "loss": 1.0694, "step": 27800 }, { "epoch": 1.4, "grad_norm": 4.319200038909912, "learning_rate": 1.449467336683417e-05, "loss": 1.0668, "step": 27900 }, { "epoch": 1.4, "grad_norm": 2.7691450119018555, "learning_rate": 1.447457286432161e-05, "loss": 1.0652, "step": 28000 }, { "epoch": 1.41, "grad_norm": 8.814241409301758, "learning_rate": 1.4454472361809046e-05, "loss": 1.0423, "step": 28100 }, { "epoch": 1.41, "grad_norm": 5.264801979064941, "learning_rate": 1.4434371859296483e-05, "loss": 1.0918, "step": 28200 }, { "epoch": 1.42, "grad_norm": 4.573727130889893, "learning_rate": 1.441427135678392e-05, "loss": 1.0822, "step": 28300 }, { "epoch": 1.42, "grad_norm": 3.6568844318389893, "learning_rate": 1.4394170854271359e-05, "loss": 1.0492, "step": 28400 }, { "epoch": 1.43, "grad_norm": 4.999285697937012, "learning_rate": 1.437427135678392e-05, "loss": 1.0583, "step": 28500 }, { "epoch": 1.43, "grad_norm": 4.125443458557129, "learning_rate": 1.4354170854271359e-05, "loss": 1.0422, "step": 28600 }, { "epoch": 1.44, "grad_norm": 6.014279365539551, "learning_rate": 1.4334070351758795e-05, "loss": 1.0347, "step": 28700 }, { "epoch": 1.44, "grad_norm": 8.18229866027832, "learning_rate": 1.4313969849246232e-05, "loss": 1.0133, "step": 28800 }, { "epoch": 1.45, "grad_norm": 3.3756470680236816, "learning_rate": 1.4294070351758796e-05, "loss": 1.0684, "step": 28900 }, { "epoch": 1.45, "grad_norm": 5.568530559539795, "learning_rate": 1.4273969849246232e-05, "loss": 1.0666, "step": 29000 }, { "epoch": 1.46, "grad_norm": 4.440110683441162, "learning_rate": 1.4253869346733668e-05, "loss": 1.057, "step": 29100 }, { "epoch": 1.46, "grad_norm": 6.835775852203369, "learning_rate": 1.4233768844221108e-05, "loss": 1.0176, "step": 29200 }, { "epoch": 1.47, "grad_norm": 5.715722560882568, "learning_rate": 1.4213668341708544e-05, "loss": 1.0996, "step": 29300 }, { "epoch": 1.47, "grad_norm": 6.401480674743652, "learning_rate": 1.419356783919598e-05, "loss": 1.0459, "step": 29400 }, { "epoch": 1.48, "grad_norm": 7.125598430633545, "learning_rate": 1.4173467336683417e-05, "loss": 1.0067, "step": 29500 }, { "epoch": 1.48, "grad_norm": 5.287647724151611, "learning_rate": 1.4153366834170856e-05, "loss": 1.0475, "step": 29600 }, { "epoch": 1.48, "grad_norm": 5.175357818603516, "learning_rate": 1.4133266331658293e-05, "loss": 1.0361, "step": 29700 }, { "epoch": 1.49, "grad_norm": 4.676697731018066, "learning_rate": 1.4113165829145729e-05, "loss": 0.9925, "step": 29800 }, { "epoch": 1.5, "grad_norm": 4.375120162963867, "learning_rate": 1.4093065326633167e-05, "loss": 1.0145, "step": 29900 }, { "epoch": 1.5, "grad_norm": 4.380770683288574, "learning_rate": 1.4072964824120605e-05, "loss": 1.0763, "step": 30000 }, { "epoch": 1.5, "eval_loss": 1.0519436597824097, "eval_runtime": 21.613, "eval_samples_per_second": 46.269, "eval_steps_per_second": 5.784, "step": 30000 }, { "epoch": 1.5, "grad_norm": 5.796531677246094, "learning_rate": 1.4052864321608041e-05, "loss": 1.0563, "step": 30100 }, { "epoch": 1.51, "grad_norm": 2.713714361190796, "learning_rate": 1.4032763819095479e-05, "loss": 1.0549, "step": 30200 }, { "epoch": 1.52, "grad_norm": 6.333755016326904, "learning_rate": 1.4012663316582915e-05, "loss": 1.042, "step": 30300 }, { "epoch": 1.52, "grad_norm": 3.8109474182128906, "learning_rate": 1.3992562814070353e-05, "loss": 1.0773, "step": 30400 }, { "epoch": 1.52, "grad_norm": 6.425621509552002, "learning_rate": 1.3972462311557791e-05, "loss": 1.0066, "step": 30500 }, { "epoch": 1.53, "grad_norm": 4.9127607345581055, "learning_rate": 1.3952361809045228e-05, "loss": 1.0022, "step": 30600 }, { "epoch": 1.54, "grad_norm": 4.212081432342529, "learning_rate": 1.3932261306532664e-05, "loss": 1.0358, "step": 30700 }, { "epoch": 1.54, "grad_norm": 7.6413187980651855, "learning_rate": 1.39121608040201e-05, "loss": 1.0413, "step": 30800 }, { "epoch": 1.54, "grad_norm": 4.2576494216918945, "learning_rate": 1.389206030150754e-05, "loss": 1.0332, "step": 30900 }, { "epoch": 1.55, "grad_norm": 4.797669887542725, "learning_rate": 1.3871959798994976e-05, "loss": 1.0396, "step": 31000 }, { "epoch": 1.56, "grad_norm": 5.891973972320557, "learning_rate": 1.3851859296482412e-05, "loss": 1.0281, "step": 31100 }, { "epoch": 1.56, "grad_norm": 5.9344964027404785, "learning_rate": 1.3831758793969849e-05, "loss": 1.024, "step": 31200 }, { "epoch": 1.56, "grad_norm": 4.902309417724609, "learning_rate": 1.3811658291457288e-05, "loss": 1.027, "step": 31300 }, { "epoch": 1.57, "grad_norm": 6.387609958648682, "learning_rate": 1.3791557788944725e-05, "loss": 1.0207, "step": 31400 }, { "epoch": 1.57, "grad_norm": 5.870815277099609, "learning_rate": 1.3771457286432161e-05, "loss": 1.0128, "step": 31500 }, { "epoch": 1.58, "grad_norm": 6.101361274719238, "learning_rate": 1.3751356783919599e-05, "loss": 1.0412, "step": 31600 }, { "epoch": 1.58, "grad_norm": 5.250607967376709, "learning_rate": 1.3731256281407037e-05, "loss": 1.0146, "step": 31700 }, { "epoch": 1.59, "grad_norm": 5.449378967285156, "learning_rate": 1.3711155778894473e-05, "loss": 1.03, "step": 31800 }, { "epoch": 1.59, "grad_norm": 4.564045429229736, "learning_rate": 1.3691055276381911e-05, "loss": 1.0567, "step": 31900 }, { "epoch": 1.6, "grad_norm": 5.84417200088501, "learning_rate": 1.3670954773869347e-05, "loss": 1.0782, "step": 32000 }, { "epoch": 1.6, "grad_norm": 4.725462436676025, "learning_rate": 1.3650854271356785e-05, "loss": 1.0327, "step": 32100 }, { "epoch": 1.61, "grad_norm": 6.999115943908691, "learning_rate": 1.3630753768844223e-05, "loss": 1.0405, "step": 32200 }, { "epoch": 1.61, "grad_norm": 4.241363525390625, "learning_rate": 1.361065326633166e-05, "loss": 1.02, "step": 32300 }, { "epoch": 1.62, "grad_norm": 5.884255886077881, "learning_rate": 1.3590552763819096e-05, "loss": 1.0634, "step": 32400 }, { "epoch": 1.62, "grad_norm": 3.674698829650879, "learning_rate": 1.3570452261306536e-05, "loss": 1.0389, "step": 32500 }, { "epoch": 1.63, "grad_norm": 4.227616310119629, "learning_rate": 1.3550351758793972e-05, "loss": 0.9992, "step": 32600 }, { "epoch": 1.64, "grad_norm": 4.682816982269287, "learning_rate": 1.3530251256281408e-05, "loss": 1.0111, "step": 32700 }, { "epoch": 1.64, "grad_norm": 4.632464408874512, "learning_rate": 1.3510150753768844e-05, "loss": 1.0223, "step": 32800 }, { "epoch": 1.65, "grad_norm": 6.061766147613525, "learning_rate": 1.349005025125628e-05, "loss": 0.9837, "step": 32900 }, { "epoch": 1.65, "grad_norm": 5.4998908042907715, "learning_rate": 1.346994974874372e-05, "loss": 1.041, "step": 33000 }, { "epoch": 1.66, "grad_norm": 6.294175624847412, "learning_rate": 1.3449849246231157e-05, "loss": 1.0311, "step": 33100 }, { "epoch": 1.66, "grad_norm": 5.177206039428711, "learning_rate": 1.3429748743718593e-05, "loss": 1.0435, "step": 33200 }, { "epoch": 1.67, "grad_norm": 4.389501571655273, "learning_rate": 1.3409648241206031e-05, "loss": 1.0104, "step": 33300 }, { "epoch": 1.67, "grad_norm": 5.105901718139648, "learning_rate": 1.3389547738693469e-05, "loss": 0.9868, "step": 33400 }, { "epoch": 1.68, "grad_norm": 3.407482147216797, "learning_rate": 1.3369447236180905e-05, "loss": 1.0559, "step": 33500 }, { "epoch": 1.68, "grad_norm": 6.496652126312256, "learning_rate": 1.3349346733668343e-05, "loss": 0.9849, "step": 33600 }, { "epoch": 1.69, "grad_norm": 6.241397857666016, "learning_rate": 1.332924623115578e-05, "loss": 0.9995, "step": 33700 }, { "epoch": 1.69, "grad_norm": 5.998499870300293, "learning_rate": 1.3309145728643217e-05, "loss": 1.0355, "step": 33800 }, { "epoch": 1.69, "grad_norm": 5.380569934844971, "learning_rate": 1.3289045226130655e-05, "loss": 1.0082, "step": 33900 }, { "epoch": 1.7, "grad_norm": 5.168824195861816, "learning_rate": 1.3268944723618092e-05, "loss": 1.052, "step": 34000 }, { "epoch": 1.71, "grad_norm": 5.691008567810059, "learning_rate": 1.3248844221105528e-05, "loss": 1.0424, "step": 34100 }, { "epoch": 1.71, "grad_norm": 5.678094387054443, "learning_rate": 1.3228743718592968e-05, "loss": 1.0083, "step": 34200 }, { "epoch": 1.71, "grad_norm": 6.432235240936279, "learning_rate": 1.3208643216080404e-05, "loss": 0.9766, "step": 34300 }, { "epoch": 1.72, "grad_norm": 6.016462326049805, "learning_rate": 1.318854271356784e-05, "loss": 1.0059, "step": 34400 }, { "epoch": 1.73, "grad_norm": 4.596778392791748, "learning_rate": 1.3168442211055276e-05, "loss": 0.9462, "step": 34500 }, { "epoch": 1.73, "grad_norm": 7.2965850830078125, "learning_rate": 1.3148341708542716e-05, "loss": 0.972, "step": 34600 }, { "epoch": 1.73, "grad_norm": 5.232773780822754, "learning_rate": 1.3128241206030152e-05, "loss": 1.0532, "step": 34700 }, { "epoch": 1.74, "grad_norm": 5.5057783126831055, "learning_rate": 1.3108140703517589e-05, "loss": 0.9835, "step": 34800 }, { "epoch": 1.75, "grad_norm": 3.0561375617980957, "learning_rate": 1.3088040201005025e-05, "loss": 1.0293, "step": 34900 }, { "epoch": 1.75, "grad_norm": 4.761837959289551, "learning_rate": 1.3068140703517589e-05, "loss": 1.0232, "step": 35000 }, { "epoch": 1.75, "grad_norm": 7.006007671356201, "learning_rate": 1.3048040201005025e-05, "loss": 0.9945, "step": 35100 }, { "epoch": 1.76, "grad_norm": 4.829462051391602, "learning_rate": 1.3027939698492465e-05, "loss": 1.0589, "step": 35200 }, { "epoch": 1.77, "grad_norm": 3.8825013637542725, "learning_rate": 1.3007839195979901e-05, "loss": 0.9984, "step": 35300 }, { "epoch": 1.77, "grad_norm": 5.655978202819824, "learning_rate": 1.2987738693467338e-05, "loss": 1.0004, "step": 35400 }, { "epoch": 1.77, "grad_norm": 5.612642765045166, "learning_rate": 1.2967638190954774e-05, "loss": 0.9874, "step": 35500 }, { "epoch": 1.78, "grad_norm": 17.78661346435547, "learning_rate": 1.2947537688442212e-05, "loss": 1.0322, "step": 35600 }, { "epoch": 1.79, "grad_norm": 4.723743915557861, "learning_rate": 1.292743718592965e-05, "loss": 0.9984, "step": 35700 }, { "epoch": 1.79, "grad_norm": 5.048336982727051, "learning_rate": 1.2907336683417086e-05, "loss": 1.0588, "step": 35800 }, { "epoch": 1.79, "grad_norm": 6.086093425750732, "learning_rate": 1.2887236180904524e-05, "loss": 1.0075, "step": 35900 }, { "epoch": 1.8, "grad_norm": 6.542403697967529, "learning_rate": 1.286713567839196e-05, "loss": 1.0219, "step": 36000 }, { "epoch": 1.81, "grad_norm": 5.013860702514648, "learning_rate": 1.2847035175879398e-05, "loss": 1.0307, "step": 36100 }, { "epoch": 1.81, "grad_norm": 5.978675365447998, "learning_rate": 1.2826934673366835e-05, "loss": 1.0026, "step": 36200 }, { "epoch": 1.81, "grad_norm": 6.217547416687012, "learning_rate": 1.2806834170854273e-05, "loss": 1.0196, "step": 36300 }, { "epoch": 1.82, "grad_norm": 4.577905654907227, "learning_rate": 1.2786733668341709e-05, "loss": 0.9767, "step": 36400 }, { "epoch": 1.82, "grad_norm": 4.999172210693359, "learning_rate": 1.2766633165829147e-05, "loss": 1.0261, "step": 36500 }, { "epoch": 1.83, "grad_norm": 3.3435771465301514, "learning_rate": 1.2746532663316585e-05, "loss": 0.9751, "step": 36600 }, { "epoch": 1.83, "grad_norm": 6.218837261199951, "learning_rate": 1.2726432160804021e-05, "loss": 0.9887, "step": 36700 }, { "epoch": 1.84, "grad_norm": 2.914499044418335, "learning_rate": 1.2706331658291457e-05, "loss": 1.0172, "step": 36800 }, { "epoch": 1.84, "grad_norm": 4.287944793701172, "learning_rate": 1.2686231155778897e-05, "loss": 1.0336, "step": 36900 }, { "epoch": 1.85, "grad_norm": 9.045112609863281, "learning_rate": 1.2666331658291458e-05, "loss": 0.9966, "step": 37000 }, { "epoch": 1.85, "grad_norm": 3.9664063453674316, "learning_rate": 1.2646231155778896e-05, "loss": 1.0315, "step": 37100 }, { "epoch": 1.86, "grad_norm": 5.26336145401001, "learning_rate": 1.2626130653266334e-05, "loss": 1.031, "step": 37200 }, { "epoch": 1.86, "grad_norm": 5.820954322814941, "learning_rate": 1.260603015075377e-05, "loss": 0.9786, "step": 37300 }, { "epoch": 1.87, "grad_norm": 3.7999236583709717, "learning_rate": 1.2585929648241206e-05, "loss": 1.0008, "step": 37400 }, { "epoch": 1.88, "grad_norm": 4.96231746673584, "learning_rate": 1.2565829145728646e-05, "loss": 0.9823, "step": 37500 }, { "epoch": 1.88, "grad_norm": 5.442008018493652, "learning_rate": 1.2545728643216082e-05, "loss": 0.9993, "step": 37600 }, { "epoch": 1.89, "grad_norm": 3.0178353786468506, "learning_rate": 1.2525628140703518e-05, "loss": 1.009, "step": 37700 }, { "epoch": 1.89, "grad_norm": 4.0404052734375, "learning_rate": 1.2505527638190955e-05, "loss": 1.0047, "step": 37800 }, { "epoch": 1.9, "grad_norm": 3.924924850463867, "learning_rate": 1.2485427135678394e-05, "loss": 0.9681, "step": 37900 }, { "epoch": 1.9, "grad_norm": 6.560153961181641, "learning_rate": 1.246532663316583e-05, "loss": 0.9346, "step": 38000 }, { "epoch": 1.91, "grad_norm": 4.826027870178223, "learning_rate": 1.2445226130653267e-05, "loss": 0.9878, "step": 38100 }, { "epoch": 1.91, "grad_norm": 3.489680767059326, "learning_rate": 1.2425125628140703e-05, "loss": 0.9943, "step": 38200 }, { "epoch": 1.92, "grad_norm": 4.7767014503479, "learning_rate": 1.2405025125628141e-05, "loss": 1.02, "step": 38300 }, { "epoch": 1.92, "grad_norm": 7.311853408813477, "learning_rate": 1.238492462311558e-05, "loss": 0.946, "step": 38400 }, { "epoch": 1.93, "grad_norm": 4.217949390411377, "learning_rate": 1.236502512562814e-05, "loss": 0.974, "step": 38500 }, { "epoch": 1.93, "grad_norm": 8.919093132019043, "learning_rate": 1.234492462311558e-05, "loss": 0.9628, "step": 38600 }, { "epoch": 1.94, "grad_norm": 4.355369567871094, "learning_rate": 1.2324824120603016e-05, "loss": 0.9325, "step": 38700 }, { "epoch": 1.94, "grad_norm": 5.474518775939941, "learning_rate": 1.2304723618090452e-05, "loss": 0.9505, "step": 38800 }, { "epoch": 1.94, "grad_norm": 6.389540195465088, "learning_rate": 1.228462311557789e-05, "loss": 0.9574, "step": 38900 }, { "epoch": 1.95, "grad_norm": 6.9164719581604, "learning_rate": 1.2264522613065328e-05, "loss": 0.9644, "step": 39000 }, { "epoch": 1.96, "grad_norm": 4.559136390686035, "learning_rate": 1.2244422110552764e-05, "loss": 1.0306, "step": 39100 }, { "epoch": 1.96, "grad_norm": 6.381926536560059, "learning_rate": 1.2224321608040202e-05, "loss": 0.9542, "step": 39200 }, { "epoch": 1.96, "grad_norm": 7.826279163360596, "learning_rate": 1.2204221105527639e-05, "loss": 0.9818, "step": 39300 }, { "epoch": 1.97, "grad_norm": 5.7296929359436035, "learning_rate": 1.2184120603015077e-05, "loss": 0.9591, "step": 39400 }, { "epoch": 1.98, "grad_norm": 6.479053974151611, "learning_rate": 1.2164020100502515e-05, "loss": 1.0083, "step": 39500 }, { "epoch": 1.98, "grad_norm": 5.9377241134643555, "learning_rate": 1.2144120603015077e-05, "loss": 0.9969, "step": 39600 }, { "epoch": 1.98, "grad_norm": 4.59481143951416, "learning_rate": 1.2124020100502513e-05, "loss": 1.015, "step": 39700 }, { "epoch": 1.99, "grad_norm": 4.979703903198242, "learning_rate": 1.2103919597989951e-05, "loss": 0.977, "step": 39800 }, { "epoch": 2.0, "grad_norm": 6.539973735809326, "learning_rate": 1.2083819095477388e-05, "loss": 0.9938, "step": 39900 }, { "epoch": 2.0, "grad_norm": 5.971490383148193, "learning_rate": 1.2063718592964825e-05, "loss": 0.9848, "step": 40000 }, { "epoch": 2.0, "eval_loss": 0.9915822744369507, "eval_runtime": 21.5957, "eval_samples_per_second": 46.305, "eval_steps_per_second": 5.788, "step": 40000 }, { "epoch": 2.0, "grad_norm": 5.639512538909912, "learning_rate": 1.2043618090452262e-05, "loss": 0.9401, "step": 40100 }, { "epoch": 2.01, "grad_norm": 3.0007824897766113, "learning_rate": 1.20235175879397e-05, "loss": 0.9769, "step": 40200 }, { "epoch": 2.02, "grad_norm": 4.346365451812744, "learning_rate": 1.2003417085427136e-05, "loss": 0.9247, "step": 40300 }, { "epoch": 2.02, "grad_norm": 6.308602809906006, "learning_rate": 1.1983316582914574e-05, "loss": 0.9685, "step": 40400 }, { "epoch": 2.02, "grad_norm": 4.597143173217773, "learning_rate": 1.1963216080402012e-05, "loss": 0.907, "step": 40500 }, { "epoch": 2.03, "grad_norm": 6.000264644622803, "learning_rate": 1.1943115577889448e-05, "loss": 0.9311, "step": 40600 }, { "epoch": 2.04, "grad_norm": 4.718263149261475, "learning_rate": 1.1923015075376885e-05, "loss": 0.9707, "step": 40700 }, { "epoch": 2.04, "grad_norm": 3.7472355365753174, "learning_rate": 1.1902914572864324e-05, "loss": 0.9812, "step": 40800 }, { "epoch": 2.04, "grad_norm": 4.8061017990112305, "learning_rate": 1.188281407035176e-05, "loss": 0.9461, "step": 40900 }, { "epoch": 2.05, "grad_norm": 5.2381391525268555, "learning_rate": 1.1862713567839197e-05, "loss": 0.9972, "step": 41000 }, { "epoch": 2.06, "grad_norm": 6.1567583084106445, "learning_rate": 1.1842613065326633e-05, "loss": 0.9611, "step": 41100 }, { "epoch": 2.06, "grad_norm": 5.496160984039307, "learning_rate": 1.1822512562814071e-05, "loss": 0.9612, "step": 41200 }, { "epoch": 2.06, "grad_norm": 6.659996509552002, "learning_rate": 1.1802412060301509e-05, "loss": 0.9593, "step": 41300 }, { "epoch": 2.07, "grad_norm": 7.010763645172119, "learning_rate": 1.1782311557788945e-05, "loss": 0.9079, "step": 41400 }, { "epoch": 2.08, "grad_norm": 5.539340496063232, "learning_rate": 1.1762211055276383e-05, "loss": 0.946, "step": 41500 }, { "epoch": 2.08, "grad_norm": 4.7269368171691895, "learning_rate": 1.174211055276382e-05, "loss": 0.9702, "step": 41600 }, { "epoch": 2.08, "grad_norm": 6.573697090148926, "learning_rate": 1.1722010050251257e-05, "loss": 0.9166, "step": 41700 }, { "epoch": 2.09, "grad_norm": 5.467616558074951, "learning_rate": 1.1701909547738694e-05, "loss": 0.9479, "step": 41800 }, { "epoch": 2.1, "grad_norm": 7.292219638824463, "learning_rate": 1.1681809045226132e-05, "loss": 0.9694, "step": 41900 }, { "epoch": 2.1, "grad_norm": 5.9063849449157715, "learning_rate": 1.1661708542713568e-05, "loss": 0.9467, "step": 42000 }, { "epoch": 2.1, "grad_norm": 7.106956958770752, "learning_rate": 1.1641608040201006e-05, "loss": 0.9344, "step": 42100 }, { "epoch": 2.11, "grad_norm": 2.7898268699645996, "learning_rate": 1.1621507537688444e-05, "loss": 0.9174, "step": 42200 }, { "epoch": 2.12, "grad_norm": 5.543144226074219, "learning_rate": 1.160140703517588e-05, "loss": 0.9399, "step": 42300 }, { "epoch": 2.12, "grad_norm": 4.507541656494141, "learning_rate": 1.1581306532663317e-05, "loss": 0.8989, "step": 42400 }, { "epoch": 2.12, "grad_norm": 7.4493937492370605, "learning_rate": 1.1561206030150756e-05, "loss": 0.9663, "step": 42500 }, { "epoch": 2.13, "grad_norm": 5.758662700653076, "learning_rate": 1.1541105527638192e-05, "loss": 0.983, "step": 42600 }, { "epoch": 2.13, "grad_norm": 4.6601386070251465, "learning_rate": 1.1521005025125629e-05, "loss": 0.936, "step": 42700 }, { "epoch": 2.14, "grad_norm": 6.687641620635986, "learning_rate": 1.1500904522613065e-05, "loss": 0.9452, "step": 42800 }, { "epoch": 2.15, "grad_norm": 6.454759120941162, "learning_rate": 1.1480804020100505e-05, "loss": 0.9494, "step": 42900 }, { "epoch": 2.15, "grad_norm": 6.235274314880371, "learning_rate": 1.1460703517587941e-05, "loss": 0.9107, "step": 43000 }, { "epoch": 2.15, "grad_norm": 6.445216655731201, "learning_rate": 1.1440603015075377e-05, "loss": 0.9448, "step": 43100 }, { "epoch": 2.16, "grad_norm": 4.565326690673828, "learning_rate": 1.1420502512562814e-05, "loss": 0.9435, "step": 43200 }, { "epoch": 2.17, "grad_norm": 4.653913497924805, "learning_rate": 1.1400402010050253e-05, "loss": 0.9492, "step": 43300 }, { "epoch": 2.17, "grad_norm": 4.022702693939209, "learning_rate": 1.138030150753769e-05, "loss": 0.9365, "step": 43400 }, { "epoch": 2.17, "grad_norm": 6.998848915100098, "learning_rate": 1.1360201005025126e-05, "loss": 0.9215, "step": 43500 }, { "epoch": 2.18, "grad_norm": 3.925429344177246, "learning_rate": 1.1340100502512564e-05, "loss": 0.9408, "step": 43600 }, { "epoch": 2.19, "grad_norm": 5.22701358795166, "learning_rate": 1.132e-05, "loss": 0.9755, "step": 43700 }, { "epoch": 2.19, "grad_norm": 5.142667293548584, "learning_rate": 1.1299899497487438e-05, "loss": 0.8938, "step": 43800 }, { "epoch": 2.19, "grad_norm": 5.2655158042907715, "learning_rate": 1.1279798994974876e-05, "loss": 0.9751, "step": 43900 }, { "epoch": 2.2, "grad_norm": 5.084207057952881, "learning_rate": 1.1259698492462312e-05, "loss": 0.9141, "step": 44000 }, { "epoch": 2.21, "grad_norm": 4.578594207763672, "learning_rate": 1.1239597989949749e-05, "loss": 0.9403, "step": 44100 }, { "epoch": 2.21, "grad_norm": 3.3010849952697754, "learning_rate": 1.1219497487437188e-05, "loss": 0.9657, "step": 44200 }, { "epoch": 2.21, "grad_norm": 6.451618194580078, "learning_rate": 1.1199396984924624e-05, "loss": 0.9297, "step": 44300 }, { "epoch": 2.22, "grad_norm": 5.8492655754089355, "learning_rate": 1.117929648241206e-05, "loss": 0.9157, "step": 44400 }, { "epoch": 2.23, "grad_norm": 5.015758037567139, "learning_rate": 1.1159195979899497e-05, "loss": 0.9508, "step": 44500 }, { "epoch": 2.23, "grad_norm": 4.814078330993652, "learning_rate": 1.1139095477386937e-05, "loss": 0.9361, "step": 44600 }, { "epoch": 2.23, "grad_norm": 5.004156112670898, "learning_rate": 1.1118994974874373e-05, "loss": 0.958, "step": 44700 }, { "epoch": 2.24, "grad_norm": 5.016057968139648, "learning_rate": 1.109889447236181e-05, "loss": 0.9755, "step": 44800 }, { "epoch": 2.25, "grad_norm": 5.041826248168945, "learning_rate": 1.1078793969849246e-05, "loss": 0.9082, "step": 44900 }, { "epoch": 2.25, "grad_norm": 5.787368297576904, "learning_rate": 1.1058693467336685e-05, "loss": 0.9076, "step": 45000 }, { "epoch": 2.25, "grad_norm": 5.170538902282715, "learning_rate": 1.1038592964824122e-05, "loss": 0.9117, "step": 45100 }, { "epoch": 2.26, "grad_norm": 7.477475643157959, "learning_rate": 1.1018492462311558e-05, "loss": 0.8987, "step": 45200 }, { "epoch": 2.27, "grad_norm": 4.626328945159912, "learning_rate": 1.0998391959798996e-05, "loss": 0.9197, "step": 45300 }, { "epoch": 2.27, "grad_norm": 5.800539016723633, "learning_rate": 1.0978291457286434e-05, "loss": 0.9025, "step": 45400 }, { "epoch": 2.27, "grad_norm": 4.291562080383301, "learning_rate": 1.0958391959798994e-05, "loss": 0.9348, "step": 45500 }, { "epoch": 2.28, "grad_norm": 5.439847946166992, "learning_rate": 1.0938291457286434e-05, "loss": 0.9416, "step": 45600 }, { "epoch": 2.29, "grad_norm": 5.728611946105957, "learning_rate": 1.091819095477387e-05, "loss": 0.9124, "step": 45700 }, { "epoch": 2.29, "grad_norm": 3.7975008487701416, "learning_rate": 1.0898090452261307e-05, "loss": 0.9345, "step": 45800 }, { "epoch": 2.29, "grad_norm": 7.625438690185547, "learning_rate": 1.0877989949748745e-05, "loss": 0.8925, "step": 45900 }, { "epoch": 2.3, "grad_norm": 4.858023643493652, "learning_rate": 1.0857889447236183e-05, "loss": 0.9103, "step": 46000 }, { "epoch": 2.31, "grad_norm": 6.363548755645752, "learning_rate": 1.0837788944723619e-05, "loss": 0.9523, "step": 46100 }, { "epoch": 2.31, "grad_norm": 4.639822959899902, "learning_rate": 1.0817688442211057e-05, "loss": 0.9322, "step": 46200 }, { "epoch": 2.31, "grad_norm": 4.796472072601318, "learning_rate": 1.0797587939698493e-05, "loss": 0.9242, "step": 46300 }, { "epoch": 2.32, "grad_norm": 3.8870980739593506, "learning_rate": 1.077748743718593e-05, "loss": 0.9048, "step": 46400 }, { "epoch": 2.33, "grad_norm": 6.010646343231201, "learning_rate": 1.0757386934673369e-05, "loss": 0.9566, "step": 46500 }, { "epoch": 2.33, "grad_norm": 3.925715684890747, "learning_rate": 1.0737286432160805e-05, "loss": 0.9274, "step": 46600 }, { "epoch": 2.33, "grad_norm": 5.222326278686523, "learning_rate": 1.0717185929648242e-05, "loss": 0.8914, "step": 46700 }, { "epoch": 2.34, "grad_norm": 5.363781929016113, "learning_rate": 1.0697085427135678e-05, "loss": 0.922, "step": 46800 }, { "epoch": 2.34, "grad_norm": 6.332427024841309, "learning_rate": 1.0676984924623118e-05, "loss": 0.9017, "step": 46900 }, { "epoch": 2.35, "grad_norm": 4.68159818649292, "learning_rate": 1.0656884422110554e-05, "loss": 0.9089, "step": 47000 }, { "epoch": 2.35, "grad_norm": 4.770488739013672, "learning_rate": 1.063678391959799e-05, "loss": 0.9738, "step": 47100 }, { "epoch": 2.36, "grad_norm": 6.209041595458984, "learning_rate": 1.0616683417085426e-05, "loss": 0.9301, "step": 47200 }, { "epoch": 2.37, "grad_norm": 5.330206394195557, "learning_rate": 1.0596582914572866e-05, "loss": 0.9515, "step": 47300 }, { "epoch": 2.37, "grad_norm": 7.701655387878418, "learning_rate": 1.0576482412060302e-05, "loss": 0.9072, "step": 47400 }, { "epoch": 2.38, "grad_norm": 4.921889305114746, "learning_rate": 1.0556381909547739e-05, "loss": 0.9326, "step": 47500 }, { "epoch": 2.38, "grad_norm": 5.353864669799805, "learning_rate": 1.0536281407035177e-05, "loss": 0.902, "step": 47600 }, { "epoch": 2.38, "grad_norm": 4.63252592086792, "learning_rate": 1.0516180904522615e-05, "loss": 0.9357, "step": 47700 }, { "epoch": 2.39, "grad_norm": 5.968425750732422, "learning_rate": 1.0496281407035175e-05, "loss": 0.9416, "step": 47800 }, { "epoch": 2.4, "grad_norm": 5.979503154754639, "learning_rate": 1.0476180904522615e-05, "loss": 0.9461, "step": 47900 }, { "epoch": 2.4, "grad_norm": 5.928488731384277, "learning_rate": 1.0456080402010051e-05, "loss": 0.9045, "step": 48000 }, { "epoch": 2.41, "grad_norm": 12.569512367248535, "learning_rate": 1.0435979899497488e-05, "loss": 0.9205, "step": 48100 }, { "epoch": 2.41, "grad_norm": 4.5606865882873535, "learning_rate": 1.0415879396984926e-05, "loss": 0.9005, "step": 48200 }, { "epoch": 2.42, "grad_norm": 5.511040210723877, "learning_rate": 1.0395778894472364e-05, "loss": 0.9105, "step": 48300 }, { "epoch": 2.42, "grad_norm": 5.660979747772217, "learning_rate": 1.03756783919598e-05, "loss": 0.8911, "step": 48400 }, { "epoch": 2.42, "grad_norm": 3.5720648765563965, "learning_rate": 1.0355577889447238e-05, "loss": 0.9468, "step": 48500 }, { "epoch": 2.43, "grad_norm": 5.770594120025635, "learning_rate": 1.0335477386934674e-05, "loss": 0.9296, "step": 48600 }, { "epoch": 2.44, "grad_norm": 4.0545477867126465, "learning_rate": 1.0315376884422112e-05, "loss": 0.9133, "step": 48700 }, { "epoch": 2.44, "grad_norm": 4.586203575134277, "learning_rate": 1.0295276381909548e-05, "loss": 0.906, "step": 48800 }, { "epoch": 2.44, "grad_norm": 5.315196514129639, "learning_rate": 1.0275175879396986e-05, "loss": 0.9065, "step": 48900 }, { "epoch": 2.45, "grad_norm": 5.344489574432373, "learning_rate": 1.0255075376884423e-05, "loss": 0.9363, "step": 49000 }, { "epoch": 2.46, "grad_norm": 6.762577533721924, "learning_rate": 1.0234974874371859e-05, "loss": 0.9366, "step": 49100 }, { "epoch": 2.46, "grad_norm": 4.087870121002197, "learning_rate": 1.0214874371859299e-05, "loss": 0.8812, "step": 49200 }, { "epoch": 2.46, "grad_norm": 5.586741924285889, "learning_rate": 1.0194773869346735e-05, "loss": 0.9341, "step": 49300 }, { "epoch": 2.47, "grad_norm": 8.180070877075195, "learning_rate": 1.0174673366834171e-05, "loss": 0.9381, "step": 49400 }, { "epoch": 2.48, "grad_norm": 4.389576435089111, "learning_rate": 1.0154572864321607e-05, "loss": 0.9288, "step": 49500 }, { "epoch": 2.48, "grad_norm": 4.339807033538818, "learning_rate": 1.0134472361809047e-05, "loss": 0.9282, "step": 49600 }, { "epoch": 2.48, "grad_norm": 7.801273345947266, "learning_rate": 1.0114371859296483e-05, "loss": 0.9657, "step": 49700 }, { "epoch": 2.49, "grad_norm": 6.016520977020264, "learning_rate": 1.009427135678392e-05, "loss": 0.8704, "step": 49800 }, { "epoch": 2.5, "grad_norm": 5.2764506340026855, "learning_rate": 1.0074170854271358e-05, "loss": 0.9226, "step": 49900 }, { "epoch": 2.5, "grad_norm": 4.923444747924805, "learning_rate": 1.0054070351758796e-05, "loss": 0.9084, "step": 50000 }, { "epoch": 2.5, "eval_loss": 0.9846327900886536, "eval_runtime": 21.5925, "eval_samples_per_second": 46.312, "eval_steps_per_second": 5.789, "step": 50000 }, { "epoch": 2.5, "grad_norm": 6.061006546020508, "learning_rate": 1.0033969849246232e-05, "loss": 0.9218, "step": 50100 }, { "epoch": 2.51, "grad_norm": 4.1440348625183105, "learning_rate": 1.0013869346733668e-05, "loss": 0.9324, "step": 50200 }, { "epoch": 2.52, "grad_norm": 4.084045886993408, "learning_rate": 9.993768844221106e-06, "loss": 0.8859, "step": 50300 }, { "epoch": 2.52, "grad_norm": 3.723971366882324, "learning_rate": 9.973668341708544e-06, "loss": 0.9128, "step": 50400 }, { "epoch": 2.52, "grad_norm": 3.9887030124664307, "learning_rate": 9.95356783919598e-06, "loss": 0.8987, "step": 50500 }, { "epoch": 2.53, "grad_norm": 5.572610855102539, "learning_rate": 9.933467336683418e-06, "loss": 0.9287, "step": 50600 }, { "epoch": 2.54, "grad_norm": 5.956911087036133, "learning_rate": 9.913366834170856e-06, "loss": 0.8808, "step": 50700 }, { "epoch": 2.54, "grad_norm": 3.948564052581787, "learning_rate": 9.893266331658293e-06, "loss": 0.9243, "step": 50800 }, { "epoch": 2.54, "grad_norm": 5.561892509460449, "learning_rate": 9.87316582914573e-06, "loss": 0.9174, "step": 50900 }, { "epoch": 2.55, "grad_norm": 5.9155755043029785, "learning_rate": 9.853065326633167e-06, "loss": 0.8951, "step": 51000 }, { "epoch": 2.56, "grad_norm": 5.4488348960876465, "learning_rate": 9.832964824120603e-06, "loss": 0.8864, "step": 51100 }, { "epoch": 2.56, "grad_norm": 4.52565860748291, "learning_rate": 9.812864321608041e-06, "loss": 0.9103, "step": 51200 }, { "epoch": 2.56, "grad_norm": 3.995807647705078, "learning_rate": 9.792763819095477e-06, "loss": 0.8999, "step": 51300 }, { "epoch": 2.57, "grad_norm": 9.156529426574707, "learning_rate": 9.772663316582915e-06, "loss": 0.9383, "step": 51400 }, { "epoch": 2.58, "grad_norm": 6.388377666473389, "learning_rate": 9.752562814070352e-06, "loss": 0.908, "step": 51500 }, { "epoch": 2.58, "grad_norm": 3.975545644760132, "learning_rate": 9.73246231155779e-06, "loss": 0.9006, "step": 51600 }, { "epoch": 2.58, "grad_norm": 4.579479694366455, "learning_rate": 9.712361809045226e-06, "loss": 0.9443, "step": 51700 }, { "epoch": 2.59, "grad_norm": 5.22560977935791, "learning_rate": 9.69246231155779e-06, "loss": 0.909, "step": 51800 }, { "epoch": 2.59, "grad_norm": 5.2606587409973145, "learning_rate": 9.672361809045226e-06, "loss": 0.9255, "step": 51900 }, { "epoch": 2.6, "grad_norm": 4.772227764129639, "learning_rate": 9.652261306532664e-06, "loss": 0.9161, "step": 52000 }, { "epoch": 2.6, "grad_norm": 4.636828899383545, "learning_rate": 9.6321608040201e-06, "loss": 0.874, "step": 52100 }, { "epoch": 2.61, "grad_norm": 4.5946784019470215, "learning_rate": 9.612060301507538e-06, "loss": 0.902, "step": 52200 }, { "epoch": 2.62, "grad_norm": 4.0993266105651855, "learning_rate": 9.591959798994975e-06, "loss": 0.9369, "step": 52300 }, { "epoch": 2.62, "grad_norm": 5.468399524688721, "learning_rate": 9.571859296482413e-06, "loss": 0.9359, "step": 52400 }, { "epoch": 2.62, "grad_norm": 10.92428970336914, "learning_rate": 9.551758793969849e-06, "loss": 0.8889, "step": 52500 }, { "epoch": 2.63, "grad_norm": 6.2350029945373535, "learning_rate": 9.531658291457287e-06, "loss": 0.9304, "step": 52600 }, { "epoch": 2.63, "grad_norm": 4.780547618865967, "learning_rate": 9.511557788944725e-06, "loss": 0.9396, "step": 52700 }, { "epoch": 2.64, "grad_norm": 3.1009738445281982, "learning_rate": 9.491457286432161e-06, "loss": 0.9077, "step": 52800 }, { "epoch": 2.65, "grad_norm": 7.036947727203369, "learning_rate": 9.4713567839196e-06, "loss": 0.8753, "step": 52900 }, { "epoch": 2.65, "grad_norm": 4.945110321044922, "learning_rate": 9.451256281407035e-06, "loss": 0.9157, "step": 53000 }, { "epoch": 2.66, "grad_norm": 5.361321926116943, "learning_rate": 9.431155778894473e-06, "loss": 0.8929, "step": 53100 }, { "epoch": 2.66, "grad_norm": 3.351379632949829, "learning_rate": 9.411055276381911e-06, "loss": 0.8636, "step": 53200 }, { "epoch": 2.67, "grad_norm": 5.34309720993042, "learning_rate": 9.390954773869348e-06, "loss": 0.8865, "step": 53300 }, { "epoch": 2.67, "grad_norm": 5.316425800323486, "learning_rate": 9.370854271356786e-06, "loss": 0.9178, "step": 53400 }, { "epoch": 2.67, "grad_norm": 4.478712558746338, "learning_rate": 9.350753768844222e-06, "loss": 0.9181, "step": 53500 }, { "epoch": 2.68, "grad_norm": 5.095877647399902, "learning_rate": 9.33065326633166e-06, "loss": 0.902, "step": 53600 }, { "epoch": 2.69, "grad_norm": 4.4164862632751465, "learning_rate": 9.310552763819096e-06, "loss": 0.887, "step": 53700 }, { "epoch": 2.69, "grad_norm": 6.3961591720581055, "learning_rate": 9.290452261306533e-06, "loss": 0.8778, "step": 53800 }, { "epoch": 2.69, "grad_norm": 7.141729354858398, "learning_rate": 9.270552763819097e-06, "loss": 0.9144, "step": 53900 }, { "epoch": 2.7, "grad_norm": 5.858211040496826, "learning_rate": 9.250452261306535e-06, "loss": 0.8889, "step": 54000 }, { "epoch": 2.71, "grad_norm": 5.192725658416748, "learning_rate": 9.230351758793971e-06, "loss": 0.8928, "step": 54100 }, { "epoch": 2.71, "grad_norm": 6.190788745880127, "learning_rate": 9.210251256281407e-06, "loss": 0.8683, "step": 54200 }, { "epoch": 2.71, "grad_norm": 4.610683441162109, "learning_rate": 9.190150753768845e-06, "loss": 0.9473, "step": 54300 }, { "epoch": 2.72, "grad_norm": 5.043734550476074, "learning_rate": 9.170050251256281e-06, "loss": 0.9142, "step": 54400 }, { "epoch": 2.73, "grad_norm": 5.166931629180908, "learning_rate": 9.14994974874372e-06, "loss": 0.8894, "step": 54500 }, { "epoch": 2.73, "grad_norm": 5.05250358581543, "learning_rate": 9.129849246231156e-06, "loss": 0.8799, "step": 54600 }, { "epoch": 2.73, "grad_norm": 5.468914031982422, "learning_rate": 9.109748743718594e-06, "loss": 0.9099, "step": 54700 }, { "epoch": 2.74, "grad_norm": 4.162414073944092, "learning_rate": 9.08964824120603e-06, "loss": 0.8859, "step": 54800 }, { "epoch": 2.75, "grad_norm": 5.149291515350342, "learning_rate": 9.069547738693468e-06, "loss": 0.9096, "step": 54900 }, { "epoch": 2.75, "grad_norm": 4.889472961425781, "learning_rate": 9.049447236180904e-06, "loss": 0.8953, "step": 55000 }, { "epoch": 2.75, "grad_norm": 4.146818161010742, "learning_rate": 9.029346733668342e-06, "loss": 0.8917, "step": 55100 }, { "epoch": 2.76, "grad_norm": 5.937385559082031, "learning_rate": 9.00924623115578e-06, "loss": 0.9295, "step": 55200 }, { "epoch": 2.77, "grad_norm": 4.749314785003662, "learning_rate": 8.989145728643216e-06, "loss": 0.8776, "step": 55300 }, { "epoch": 2.77, "grad_norm": 6.271254539489746, "learning_rate": 8.969045226130654e-06, "loss": 0.8593, "step": 55400 }, { "epoch": 2.77, "grad_norm": 5.769760608673096, "learning_rate": 8.948944723618092e-06, "loss": 0.891, "step": 55500 }, { "epoch": 2.78, "grad_norm": 4.185112476348877, "learning_rate": 8.928844221105529e-06, "loss": 0.8869, "step": 55600 }, { "epoch": 2.79, "grad_norm": 3.2164394855499268, "learning_rate": 8.908743718592967e-06, "loss": 0.8992, "step": 55700 }, { "epoch": 2.79, "grad_norm": 4.406613349914551, "learning_rate": 8.888643216080403e-06, "loss": 0.8971, "step": 55800 }, { "epoch": 2.79, "grad_norm": 5.101110458374023, "learning_rate": 8.868542713567841e-06, "loss": 0.9066, "step": 55900 }, { "epoch": 2.8, "grad_norm": 4.963405132293701, "learning_rate": 8.848643216080403e-06, "loss": 0.881, "step": 56000 }, { "epoch": 2.81, "grad_norm": 7.5268683433532715, "learning_rate": 8.82854271356784e-06, "loss": 0.8692, "step": 56100 }, { "epoch": 2.81, "grad_norm": 5.325132369995117, "learning_rate": 8.808442211055278e-06, "loss": 0.895, "step": 56200 }, { "epoch": 2.81, "grad_norm": 4.687073707580566, "learning_rate": 8.788341708542715e-06, "loss": 0.9007, "step": 56300 }, { "epoch": 2.82, "grad_norm": 4.215831279754639, "learning_rate": 8.768241206030152e-06, "loss": 0.8783, "step": 56400 }, { "epoch": 2.83, "grad_norm": 6.363833427429199, "learning_rate": 8.74814070351759e-06, "loss": 0.9276, "step": 56500 }, { "epoch": 2.83, "grad_norm": 4.2875518798828125, "learning_rate": 8.728040201005026e-06, "loss": 0.8758, "step": 56600 }, { "epoch": 2.83, "grad_norm": 4.461952209472656, "learning_rate": 8.707939698492464e-06, "loss": 0.8789, "step": 56700 }, { "epoch": 2.84, "grad_norm": 7.590397834777832, "learning_rate": 8.6878391959799e-06, "loss": 0.8726, "step": 56800 }, { "epoch": 2.84, "grad_norm": 5.754077911376953, "learning_rate": 8.667738693467337e-06, "loss": 0.9022, "step": 56900 }, { "epoch": 2.85, "grad_norm": 4.305074214935303, "learning_rate": 8.647638190954775e-06, "loss": 0.8933, "step": 57000 }, { "epoch": 2.85, "grad_norm": 4.451827526092529, "learning_rate": 8.627738693467337e-06, "loss": 0.934, "step": 57100 }, { "epoch": 2.86, "grad_norm": 6.323834419250488, "learning_rate": 8.607638190954775e-06, "loss": 0.8858, "step": 57200 }, { "epoch": 2.87, "grad_norm": 6.937102317810059, "learning_rate": 8.587537688442211e-06, "loss": 0.9263, "step": 57300 }, { "epoch": 2.87, "grad_norm": 7.153318881988525, "learning_rate": 8.56743718592965e-06, "loss": 0.8868, "step": 57400 }, { "epoch": 2.88, "grad_norm": 4.7994842529296875, "learning_rate": 8.547336683417085e-06, "loss": 0.877, "step": 57500 }, { "epoch": 2.88, "grad_norm": 6.6480793952941895, "learning_rate": 8.527236180904523e-06, "loss": 0.8774, "step": 57600 }, { "epoch": 2.88, "grad_norm": 7.170138835906982, "learning_rate": 8.50713567839196e-06, "loss": 0.8704, "step": 57700 }, { "epoch": 2.89, "grad_norm": 4.006447792053223, "learning_rate": 8.487035175879398e-06, "loss": 0.8736, "step": 57800 }, { "epoch": 2.9, "grad_norm": 4.998128414154053, "learning_rate": 8.466934673366834e-06, "loss": 0.8653, "step": 57900 }, { "epoch": 2.9, "grad_norm": 5.756192207336426, "learning_rate": 8.446834170854272e-06, "loss": 0.8756, "step": 58000 }, { "epoch": 2.91, "grad_norm": 5.486929893493652, "learning_rate": 8.426733668341708e-06, "loss": 0.9047, "step": 58100 }, { "epoch": 2.91, "grad_norm": 4.589926242828369, "learning_rate": 8.406633165829146e-06, "loss": 0.8766, "step": 58200 }, { "epoch": 2.92, "grad_norm": 4.535083293914795, "learning_rate": 8.386532663316584e-06, "loss": 0.8758, "step": 58300 }, { "epoch": 2.92, "grad_norm": 3.2254798412323, "learning_rate": 8.36643216080402e-06, "loss": 0.8813, "step": 58400 }, { "epoch": 2.92, "grad_norm": 6.055229187011719, "learning_rate": 8.346331658291458e-06, "loss": 0.8779, "step": 58500 }, { "epoch": 2.93, "grad_norm": 4.221169471740723, "learning_rate": 8.326231155778895e-06, "loss": 0.9325, "step": 58600 }, { "epoch": 2.94, "grad_norm": 5.035799026489258, "learning_rate": 8.306130653266333e-06, "loss": 0.8896, "step": 58700 }, { "epoch": 2.94, "grad_norm": 6.551968574523926, "learning_rate": 8.28603015075377e-06, "loss": 0.8644, "step": 58800 }, { "epoch": 2.94, "grad_norm": 4.297557353973389, "learning_rate": 8.265929648241207e-06, "loss": 0.8853, "step": 58900 }, { "epoch": 2.95, "grad_norm": 6.603255271911621, "learning_rate": 8.245829145728645e-06, "loss": 0.9237, "step": 59000 }, { "epoch": 2.96, "grad_norm": 6.272432804107666, "learning_rate": 8.225728643216081e-06, "loss": 0.8708, "step": 59100 }, { "epoch": 2.96, "grad_norm": 5.919680595397949, "learning_rate": 8.20562814070352e-06, "loss": 0.8525, "step": 59200 }, { "epoch": 2.96, "grad_norm": 4.834166049957275, "learning_rate": 8.185527638190955e-06, "loss": 0.8576, "step": 59300 }, { "epoch": 2.97, "grad_norm": 5.948410987854004, "learning_rate": 8.165427135678393e-06, "loss": 0.9017, "step": 59400 }, { "epoch": 2.98, "grad_norm": 7.001020431518555, "learning_rate": 8.14532663316583e-06, "loss": 0.891, "step": 59500 }, { "epoch": 2.98, "grad_norm": 5.623896598815918, "learning_rate": 8.125226130653266e-06, "loss": 0.8255, "step": 59600 }, { "epoch": 2.98, "grad_norm": 5.0935468673706055, "learning_rate": 8.105125628140704e-06, "loss": 0.8709, "step": 59700 }, { "epoch": 2.99, "grad_norm": 6.403896808624268, "learning_rate": 8.08502512562814e-06, "loss": 0.8957, "step": 59800 }, { "epoch": 3.0, "grad_norm": 5.92683744430542, "learning_rate": 8.064924623115578e-06, "loss": 0.9102, "step": 59900 }, { "epoch": 3.0, "grad_norm": 3.4657108783721924, "learning_rate": 8.04502512562814e-06, "loss": 0.916, "step": 60000 }, { "epoch": 3.0, "eval_loss": 0.9292559623718262, "eval_runtime": 21.5879, "eval_samples_per_second": 46.322, "eval_steps_per_second": 5.79, "step": 60000 }, { "epoch": 3.0, "grad_norm": 5.614874839782715, "learning_rate": 8.024924623115579e-06, "loss": 0.8151, "step": 60100 }, { "epoch": 3.01, "grad_norm": 6.177361011505127, "learning_rate": 8.004824120603015e-06, "loss": 0.8266, "step": 60200 }, { "epoch": 3.02, "grad_norm": 5.4862213134765625, "learning_rate": 7.984723618090453e-06, "loss": 0.7741, "step": 60300 }, { "epoch": 3.02, "grad_norm": 6.674380779266357, "learning_rate": 7.964623115577889e-06, "loss": 0.8134, "step": 60400 }, { "epoch": 3.02, "grad_norm": 6.712404251098633, "learning_rate": 7.944522613065327e-06, "loss": 0.8332, "step": 60500 }, { "epoch": 3.03, "grad_norm": 4.442228317260742, "learning_rate": 7.924422110552763e-06, "loss": 0.832, "step": 60600 }, { "epoch": 3.04, "grad_norm": 5.503748416900635, "learning_rate": 7.904321608040201e-06, "loss": 0.844, "step": 60700 }, { "epoch": 3.04, "grad_norm": 4.290737628936768, "learning_rate": 7.88422110552764e-06, "loss": 0.8593, "step": 60800 }, { "epoch": 3.04, "grad_norm": 4.687915802001953, "learning_rate": 7.864120603015076e-06, "loss": 0.8506, "step": 60900 }, { "epoch": 3.05, "grad_norm": 5.838376998901367, "learning_rate": 7.844020100502514e-06, "loss": 0.8297, "step": 61000 }, { "epoch": 3.06, "grad_norm": 7.26198148727417, "learning_rate": 7.823919597989952e-06, "loss": 0.8463, "step": 61100 }, { "epoch": 3.06, "grad_norm": 5.693443298339844, "learning_rate": 7.803819095477388e-06, "loss": 0.8405, "step": 61200 }, { "epoch": 3.06, "grad_norm": 5.379219055175781, "learning_rate": 7.783718592964826e-06, "loss": 0.8431, "step": 61300 }, { "epoch": 3.07, "grad_norm": 5.703670501708984, "learning_rate": 7.763618090452262e-06, "loss": 0.8484, "step": 61400 }, { "epoch": 3.08, "grad_norm": 5.679072380065918, "learning_rate": 7.7435175879397e-06, "loss": 0.8272, "step": 61500 }, { "epoch": 3.08, "grad_norm": 4.1109113693237305, "learning_rate": 7.723417085427136e-06, "loss": 0.83, "step": 61600 }, { "epoch": 3.08, "grad_norm": 5.94366979598999, "learning_rate": 7.703316582914574e-06, "loss": 0.8102, "step": 61700 }, { "epoch": 3.09, "grad_norm": 8.418631553649902, "learning_rate": 7.68321608040201e-06, "loss": 0.8396, "step": 61800 }, { "epoch": 3.1, "grad_norm": 5.8859100341796875, "learning_rate": 7.663115577889449e-06, "loss": 0.8142, "step": 61900 }, { "epoch": 3.1, "grad_norm": 5.267168045043945, "learning_rate": 7.643015075376885e-06, "loss": 0.8087, "step": 62000 }, { "epoch": 3.1, "grad_norm": 5.58022403717041, "learning_rate": 7.622914572864322e-06, "loss": 0.8407, "step": 62100 }, { "epoch": 3.11, "grad_norm": 5.999646186828613, "learning_rate": 7.602814070351759e-06, "loss": 0.8214, "step": 62200 }, { "epoch": 3.12, "grad_norm": 4.449764251708984, "learning_rate": 7.582713567839196e-06, "loss": 0.8562, "step": 62300 }, { "epoch": 3.12, "grad_norm": 6.2914137840271, "learning_rate": 7.562613065326634e-06, "loss": 0.8359, "step": 62400 }, { "epoch": 3.12, "grad_norm": 5.262882709503174, "learning_rate": 7.5425125628140705e-06, "loss": 0.8368, "step": 62500 }, { "epoch": 3.13, "grad_norm": 4.981582164764404, "learning_rate": 7.5224120603015085e-06, "loss": 0.8138, "step": 62600 }, { "epoch": 3.13, "grad_norm": 5.330999374389648, "learning_rate": 7.502311557788945e-06, "loss": 0.8292, "step": 62700 }, { "epoch": 3.14, "grad_norm": 5.176852226257324, "learning_rate": 7.482211055276383e-06, "loss": 0.8108, "step": 62800 }, { "epoch": 3.15, "grad_norm": 8.819506645202637, "learning_rate": 7.462110552763819e-06, "loss": 0.854, "step": 62900 }, { "epoch": 3.15, "grad_norm": 5.1937642097473145, "learning_rate": 7.442010050251257e-06, "loss": 0.7943, "step": 63000 }, { "epoch": 3.15, "grad_norm": 4.399514675140381, "learning_rate": 7.421909547738694e-06, "loss": 0.7815, "step": 63100 }, { "epoch": 3.16, "grad_norm": 5.575798034667969, "learning_rate": 7.402010050251257e-06, "loss": 0.8481, "step": 63200 }, { "epoch": 3.17, "grad_norm": 5.097688674926758, "learning_rate": 7.381909547738694e-06, "loss": 0.8412, "step": 63300 }, { "epoch": 3.17, "grad_norm": 4.748641490936279, "learning_rate": 7.361809045226132e-06, "loss": 0.8058, "step": 63400 }, { "epoch": 3.17, "grad_norm": 7.151881694793701, "learning_rate": 7.341708542713568e-06, "loss": 0.7944, "step": 63500 }, { "epoch": 3.18, "grad_norm": 4.642664909362793, "learning_rate": 7.321608040201006e-06, "loss": 0.8185, "step": 63600 }, { "epoch": 3.19, "grad_norm": 5.354043483734131, "learning_rate": 7.301507537688442e-06, "loss": 0.7833, "step": 63700 }, { "epoch": 3.19, "grad_norm": 5.168720245361328, "learning_rate": 7.28140703517588e-06, "loss": 0.7966, "step": 63800 }, { "epoch": 3.19, "grad_norm": 4.343645095825195, "learning_rate": 7.261306532663317e-06, "loss": 0.7851, "step": 63900 }, { "epoch": 3.2, "grad_norm": 4.882009506225586, "learning_rate": 7.241206030150754e-06, "loss": 0.8069, "step": 64000 }, { "epoch": 3.21, "grad_norm": 4.954422473907471, "learning_rate": 7.2211055276381915e-06, "loss": 0.8029, "step": 64100 }, { "epoch": 3.21, "grad_norm": 3.5329108238220215, "learning_rate": 7.2010050251256295e-06, "loss": 0.8262, "step": 64200 }, { "epoch": 3.21, "grad_norm": 4.995691776275635, "learning_rate": 7.180904522613066e-06, "loss": 0.7899, "step": 64300 }, { "epoch": 3.22, "grad_norm": 4.367786884307861, "learning_rate": 7.160804020100504e-06, "loss": 0.8014, "step": 64400 }, { "epoch": 3.23, "grad_norm": 3.8841774463653564, "learning_rate": 7.14070351758794e-06, "loss": 0.8207, "step": 64500 }, { "epoch": 3.23, "grad_norm": 4.118581295013428, "learning_rate": 7.120603015075378e-06, "loss": 0.8399, "step": 64600 }, { "epoch": 3.23, "grad_norm": 5.320229530334473, "learning_rate": 7.100502512562814e-06, "loss": 0.8407, "step": 64700 }, { "epoch": 3.24, "grad_norm": 4.324894428253174, "learning_rate": 7.080402010050251e-06, "loss": 0.7897, "step": 64800 }, { "epoch": 3.25, "grad_norm": 6.917771816253662, "learning_rate": 7.060301507537689e-06, "loss": 0.8019, "step": 64900 }, { "epoch": 3.25, "grad_norm": 7.098691463470459, "learning_rate": 7.040201005025126e-06, "loss": 0.8058, "step": 65000 }, { "epoch": 3.25, "grad_norm": 5.166707992553711, "learning_rate": 7.020100502512564e-06, "loss": 0.7839, "step": 65100 }, { "epoch": 3.26, "grad_norm": 5.616134166717529, "learning_rate": 7e-06, "loss": 0.7821, "step": 65200 }, { "epoch": 3.27, "grad_norm": 7.216468334197998, "learning_rate": 6.979899497487438e-06, "loss": 0.7974, "step": 65300 }, { "epoch": 3.27, "grad_norm": 7.116774082183838, "learning_rate": 6.959798994974874e-06, "loss": 0.8446, "step": 65400 }, { "epoch": 3.27, "grad_norm": 6.275495529174805, "learning_rate": 6.939698492462312e-06, "loss": 0.8185, "step": 65500 }, { "epoch": 3.28, "grad_norm": 4.431950092315674, "learning_rate": 6.919597989949749e-06, "loss": 0.8203, "step": 65600 }, { "epoch": 3.29, "grad_norm": 6.8355302810668945, "learning_rate": 6.899497487437186e-06, "loss": 0.789, "step": 65700 }, { "epoch": 3.29, "grad_norm": 4.217498779296875, "learning_rate": 6.8793969849246235e-06, "loss": 0.7909, "step": 65800 }, { "epoch": 3.29, "grad_norm": 9.218932151794434, "learning_rate": 6.859899497487438e-06, "loss": 0.8387, "step": 65900 }, { "epoch": 3.3, "grad_norm": 5.607006072998047, "learning_rate": 6.8397989949748745e-06, "loss": 0.787, "step": 66000 }, { "epoch": 3.31, "grad_norm": 5.220907688140869, "learning_rate": 6.8196984924623124e-06, "loss": 0.8274, "step": 66100 }, { "epoch": 3.31, "grad_norm": 4.795065402984619, "learning_rate": 6.799597989949749e-06, "loss": 0.7833, "step": 66200 }, { "epoch": 3.31, "grad_norm": 5.653503894805908, "learning_rate": 6.779497487437187e-06, "loss": 0.7919, "step": 66300 }, { "epoch": 3.32, "grad_norm": 5.359546184539795, "learning_rate": 6.759396984924623e-06, "loss": 0.8, "step": 66400 }, { "epoch": 3.33, "grad_norm": 3.9278500080108643, "learning_rate": 6.739296482412061e-06, "loss": 0.816, "step": 66500 }, { "epoch": 3.33, "grad_norm": 6.889082908630371, "learning_rate": 6.719195979899498e-06, "loss": 0.8559, "step": 66600 }, { "epoch": 3.33, "grad_norm": 6.555418491363525, "learning_rate": 6.699095477386935e-06, "loss": 0.8084, "step": 66700 }, { "epoch": 3.34, "grad_norm": 5.0188798904418945, "learning_rate": 6.678994974874372e-06, "loss": 0.8199, "step": 66800 }, { "epoch": 3.34, "grad_norm": 5.341757297515869, "learning_rate": 6.6588944723618094e-06, "loss": 0.8301, "step": 66900 }, { "epoch": 3.35, "grad_norm": 7.638245105743408, "learning_rate": 6.6387939698492466e-06, "loss": 0.8156, "step": 67000 }, { "epoch": 3.35, "grad_norm": 4.004561424255371, "learning_rate": 6.6186934673366845e-06, "loss": 0.779, "step": 67100 }, { "epoch": 3.36, "grad_norm": 5.197673320770264, "learning_rate": 6.598592964824121e-06, "loss": 0.8086, "step": 67200 }, { "epoch": 3.37, "grad_norm": 5.757644176483154, "learning_rate": 6.578492462311559e-06, "loss": 0.8609, "step": 67300 }, { "epoch": 3.37, "grad_norm": 3.9802143573760986, "learning_rate": 6.558391959798995e-06, "loss": 0.814, "step": 67400 }, { "epoch": 3.38, "grad_norm": 4.6707892417907715, "learning_rate": 6.538291457286433e-06, "loss": 0.809, "step": 67500 }, { "epoch": 3.38, "grad_norm": 6.902073860168457, "learning_rate": 6.51819095477387e-06, "loss": 0.7862, "step": 67600 }, { "epoch": 3.38, "grad_norm": 4.793231010437012, "learning_rate": 6.498090452261307e-06, "loss": 0.8139, "step": 67700 }, { "epoch": 3.39, "grad_norm": 10.16287612915039, "learning_rate": 6.4779899497487444e-06, "loss": 0.78, "step": 67800 }, { "epoch": 3.4, "grad_norm": 5.308049201965332, "learning_rate": 6.4578894472361816e-06, "loss": 0.8235, "step": 67900 }, { "epoch": 3.4, "grad_norm": 5.0899271965026855, "learning_rate": 6.437788944723619e-06, "loss": 0.8222, "step": 68000 }, { "epoch": 3.41, "grad_norm": 5.174381732940674, "learning_rate": 6.417688442211055e-06, "loss": 0.7985, "step": 68100 }, { "epoch": 3.41, "grad_norm": 4.858529090881348, "learning_rate": 6.397587939698493e-06, "loss": 0.8224, "step": 68200 }, { "epoch": 3.42, "grad_norm": 8.091994285583496, "learning_rate": 6.37748743718593e-06, "loss": 0.8078, "step": 68300 }, { "epoch": 3.42, "grad_norm": 5.269526481628418, "learning_rate": 6.357386934673367e-06, "loss": 0.8006, "step": 68400 }, { "epoch": 3.42, "grad_norm": 5.161372184753418, "learning_rate": 6.337286432160804e-06, "loss": 0.814, "step": 68500 }, { "epoch": 3.43, "grad_norm": 4.547713279724121, "learning_rate": 6.3171859296482415e-06, "loss": 0.8024, "step": 68600 }, { "epoch": 3.44, "grad_norm": 5.171160697937012, "learning_rate": 6.297085427135679e-06, "loss": 0.7936, "step": 68700 }, { "epoch": 3.44, "grad_norm": 6.406951904296875, "learning_rate": 6.2769849246231166e-06, "loss": 0.7627, "step": 68800 }, { "epoch": 3.44, "grad_norm": 6.404531955718994, "learning_rate": 6.256884422110553e-06, "loss": 0.8081, "step": 68900 }, { "epoch": 3.45, "grad_norm": 4.409193992614746, "learning_rate": 6.236783919597991e-06, "loss": 0.8284, "step": 69000 }, { "epoch": 3.46, "grad_norm": 6.643680572509766, "learning_rate": 6.216683417085427e-06, "loss": 0.7908, "step": 69100 }, { "epoch": 3.46, "grad_norm": 6.344150543212891, "learning_rate": 6.196582914572865e-06, "loss": 0.8028, "step": 69200 }, { "epoch": 3.46, "grad_norm": 4.83349609375, "learning_rate": 6.176482412060301e-06, "loss": 0.7946, "step": 69300 }, { "epoch": 3.47, "grad_norm": 4.103985786437988, "learning_rate": 6.156381909547739e-06, "loss": 0.8089, "step": 69400 }, { "epoch": 3.48, "grad_norm": 4.681515693664551, "learning_rate": 6.1362814070351764e-06, "loss": 0.8104, "step": 69500 }, { "epoch": 3.48, "grad_norm": 3.5046350955963135, "learning_rate": 6.1161809045226136e-06, "loss": 0.8024, "step": 69600 }, { "epoch": 3.48, "grad_norm": 5.06920051574707, "learning_rate": 6.096080402010051e-06, "loss": 0.8043, "step": 69700 }, { "epoch": 3.49, "grad_norm": 6.419402599334717, "learning_rate": 6.075979899497489e-06, "loss": 0.8218, "step": 69800 }, { "epoch": 3.5, "grad_norm": 4.9620184898376465, "learning_rate": 6.055879396984925e-06, "loss": 0.7904, "step": 69900 }, { "epoch": 3.5, "grad_norm": 6.6012349128723145, "learning_rate": 6.035778894472363e-06, "loss": 0.8021, "step": 70000 }, { "epoch": 3.5, "eval_loss": 0.913910448551178, "eval_runtime": 21.5858, "eval_samples_per_second": 46.327, "eval_steps_per_second": 5.791, "step": 70000 }, { "epoch": 3.5, "grad_norm": 8.278429985046387, "learning_rate": 6.015678391959799e-06, "loss": 0.8255, "step": 70100 }, { "epoch": 3.51, "grad_norm": 5.309919834136963, "learning_rate": 5.995577889447237e-06, "loss": 0.8048, "step": 70200 }, { "epoch": 3.52, "grad_norm": 5.3151535987854, "learning_rate": 5.9754773869346735e-06, "loss": 0.796, "step": 70300 }, { "epoch": 3.52, "grad_norm": 6.962722301483154, "learning_rate": 5.9553768844221114e-06, "loss": 0.8448, "step": 70400 }, { "epoch": 3.52, "grad_norm": 6.564899444580078, "learning_rate": 5.9352763819095486e-06, "loss": 0.7782, "step": 70500 }, { "epoch": 3.53, "grad_norm": 4.522327423095703, "learning_rate": 5.915175879396985e-06, "loss": 0.8306, "step": 70600 }, { "epoch": 3.54, "grad_norm": 4.783290863037109, "learning_rate": 5.895075376884423e-06, "loss": 0.8448, "step": 70700 }, { "epoch": 3.54, "grad_norm": 8.016778945922852, "learning_rate": 5.874974874371859e-06, "loss": 0.805, "step": 70800 }, { "epoch": 3.54, "grad_norm": 6.962314605712891, "learning_rate": 5.854874371859297e-06, "loss": 0.7802, "step": 70900 }, { "epoch": 3.55, "grad_norm": 4.056068420410156, "learning_rate": 5.834773869346733e-06, "loss": 0.8146, "step": 71000 }, { "epoch": 3.56, "grad_norm": 4.548468589782715, "learning_rate": 5.814673366834171e-06, "loss": 0.7631, "step": 71100 }, { "epoch": 3.56, "grad_norm": 4.344750881195068, "learning_rate": 5.7945728643216085e-06, "loss": 0.8032, "step": 71200 }, { "epoch": 3.56, "grad_norm": 6.746843338012695, "learning_rate": 5.774472361809046e-06, "loss": 0.7622, "step": 71300 }, { "epoch": 3.57, "grad_norm": 5.048290729522705, "learning_rate": 5.754371859296483e-06, "loss": 0.8133, "step": 71400 }, { "epoch": 3.58, "grad_norm": 5.74857759475708, "learning_rate": 5.734271356783921e-06, "loss": 0.7834, "step": 71500 }, { "epoch": 3.58, "grad_norm": 4.5277934074401855, "learning_rate": 5.714170854271357e-06, "loss": 0.789, "step": 71600 }, { "epoch": 3.58, "grad_norm": 8.23270034790039, "learning_rate": 5.694070351758795e-06, "loss": 0.7613, "step": 71700 }, { "epoch": 3.59, "grad_norm": 3.9528987407684326, "learning_rate": 5.673969849246231e-06, "loss": 0.8081, "step": 71800 }, { "epoch": 3.59, "grad_norm": 5.704257965087891, "learning_rate": 5.653869346733669e-06, "loss": 0.8164, "step": 71900 }, { "epoch": 3.6, "grad_norm": 4.676042079925537, "learning_rate": 5.6337688442211055e-06, "loss": 0.8202, "step": 72000 }, { "epoch": 3.6, "grad_norm": 5.20451021194458, "learning_rate": 5.6136683417085434e-06, "loss": 0.7953, "step": 72100 }, { "epoch": 3.61, "grad_norm": 7.501960277557373, "learning_rate": 5.5935678391959806e-06, "loss": 0.8168, "step": 72200 }, { "epoch": 3.62, "grad_norm": 7.015203475952148, "learning_rate": 5.573467336683418e-06, "loss": 0.789, "step": 72300 }, { "epoch": 3.62, "grad_norm": 4.428484916687012, "learning_rate": 5.553366834170855e-06, "loss": 0.8092, "step": 72400 }, { "epoch": 3.62, "grad_norm": 4.477147102355957, "learning_rate": 5.533266331658293e-06, "loss": 0.7843, "step": 72500 }, { "epoch": 3.63, "grad_norm": 5.1699748039245605, "learning_rate": 5.513165829145729e-06, "loss": 0.7996, "step": 72600 }, { "epoch": 3.63, "grad_norm": 5.133453369140625, "learning_rate": 5.493065326633167e-06, "loss": 0.8233, "step": 72700 }, { "epoch": 3.64, "grad_norm": 4.902942657470703, "learning_rate": 5.472964824120603e-06, "loss": 0.7586, "step": 72800 }, { "epoch": 3.65, "grad_norm": 6.46637487411499, "learning_rate": 5.4528643216080405e-06, "loss": 0.7959, "step": 72900 }, { "epoch": 3.65, "grad_norm": 7.144857406616211, "learning_rate": 5.432763819095478e-06, "loss": 0.8197, "step": 73000 }, { "epoch": 3.66, "grad_norm": 6.084510326385498, "learning_rate": 5.412663316582915e-06, "loss": 0.8133, "step": 73100 }, { "epoch": 3.66, "grad_norm": 5.132942199707031, "learning_rate": 5.392562814070353e-06, "loss": 0.7482, "step": 73200 }, { "epoch": 3.67, "grad_norm": 6.69909143447876, "learning_rate": 5.372462311557789e-06, "loss": 0.7498, "step": 73300 }, { "epoch": 3.67, "grad_norm": 7.99722146987915, "learning_rate": 5.352361809045227e-06, "loss": 0.7857, "step": 73400 }, { "epoch": 3.67, "grad_norm": 7.380476951599121, "learning_rate": 5.332261306532663e-06, "loss": 0.8081, "step": 73500 }, { "epoch": 3.68, "grad_norm": 6.441634178161621, "learning_rate": 5.312160804020101e-06, "loss": 0.7737, "step": 73600 }, { "epoch": 3.69, "grad_norm": 5.027355194091797, "learning_rate": 5.2920603015075375e-06, "loss": 0.7991, "step": 73700 }, { "epoch": 3.69, "grad_norm": 8.128876686096191, "learning_rate": 5.2719597989949755e-06, "loss": 0.8271, "step": 73800 }, { "epoch": 3.69, "grad_norm": 4.09487247467041, "learning_rate": 5.251859296482413e-06, "loss": 0.775, "step": 73900 }, { "epoch": 3.7, "grad_norm": 6.368048667907715, "learning_rate": 5.231959798994976e-06, "loss": 0.7872, "step": 74000 }, { "epoch": 3.71, "grad_norm": 4.72104549407959, "learning_rate": 5.211859296482412e-06, "loss": 0.8057, "step": 74100 }, { "epoch": 3.71, "grad_norm": 5.083056926727295, "learning_rate": 5.19175879396985e-06, "loss": 0.7839, "step": 74200 }, { "epoch": 3.71, "grad_norm": 5.289855003356934, "learning_rate": 5.171658291457286e-06, "loss": 0.7829, "step": 74300 }, { "epoch": 3.72, "grad_norm": 5.842662811279297, "learning_rate": 5.151557788944724e-06, "loss": 0.7782, "step": 74400 }, { "epoch": 3.73, "grad_norm": 6.445068836212158, "learning_rate": 5.131457286432161e-06, "loss": 0.8335, "step": 74500 }, { "epoch": 3.73, "grad_norm": 4.2318220138549805, "learning_rate": 5.111356783919599e-06, "loss": 0.7942, "step": 74600 }, { "epoch": 3.73, "grad_norm": 8.975232124328613, "learning_rate": 5.091256281407036e-06, "loss": 0.8284, "step": 74700 }, { "epoch": 3.74, "grad_norm": 4.482039451599121, "learning_rate": 5.071155778894473e-06, "loss": 0.8281, "step": 74800 }, { "epoch": 3.75, "grad_norm": 4.330044269561768, "learning_rate": 5.05105527638191e-06, "loss": 0.7737, "step": 74900 }, { "epoch": 3.75, "grad_norm": 4.636693000793457, "learning_rate": 5.030954773869348e-06, "loss": 0.7882, "step": 75000 }, { "epoch": 3.75, "grad_norm": 4.175960540771484, "learning_rate": 5.010854271356784e-06, "loss": 0.7417, "step": 75100 }, { "epoch": 3.76, "grad_norm": 4.081864833831787, "learning_rate": 4.990753768844221e-06, "loss": 0.7579, "step": 75200 }, { "epoch": 3.77, "grad_norm": 4.608290672302246, "learning_rate": 4.9706532663316585e-06, "loss": 0.799, "step": 75300 }, { "epoch": 3.77, "grad_norm": 4.851296901702881, "learning_rate": 4.950552763819096e-06, "loss": 0.7998, "step": 75400 }, { "epoch": 3.77, "grad_norm": 4.3285112380981445, "learning_rate": 4.930452261306533e-06, "loss": 0.8093, "step": 75500 }, { "epoch": 3.78, "grad_norm": 4.927236080169678, "learning_rate": 4.910552763819096e-06, "loss": 0.7793, "step": 75600 }, { "epoch": 3.79, "grad_norm": 6.193936824798584, "learning_rate": 4.890452261306533e-06, "loss": 0.8072, "step": 75700 }, { "epoch": 3.79, "grad_norm": 4.687440872192383, "learning_rate": 4.87035175879397e-06, "loss": 0.8, "step": 75800 }, { "epoch": 3.79, "grad_norm": 4.473381519317627, "learning_rate": 4.850251256281407e-06, "loss": 0.8027, "step": 75900 }, { "epoch": 3.8, "grad_norm": 4.676540374755859, "learning_rate": 4.8301507537688445e-06, "loss": 0.8029, "step": 76000 }, { "epoch": 3.81, "grad_norm": 4.967388153076172, "learning_rate": 4.810050251256282e-06, "loss": 0.7539, "step": 76100 }, { "epoch": 3.81, "grad_norm": 4.699183940887451, "learning_rate": 4.789949748743719e-06, "loss": 0.7651, "step": 76200 }, { "epoch": 3.81, "grad_norm": 4.629420757293701, "learning_rate": 4.769849246231156e-06, "loss": 0.7803, "step": 76300 }, { "epoch": 3.82, "grad_norm": 5.920188903808594, "learning_rate": 4.749748743718594e-06, "loss": 0.8017, "step": 76400 }, { "epoch": 3.83, "grad_norm": 6.677817344665527, "learning_rate": 4.729648241206031e-06, "loss": 0.8216, "step": 76500 }, { "epoch": 3.83, "grad_norm": 5.312260627746582, "learning_rate": 4.709547738693468e-06, "loss": 0.7827, "step": 76600 }, { "epoch": 3.83, "grad_norm": 4.119052410125732, "learning_rate": 4.689447236180905e-06, "loss": 0.7483, "step": 76700 }, { "epoch": 3.84, "grad_norm": 4.5976715087890625, "learning_rate": 4.669346733668342e-06, "loss": 0.7657, "step": 76800 }, { "epoch": 3.84, "grad_norm": 5.721061706542969, "learning_rate": 4.649246231155779e-06, "loss": 0.7817, "step": 76900 }, { "epoch": 3.85, "grad_norm": 7.369571208953857, "learning_rate": 4.629145728643216e-06, "loss": 0.7402, "step": 77000 }, { "epoch": 3.85, "grad_norm": 5.615093231201172, "learning_rate": 4.609045226130654e-06, "loss": 0.811, "step": 77100 }, { "epoch": 3.86, "grad_norm": 6.276815414428711, "learning_rate": 4.588944723618091e-06, "loss": 0.7909, "step": 77200 }, { "epoch": 3.87, "grad_norm": 4.287708759307861, "learning_rate": 4.568844221105528e-06, "loss": 0.8012, "step": 77300 }, { "epoch": 3.87, "grad_norm": 4.280378818511963, "learning_rate": 4.548743718592965e-06, "loss": 0.8205, "step": 77400 }, { "epoch": 3.88, "grad_norm": 8.309846878051758, "learning_rate": 4.528643216080402e-06, "loss": 0.7785, "step": 77500 }, { "epoch": 3.88, "grad_norm": 5.504384517669678, "learning_rate": 4.508542713567839e-06, "loss": 0.7678, "step": 77600 }, { "epoch": 3.88, "grad_norm": 4.6738996505737305, "learning_rate": 4.4884422110552765e-06, "loss": 0.8207, "step": 77700 }, { "epoch": 3.89, "grad_norm": 8.038127899169922, "learning_rate": 4.468341708542714e-06, "loss": 0.7788, "step": 77800 }, { "epoch": 3.9, "grad_norm": 6.898759365081787, "learning_rate": 4.448241206030151e-06, "loss": 0.7575, "step": 77900 }, { "epoch": 3.9, "grad_norm": 5.893388271331787, "learning_rate": 4.428140703517588e-06, "loss": 0.7842, "step": 78000 }, { "epoch": 3.91, "grad_norm": 7.37433385848999, "learning_rate": 4.408040201005026e-06, "loss": 0.756, "step": 78100 }, { "epoch": 3.91, "grad_norm": 6.226987838745117, "learning_rate": 4.387939698492463e-06, "loss": 0.7818, "step": 78200 }, { "epoch": 3.92, "grad_norm": 6.20886754989624, "learning_rate": 4.368040201005025e-06, "loss": 0.8057, "step": 78300 }, { "epoch": 3.92, "grad_norm": 3.9309849739074707, "learning_rate": 4.3479396984924625e-06, "loss": 0.8052, "step": 78400 }, { "epoch": 3.92, "grad_norm": 4.972345352172852, "learning_rate": 4.3278391959799e-06, "loss": 0.7666, "step": 78500 }, { "epoch": 3.93, "grad_norm": 8.730260848999023, "learning_rate": 4.307738693467337e-06, "loss": 0.7897, "step": 78600 }, { "epoch": 3.94, "grad_norm": 6.734485626220703, "learning_rate": 4.287638190954774e-06, "loss": 0.7595, "step": 78700 }, { "epoch": 3.94, "grad_norm": 6.456557750701904, "learning_rate": 4.267537688442212e-06, "loss": 0.7924, "step": 78800 }, { "epoch": 3.94, "grad_norm": 4.421884059906006, "learning_rate": 4.247437185929649e-06, "loss": 0.7821, "step": 78900 }, { "epoch": 3.95, "grad_norm": 7.825852394104004, "learning_rate": 4.227336683417086e-06, "loss": 0.7834, "step": 79000 }, { "epoch": 3.96, "grad_norm": 6.445671081542969, "learning_rate": 4.207236180904523e-06, "loss": 0.7794, "step": 79100 }, { "epoch": 3.96, "grad_norm": 3.7435953617095947, "learning_rate": 4.18713567839196e-06, "loss": 0.7218, "step": 79200 }, { "epoch": 3.96, "grad_norm": 10.594905853271484, "learning_rate": 4.1670351758793975e-06, "loss": 0.7957, "step": 79300 }, { "epoch": 3.97, "grad_norm": 7.166194438934326, "learning_rate": 4.146934673366835e-06, "loss": 0.7936, "step": 79400 }, { "epoch": 3.98, "grad_norm": 4.773101329803467, "learning_rate": 4.126834170854272e-06, "loss": 0.7721, "step": 79500 }, { "epoch": 3.98, "grad_norm": 5.979006767272949, "learning_rate": 4.106733668341709e-06, "loss": 0.7899, "step": 79600 }, { "epoch": 3.98, "grad_norm": 6.46978235244751, "learning_rate": 4.086633165829146e-06, "loss": 0.7874, "step": 79700 }, { "epoch": 3.99, "grad_norm": 5.1106977462768555, "learning_rate": 4.066532663316583e-06, "loss": 0.7644, "step": 79800 }, { "epoch": 4.0, "grad_norm": 7.125823974609375, "learning_rate": 4.0466331658291464e-06, "loss": 0.792, "step": 79900 }, { "epoch": 4.0, "grad_norm": 5.539035797119141, "learning_rate": 4.026532663316583e-06, "loss": 0.7779, "step": 80000 }, { "epoch": 4.0, "eval_loss": 0.8846080303192139, "eval_runtime": 21.6073, "eval_samples_per_second": 46.281, "eval_steps_per_second": 5.785, "step": 80000 }, { "epoch": 4.0, "grad_norm": 5.7579193115234375, "learning_rate": 4.00643216080402e-06, "loss": 0.6947, "step": 80100 }, { "epoch": 4.01, "grad_norm": 5.583180904388428, "learning_rate": 3.986331658291458e-06, "loss": 0.6614, "step": 80200 }, { "epoch": 4.01, "grad_norm": 5.107233047485352, "learning_rate": 3.966231155778895e-06, "loss": 0.6936, "step": 80300 }, { "epoch": 4.02, "grad_norm": 5.804276466369629, "learning_rate": 3.946130653266332e-06, "loss": 0.6946, "step": 80400 }, { "epoch": 4.03, "grad_norm": 6.738204479217529, "learning_rate": 3.926030150753769e-06, "loss": 0.6681, "step": 80500 }, { "epoch": 4.03, "grad_norm": 6.331192970275879, "learning_rate": 3.905929648241206e-06, "loss": 0.6839, "step": 80600 }, { "epoch": 4.04, "grad_norm": 5.382104873657227, "learning_rate": 3.8858291457286434e-06, "loss": 0.6566, "step": 80700 }, { "epoch": 4.04, "grad_norm": 6.394933223724365, "learning_rate": 3.8657286432160806e-06, "loss": 0.7378, "step": 80800 }, { "epoch": 4.04, "grad_norm": 5.813870429992676, "learning_rate": 3.845628140703518e-06, "loss": 0.7112, "step": 80900 }, { "epoch": 4.05, "grad_norm": 6.095046520233154, "learning_rate": 3.825527638190955e-06, "loss": 0.6885, "step": 81000 }, { "epoch": 4.05, "grad_norm": 6.212576866149902, "learning_rate": 3.8054271356783924e-06, "loss": 0.6658, "step": 81100 }, { "epoch": 4.06, "grad_norm": 4.426722526550293, "learning_rate": 3.7853266331658295e-06, "loss": 0.6915, "step": 81200 }, { "epoch": 4.07, "grad_norm": 7.474303722381592, "learning_rate": 3.7652261306532666e-06, "loss": 0.6486, "step": 81300 }, { "epoch": 4.07, "grad_norm": 7.347512245178223, "learning_rate": 3.7451256281407038e-06, "loss": 0.7078, "step": 81400 }, { "epoch": 4.08, "grad_norm": 9.426233291625977, "learning_rate": 3.7250251256281413e-06, "loss": 0.6951, "step": 81500 }, { "epoch": 4.08, "grad_norm": 5.577968597412109, "learning_rate": 3.7049246231155784e-06, "loss": 0.6905, "step": 81600 }, { "epoch": 4.08, "grad_norm": 6.477217197418213, "learning_rate": 3.6848241206030156e-06, "loss": 0.663, "step": 81700 }, { "epoch": 4.09, "grad_norm": 6.228948593139648, "learning_rate": 3.6647236180904527e-06, "loss": 0.6677, "step": 81800 }, { "epoch": 4.09, "grad_norm": 5.777594089508057, "learning_rate": 3.64462311557789e-06, "loss": 0.6905, "step": 81900 }, { "epoch": 4.1, "grad_norm": 6.7552080154418945, "learning_rate": 3.624522613065327e-06, "loss": 0.7086, "step": 82000 }, { "epoch": 4.11, "grad_norm": 5.3912553787231445, "learning_rate": 3.6044221105527645e-06, "loss": 0.6833, "step": 82100 }, { "epoch": 4.11, "grad_norm": 7.366456508636475, "learning_rate": 3.5843216080402016e-06, "loss": 0.6618, "step": 82200 }, { "epoch": 4.12, "grad_norm": 4.593729019165039, "learning_rate": 3.5642211055276383e-06, "loss": 0.6397, "step": 82300 }, { "epoch": 4.12, "grad_norm": 6.743685722351074, "learning_rate": 3.5441206030150755e-06, "loss": 0.7233, "step": 82400 }, { "epoch": 4.12, "grad_norm": 6.125808238983154, "learning_rate": 3.5240201005025126e-06, "loss": 0.6804, "step": 82500 }, { "epoch": 4.13, "grad_norm": 7.0340752601623535, "learning_rate": 3.5039195979899497e-06, "loss": 0.699, "step": 82600 }, { "epoch": 4.13, "grad_norm": 7.293619632720947, "learning_rate": 3.4838190954773873e-06, "loss": 0.6572, "step": 82700 }, { "epoch": 4.14, "grad_norm": 6.3135552406311035, "learning_rate": 3.4637185929648244e-06, "loss": 0.6364, "step": 82800 }, { "epoch": 4.14, "grad_norm": 5.138033390045166, "learning_rate": 3.4436180904522615e-06, "loss": 0.6815, "step": 82900 }, { "epoch": 4.15, "grad_norm": 6.240560054779053, "learning_rate": 3.4235175879396986e-06, "loss": 0.6919, "step": 83000 }, { "epoch": 4.16, "grad_norm": 4.19957971572876, "learning_rate": 3.4034170854271358e-06, "loss": 0.6845, "step": 83100 }, { "epoch": 4.16, "grad_norm": 6.340314865112305, "learning_rate": 3.383316582914573e-06, "loss": 0.653, "step": 83200 }, { "epoch": 4.17, "grad_norm": 3.309894323348999, "learning_rate": 3.3632160804020104e-06, "loss": 0.6612, "step": 83300 }, { "epoch": 4.17, "grad_norm": 5.189826011657715, "learning_rate": 3.3431155778894476e-06, "loss": 0.6871, "step": 83400 }, { "epoch": 4.17, "grad_norm": 6.599611759185791, "learning_rate": 3.3230150753768847e-06, "loss": 0.6743, "step": 83500 }, { "epoch": 4.18, "grad_norm": 26.47356414794922, "learning_rate": 3.302914572864322e-06, "loss": 0.6312, "step": 83600 }, { "epoch": 4.18, "grad_norm": 8.280220985412598, "learning_rate": 3.282814070351759e-06, "loss": 0.6276, "step": 83700 }, { "epoch": 4.19, "grad_norm": 7.8088555335998535, "learning_rate": 3.2627135678391965e-06, "loss": 0.6514, "step": 83800 }, { "epoch": 4.2, "grad_norm": 5.11159086227417, "learning_rate": 3.2426130653266336e-06, "loss": 0.6262, "step": 83900 }, { "epoch": 4.2, "grad_norm": 6.656592845916748, "learning_rate": 3.2225125628140708e-06, "loss": 0.6889, "step": 84000 }, { "epoch": 4.21, "grad_norm": 7.140279769897461, "learning_rate": 3.202412060301508e-06, "loss": 0.6435, "step": 84100 }, { "epoch": 4.21, "grad_norm": 6.478577613830566, "learning_rate": 3.182311557788945e-06, "loss": 0.6593, "step": 84200 }, { "epoch": 4.21, "grad_norm": 6.854846477508545, "learning_rate": 3.1622110552763826e-06, "loss": 0.7097, "step": 84300 }, { "epoch": 4.22, "grad_norm": 5.070549488067627, "learning_rate": 3.1421105527638197e-06, "loss": 0.6736, "step": 84400 }, { "epoch": 4.22, "grad_norm": 7.519010543823242, "learning_rate": 3.122010050251257e-06, "loss": 0.6518, "step": 84500 }, { "epoch": 4.23, "grad_norm": 6.662156105041504, "learning_rate": 3.1019095477386935e-06, "loss": 0.675, "step": 84600 }, { "epoch": 4.24, "grad_norm": 7.687413215637207, "learning_rate": 3.0818090452261307e-06, "loss": 0.6477, "step": 84700 }, { "epoch": 4.24, "grad_norm": 5.934724807739258, "learning_rate": 3.0617085427135678e-06, "loss": 0.6492, "step": 84800 }, { "epoch": 4.25, "grad_norm": 9.457836151123047, "learning_rate": 3.041608040201005e-06, "loss": 0.633, "step": 84900 }, { "epoch": 4.25, "grad_norm": 6.666748523712158, "learning_rate": 3.0215075376884425e-06, "loss": 0.6693, "step": 85000 }, { "epoch": 4.25, "grad_norm": 6.439404487609863, "learning_rate": 3.0014070351758796e-06, "loss": 0.6643, "step": 85100 }, { "epoch": 4.26, "grad_norm": 7.257474422454834, "learning_rate": 2.9813065326633167e-06, "loss": 0.6623, "step": 85200 }, { "epoch": 4.26, "grad_norm": 4.707270622253418, "learning_rate": 2.961206030150754e-06, "loss": 0.6471, "step": 85300 }, { "epoch": 4.27, "grad_norm": 5.7160844802856445, "learning_rate": 2.941105527638191e-06, "loss": 0.683, "step": 85400 }, { "epoch": 4.28, "grad_norm": 6.038240432739258, "learning_rate": 2.9210050251256285e-06, "loss": 0.6742, "step": 85500 }, { "epoch": 4.28, "grad_norm": 6.851832866668701, "learning_rate": 2.9009045226130656e-06, "loss": 0.6748, "step": 85600 }, { "epoch": 4.29, "grad_norm": 5.691901683807373, "learning_rate": 2.8808040201005028e-06, "loss": 0.6703, "step": 85700 }, { "epoch": 4.29, "grad_norm": 6.378291130065918, "learning_rate": 2.86070351758794e-06, "loss": 0.6487, "step": 85800 }, { "epoch": 4.29, "grad_norm": 4.439263343811035, "learning_rate": 2.840603015075377e-06, "loss": 0.6598, "step": 85900 }, { "epoch": 4.3, "grad_norm": 6.466790199279785, "learning_rate": 2.8205025125628146e-06, "loss": 0.6914, "step": 86000 }, { "epoch": 4.3, "grad_norm": 6.0331902503967285, "learning_rate": 2.8004020100502517e-06, "loss": 0.6929, "step": 86100 }, { "epoch": 4.31, "grad_norm": 4.750064849853516, "learning_rate": 2.780301507537689e-06, "loss": 0.6715, "step": 86200 }, { "epoch": 4.32, "grad_norm": 8.289958953857422, "learning_rate": 2.760201005025126e-06, "loss": 0.6975, "step": 86300 }, { "epoch": 4.32, "grad_norm": 10.746756553649902, "learning_rate": 2.740100502512563e-06, "loss": 0.6454, "step": 86400 }, { "epoch": 4.33, "grad_norm": 6.792548656463623, "learning_rate": 2.720201005025126e-06, "loss": 0.7056, "step": 86500 }, { "epoch": 4.33, "grad_norm": 5.030031204223633, "learning_rate": 2.700100502512563e-06, "loss": 0.6711, "step": 86600 }, { "epoch": 4.33, "grad_norm": 4.626148223876953, "learning_rate": 2.680201005025126e-06, "loss": 0.676, "step": 86700 }, { "epoch": 4.34, "grad_norm": 8.56241512298584, "learning_rate": 2.660100502512563e-06, "loss": 0.6548, "step": 86800 }, { "epoch": 4.34, "grad_norm": 9.747623443603516, "learning_rate": 2.64e-06, "loss": 0.6883, "step": 86900 }, { "epoch": 4.35, "grad_norm": 8.002108573913574, "learning_rate": 2.6198994974874377e-06, "loss": 0.7166, "step": 87000 }, { "epoch": 4.36, "grad_norm": 6.09249210357666, "learning_rate": 2.599798994974875e-06, "loss": 0.6841, "step": 87100 }, { "epoch": 4.36, "grad_norm": 5.512220859527588, "learning_rate": 2.579698492462312e-06, "loss": 0.6816, "step": 87200 }, { "epoch": 4.37, "grad_norm": 5.139577388763428, "learning_rate": 2.559597989949749e-06, "loss": 0.6475, "step": 87300 }, { "epoch": 4.37, "grad_norm": 11.360005378723145, "learning_rate": 2.539497487437186e-06, "loss": 0.7434, "step": 87400 }, { "epoch": 4.38, "grad_norm": 5.06545877456665, "learning_rate": 2.5193969849246237e-06, "loss": 0.6626, "step": 87500 }, { "epoch": 4.38, "grad_norm": 4.432734966278076, "learning_rate": 2.4992964824120604e-06, "loss": 0.6357, "step": 87600 }, { "epoch": 4.38, "grad_norm": 7.90862512588501, "learning_rate": 2.4791959798994976e-06, "loss": 0.6039, "step": 87700 }, { "epoch": 4.39, "grad_norm": 4.959092617034912, "learning_rate": 2.459095477386935e-06, "loss": 0.6699, "step": 87800 }, { "epoch": 4.39, "grad_norm": 7.495928764343262, "learning_rate": 2.4389949748743723e-06, "loss": 0.6648, "step": 87900 }, { "epoch": 4.4, "grad_norm": 10.80557918548584, "learning_rate": 2.4188944723618094e-06, "loss": 0.6532, "step": 88000 }, { "epoch": 4.41, "grad_norm": 7.1374006271362305, "learning_rate": 2.3987939698492465e-06, "loss": 0.6903, "step": 88100 }, { "epoch": 4.41, "grad_norm": 12.275821685791016, "learning_rate": 2.3786934673366836e-06, "loss": 0.6433, "step": 88200 }, { "epoch": 4.42, "grad_norm": 8.747936248779297, "learning_rate": 2.3585929648241208e-06, "loss": 0.62, "step": 88300 }, { "epoch": 4.42, "grad_norm": 5.3552985191345215, "learning_rate": 2.338492462311558e-06, "loss": 0.6525, "step": 88400 }, { "epoch": 4.42, "grad_norm": 7.049367427825928, "learning_rate": 2.318391959798995e-06, "loss": 0.6742, "step": 88500 }, { "epoch": 4.43, "grad_norm": 8.841930389404297, "learning_rate": 2.298291457286432e-06, "loss": 0.6806, "step": 88600 }, { "epoch": 4.43, "grad_norm": 4.58371114730835, "learning_rate": 2.2781909547738697e-06, "loss": 0.6469, "step": 88700 }, { "epoch": 4.44, "grad_norm": 8.08278751373291, "learning_rate": 2.258090452261307e-06, "loss": 0.6918, "step": 88800 }, { "epoch": 4.45, "grad_norm": 5.989361763000488, "learning_rate": 2.237989949748744e-06, "loss": 0.7048, "step": 88900 }, { "epoch": 4.45, "grad_norm": 8.200750350952148, "learning_rate": 2.217889447236181e-06, "loss": 0.6222, "step": 89000 }, { "epoch": 4.46, "grad_norm": 7.658218860626221, "learning_rate": 2.197788944723618e-06, "loss": 0.653, "step": 89100 }, { "epoch": 4.46, "grad_norm": 6.744418621063232, "learning_rate": 2.177889447236181e-06, "loss": 0.6698, "step": 89200 }, { "epoch": 4.46, "grad_norm": 4.423871994018555, "learning_rate": 2.157788944723618e-06, "loss": 0.6665, "step": 89300 }, { "epoch": 4.47, "grad_norm": 7.368816375732422, "learning_rate": 2.1376884422110557e-06, "loss": 0.6766, "step": 89400 }, { "epoch": 4.47, "grad_norm": 4.649584770202637, "learning_rate": 2.117587939698493e-06, "loss": 0.6464, "step": 89500 }, { "epoch": 4.48, "grad_norm": 7.77773904800415, "learning_rate": 2.09748743718593e-06, "loss": 0.6721, "step": 89600 }, { "epoch": 4.49, "grad_norm": 6.5589280128479, "learning_rate": 2.0773869346733667e-06, "loss": 0.6817, "step": 89700 }, { "epoch": 4.49, "grad_norm": 10.153287887573242, "learning_rate": 2.0572864321608042e-06, "loss": 0.645, "step": 89800 }, { "epoch": 4.5, "grad_norm": 8.705924987792969, "learning_rate": 2.0371859296482414e-06, "loss": 0.707, "step": 89900 }, { "epoch": 4.5, "grad_norm": 5.7329511642456055, "learning_rate": 2.0170854271356785e-06, "loss": 0.6834, "step": 90000 }, { "epoch": 4.5, "eval_loss": 0.9503761529922485, "eval_runtime": 21.641, "eval_samples_per_second": 46.209, "eval_steps_per_second": 5.776, "step": 90000 }, { "epoch": 4.5, "grad_norm": 6.902284622192383, "learning_rate": 1.9969849246231156e-06, "loss": 0.6237, "step": 90100 }, { "epoch": 4.51, "grad_norm": 5.6710710525512695, "learning_rate": 1.9768844221105527e-06, "loss": 0.6638, "step": 90200 }, { "epoch": 4.51, "grad_norm": 6.364370346069336, "learning_rate": 1.9567839195979903e-06, "loss": 0.6537, "step": 90300 }, { "epoch": 4.52, "grad_norm": 5.928137302398682, "learning_rate": 1.9366834170854274e-06, "loss": 0.6266, "step": 90400 }, { "epoch": 4.53, "grad_norm": 8.740313529968262, "learning_rate": 1.9165829145728645e-06, "loss": 0.6198, "step": 90500 }, { "epoch": 4.53, "grad_norm": 8.339399337768555, "learning_rate": 1.8964824120603017e-06, "loss": 0.6482, "step": 90600 }, { "epoch": 4.54, "grad_norm": 8.13129997253418, "learning_rate": 1.876381909547739e-06, "loss": 0.6521, "step": 90700 }, { "epoch": 4.54, "grad_norm": 10.06900405883789, "learning_rate": 1.856281407035176e-06, "loss": 0.6472, "step": 90800 }, { "epoch": 4.54, "grad_norm": 6.953003406524658, "learning_rate": 1.836180904522613e-06, "loss": 0.6185, "step": 90900 }, { "epoch": 4.55, "grad_norm": 7.572219371795654, "learning_rate": 1.8160804020100504e-06, "loss": 0.664, "step": 91000 }, { "epoch": 4.55, "grad_norm": 8.318469047546387, "learning_rate": 1.7959798994974875e-06, "loss": 0.6442, "step": 91100 }, { "epoch": 4.56, "grad_norm": 6.608754634857178, "learning_rate": 1.7758793969849246e-06, "loss": 0.6398, "step": 91200 }, { "epoch": 4.56, "grad_norm": 7.397676467895508, "learning_rate": 1.755778894472362e-06, "loss": 0.6689, "step": 91300 }, { "epoch": 4.57, "grad_norm": 10.482325553894043, "learning_rate": 1.7356783919597991e-06, "loss": 0.6792, "step": 91400 }, { "epoch": 4.58, "grad_norm": 5.926417827606201, "learning_rate": 1.7155778894472364e-06, "loss": 0.6774, "step": 91500 }, { "epoch": 4.58, "grad_norm": 8.223274230957031, "learning_rate": 1.6954773869346736e-06, "loss": 0.6528, "step": 91600 }, { "epoch": 4.58, "grad_norm": 7.564822196960449, "learning_rate": 1.6753768844221107e-06, "loss": 0.6224, "step": 91700 }, { "epoch": 4.59, "grad_norm": 6.845765113830566, "learning_rate": 1.655276381909548e-06, "loss": 0.6984, "step": 91800 }, { "epoch": 4.59, "grad_norm": 6.044042587280273, "learning_rate": 1.6353768844221107e-06, "loss": 0.6211, "step": 91900 }, { "epoch": 4.6, "grad_norm": 12.825979232788086, "learning_rate": 1.615276381909548e-06, "loss": 0.6851, "step": 92000 }, { "epoch": 4.61, "grad_norm": 6.73763370513916, "learning_rate": 1.5951758793969851e-06, "loss": 0.6161, "step": 92100 }, { "epoch": 4.61, "grad_norm": 6.827399730682373, "learning_rate": 1.5750753768844223e-06, "loss": 0.6525, "step": 92200 }, { "epoch": 4.62, "grad_norm": 6.6664228439331055, "learning_rate": 1.5549748743718594e-06, "loss": 0.6617, "step": 92300 }, { "epoch": 4.62, "grad_norm": 9.772034645080566, "learning_rate": 1.5348743718592965e-06, "loss": 0.6687, "step": 92400 }, { "epoch": 4.62, "grad_norm": 6.625182151794434, "learning_rate": 1.5147738693467336e-06, "loss": 0.6545, "step": 92500 }, { "epoch": 4.63, "grad_norm": 10.207441329956055, "learning_rate": 1.494673366834171e-06, "loss": 0.6332, "step": 92600 }, { "epoch": 4.63, "grad_norm": 9.929265975952148, "learning_rate": 1.474572864321608e-06, "loss": 0.6391, "step": 92700 }, { "epoch": 4.64, "grad_norm": 6.050763130187988, "learning_rate": 1.4544723618090452e-06, "loss": 0.6708, "step": 92800 }, { "epoch": 4.64, "grad_norm": 5.504277229309082, "learning_rate": 1.4343718592964826e-06, "loss": 0.6578, "step": 92900 }, { "epoch": 4.65, "grad_norm": 7.113737106323242, "learning_rate": 1.4142713567839197e-06, "loss": 0.6419, "step": 93000 }, { "epoch": 4.66, "grad_norm": 7.181005001068115, "learning_rate": 1.394170854271357e-06, "loss": 0.6298, "step": 93100 }, { "epoch": 4.66, "grad_norm": 8.930741310119629, "learning_rate": 1.3740703517587942e-06, "loss": 0.6734, "step": 93200 }, { "epoch": 4.67, "grad_norm": 6.288244724273682, "learning_rate": 1.3539698492462313e-06, "loss": 0.6307, "step": 93300 }, { "epoch": 4.67, "grad_norm": 6.91972017288208, "learning_rate": 1.3338693467336686e-06, "loss": 0.676, "step": 93400 }, { "epoch": 4.67, "grad_norm": 8.017012596130371, "learning_rate": 1.3137688442211055e-06, "loss": 0.6157, "step": 93500 }, { "epoch": 4.68, "grad_norm": 4.738548755645752, "learning_rate": 1.2936683417085427e-06, "loss": 0.679, "step": 93600 }, { "epoch": 4.69, "grad_norm": 6.201863765716553, "learning_rate": 1.27356783919598e-06, "loss": 0.6542, "step": 93700 }, { "epoch": 4.69, "grad_norm": 7.595000267028809, "learning_rate": 1.2534673366834171e-06, "loss": 0.6659, "step": 93800 }, { "epoch": 4.7, "grad_norm": 5.57780647277832, "learning_rate": 1.2333668341708543e-06, "loss": 0.6381, "step": 93900 }, { "epoch": 4.7, "grad_norm": 8.426780700683594, "learning_rate": 1.2132663316582916e-06, "loss": 0.6705, "step": 94000 }, { "epoch": 4.71, "grad_norm": 7.012176990509033, "learning_rate": 1.1931658291457287e-06, "loss": 0.6874, "step": 94100 }, { "epoch": 4.71, "grad_norm": 7.747401237487793, "learning_rate": 1.173065326633166e-06, "loss": 0.6317, "step": 94200 }, { "epoch": 4.71, "grad_norm": 4.817531108856201, "learning_rate": 1.1529648241206032e-06, "loss": 0.6083, "step": 94300 }, { "epoch": 4.72, "grad_norm": 6.916783332824707, "learning_rate": 1.1328643216080403e-06, "loss": 0.6619, "step": 94400 }, { "epoch": 4.72, "grad_norm": 7.570366382598877, "learning_rate": 1.1127638190954775e-06, "loss": 0.6471, "step": 94500 }, { "epoch": 4.73, "grad_norm": 8.70361328125, "learning_rate": 1.0926633165829146e-06, "loss": 0.6483, "step": 94600 }, { "epoch": 4.74, "grad_norm": 9.341569900512695, "learning_rate": 1.072562814070352e-06, "loss": 0.6194, "step": 94700 }, { "epoch": 4.74, "grad_norm": 4.283209800720215, "learning_rate": 1.052462311557789e-06, "loss": 0.6111, "step": 94800 }, { "epoch": 4.75, "grad_norm": 8.134038925170898, "learning_rate": 1.0323618090452262e-06, "loss": 0.632, "step": 94900 }, { "epoch": 4.75, "grad_norm": 8.605172157287598, "learning_rate": 1.0122613065326633e-06, "loss": 0.6341, "step": 95000 }, { "epoch": 4.75, "grad_norm": 8.067020416259766, "learning_rate": 9.921608040201006e-07, "loss": 0.6694, "step": 95100 }, { "epoch": 4.76, "grad_norm": 6.967876434326172, "learning_rate": 9.720603015075378e-07, "loss": 0.648, "step": 95200 }, { "epoch": 4.76, "grad_norm": 8.443940162658691, "learning_rate": 9.51959798994975e-07, "loss": 0.6174, "step": 95300 }, { "epoch": 4.77, "grad_norm": 8.791583061218262, "learning_rate": 9.318592964824122e-07, "loss": 0.6463, "step": 95400 }, { "epoch": 4.78, "grad_norm": 8.055484771728516, "learning_rate": 9.117587939698493e-07, "loss": 0.5966, "step": 95500 }, { "epoch": 4.78, "grad_norm": 5.009509563446045, "learning_rate": 8.916582914572865e-07, "loss": 0.6147, "step": 95600 }, { "epoch": 4.79, "grad_norm": 5.755350589752197, "learning_rate": 8.715577889447237e-07, "loss": 0.6101, "step": 95700 }, { "epoch": 4.79, "grad_norm": 8.774045944213867, "learning_rate": 8.514572864321608e-07, "loss": 0.6332, "step": 95800 }, { "epoch": 4.79, "grad_norm": 6.463279724121094, "learning_rate": 8.315577889447237e-07, "loss": 0.6705, "step": 95900 }, { "epoch": 4.8, "grad_norm": 5.299009323120117, "learning_rate": 8.114572864321608e-07, "loss": 0.6605, "step": 96000 }, { "epoch": 4.8, "grad_norm": 6.5152130126953125, "learning_rate": 7.91356783919598e-07, "loss": 0.6456, "step": 96100 }, { "epoch": 4.81, "grad_norm": 8.499478340148926, "learning_rate": 7.712562814070353e-07, "loss": 0.6454, "step": 96200 }, { "epoch": 4.81, "grad_norm": 8.317819595336914, "learning_rate": 7.511557788944725e-07, "loss": 0.5961, "step": 96300 }, { "epoch": 4.82, "grad_norm": 7.257504940032959, "learning_rate": 7.310552763819095e-07, "loss": 0.614, "step": 96400 }, { "epoch": 4.83, "grad_norm": 3.862578868865967, "learning_rate": 7.109547738693468e-07, "loss": 0.6388, "step": 96500 }, { "epoch": 4.83, "grad_norm": 8.748353958129883, "learning_rate": 6.90854271356784e-07, "loss": 0.6222, "step": 96600 }, { "epoch": 4.83, "grad_norm": 8.883009910583496, "learning_rate": 6.707537688442211e-07, "loss": 0.639, "step": 96700 }, { "epoch": 4.84, "grad_norm": 7.332880973815918, "learning_rate": 6.506532663316584e-07, "loss": 0.6341, "step": 96800 }, { "epoch": 4.84, "grad_norm": 7.421239852905273, "learning_rate": 6.305527638190956e-07, "loss": 0.6378, "step": 96900 }, { "epoch": 4.85, "grad_norm": 6.633522033691406, "learning_rate": 6.104522613065327e-07, "loss": 0.6587, "step": 97000 }, { "epoch": 4.86, "grad_norm": 6.347668170928955, "learning_rate": 5.903517587939699e-07, "loss": 0.6355, "step": 97100 }, { "epoch": 4.86, "grad_norm": 5.266615390777588, "learning_rate": 5.702512562814071e-07, "loss": 0.5976, "step": 97200 }, { "epoch": 4.87, "grad_norm": 5.0562286376953125, "learning_rate": 5.501507537688443e-07, "loss": 0.6426, "step": 97300 }, { "epoch": 4.87, "grad_norm": 9.852864265441895, "learning_rate": 5.300502512562814e-07, "loss": 0.6434, "step": 97400 }, { "epoch": 4.88, "grad_norm": 5.227302551269531, "learning_rate": 5.099497487437187e-07, "loss": 0.674, "step": 97500 }, { "epoch": 4.88, "grad_norm": 7.586268424987793, "learning_rate": 4.900502512562814e-07, "loss": 0.6826, "step": 97600 }, { "epoch": 4.88, "grad_norm": 7.512186527252197, "learning_rate": 4.699497487437187e-07, "loss": 0.6428, "step": 97700 }, { "epoch": 4.89, "grad_norm": 8.383907318115234, "learning_rate": 4.498492462311558e-07, "loss": 0.6215, "step": 97800 }, { "epoch": 4.89, "grad_norm": 6.214056968688965, "learning_rate": 4.29748743718593e-07, "loss": 0.6066, "step": 97900 }, { "epoch": 4.9, "grad_norm": 8.587347030639648, "learning_rate": 4.096482412060302e-07, "loss": 0.6213, "step": 98000 }, { "epoch": 4.91, "grad_norm": 14.060787200927734, "learning_rate": 3.8954773869346735e-07, "loss": 0.6151, "step": 98100 }, { "epoch": 4.91, "grad_norm": 11.65833568572998, "learning_rate": 3.694472361809046e-07, "loss": 0.6226, "step": 98200 }, { "epoch": 4.92, "grad_norm": 5.729846477508545, "learning_rate": 3.4934673366834176e-07, "loss": 0.6265, "step": 98300 }, { "epoch": 4.92, "grad_norm": 5.596776485443115, "learning_rate": 3.292462311557789e-07, "loss": 0.6048, "step": 98400 }, { "epoch": 4.92, "grad_norm": 5.834877967834473, "learning_rate": 3.091457286432161e-07, "loss": 0.6358, "step": 98500 }, { "epoch": 4.93, "grad_norm": 7.830298900604248, "learning_rate": 2.890452261306533e-07, "loss": 0.6381, "step": 98600 }, { "epoch": 4.94, "grad_norm": 7.147890567779541, "learning_rate": 2.689447236180905e-07, "loss": 0.6428, "step": 98700 }, { "epoch": 4.94, "grad_norm": 5.18765926361084, "learning_rate": 2.4884422110552766e-07, "loss": 0.6098, "step": 98800 }, { "epoch": 4.95, "grad_norm": 7.276676654815674, "learning_rate": 2.2874371859296484e-07, "loss": 0.6329, "step": 98900 }, { "epoch": 4.95, "grad_norm": 7.58540678024292, "learning_rate": 2.0864321608040202e-07, "loss": 0.6095, "step": 99000 }, { "epoch": 4.96, "grad_norm": 5.402534008026123, "learning_rate": 1.8854271356783923e-07, "loss": 0.605, "step": 99100 }, { "epoch": 4.96, "grad_norm": 7.289499282836914, "learning_rate": 1.684422110552764e-07, "loss": 0.6694, "step": 99200 }, { "epoch": 4.96, "grad_norm": 7.618215560913086, "learning_rate": 1.483417085427136e-07, "loss": 0.6313, "step": 99300 }, { "epoch": 4.97, "grad_norm": 7.560898780822754, "learning_rate": 1.2824120603015077e-07, "loss": 0.6073, "step": 99400 }, { "epoch": 4.97, "grad_norm": 5.637300968170166, "learning_rate": 1.0834170854271359e-07, "loss": 0.6211, "step": 99500 }, { "epoch": 4.98, "grad_norm": 8.691441535949707, "learning_rate": 8.824120603015076e-08, "loss": 0.6085, "step": 99600 }, { "epoch": 4.99, "grad_norm": 4.510754585266113, "learning_rate": 6.814070351758795e-08, "loss": 0.6193, "step": 99700 }, { "epoch": 4.99, "grad_norm": 7.4050703048706055, "learning_rate": 4.804020100502513e-08, "loss": 0.6642, "step": 99800 }, { "epoch": 5.0, "grad_norm": 9.641931533813477, "learning_rate": 2.7939698492462312e-08, "loss": 0.6304, "step": 99900 }, { "epoch": 5.0, "grad_norm": 7.846133232116699, "learning_rate": 7.839195979899499e-09, "loss": 0.6181, "step": 100000 }, { "epoch": 5.0, "eval_loss": 0.9481298923492432, "eval_runtime": 21.6157, "eval_samples_per_second": 46.263, "eval_steps_per_second": 5.783, "step": 100000 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "total_flos": 1.1800273747968e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }