diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7101 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 10000, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 12.391390800476074, + "learning_rate": 3.920000000000001e-06, + "loss": 1.8027, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 7.110462665557861, + "learning_rate": 7.92e-06, + "loss": 1.6358, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 10.526795387268066, + "learning_rate": 1.1920000000000001e-05, + "loss": 1.603, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 9.175031661987305, + "learning_rate": 1.5920000000000003e-05, + "loss": 1.6249, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 4.193933486938477, + "learning_rate": 1.9920000000000002e-05, + "loss": 1.6364, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 8.9299955368042, + "learning_rate": 1.998030150753769e-05, + "loss": 1.6265, + "step": 600 + }, + { + "epoch": 0.04, + "grad_norm": 11.564770698547363, + "learning_rate": 1.996020100502513e-05, + "loss": 1.5935, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 9.529921531677246, + "learning_rate": 1.9940100502512564e-05, + "loss": 1.5959, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 5.335429668426514, + "learning_rate": 1.9920000000000002e-05, + "loss": 1.6342, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 8.102309226989746, + "learning_rate": 1.9899899497487437e-05, + "loss": 1.572, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 5.742166042327881, + "learning_rate": 1.987979899497488e-05, + "loss": 1.5645, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 5.3909735679626465, + "learning_rate": 1.9859698492462313e-05, + "loss": 1.547, + "step": 1200 + }, + { + "epoch": 0.07, + "grad_norm": 6.765148639678955, + "learning_rate": 1.983959798994975e-05, + "loss": 1.5399, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 6.0268378257751465, + "learning_rate": 1.9819497487437185e-05, + "loss": 1.4985, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 7.305541515350342, + "learning_rate": 1.9799396984924623e-05, + "loss": 1.5076, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 8.56618595123291, + "learning_rate": 1.977929648241206e-05, + "loss": 1.52, + "step": 1600 + }, + { + "epoch": 0.09, + "grad_norm": 5.847652435302734, + "learning_rate": 1.97591959798995e-05, + "loss": 1.4976, + "step": 1700 + }, + { + "epoch": 0.09, + "grad_norm": 6.940663814544678, + "learning_rate": 1.9739095477386937e-05, + "loss": 1.4983, + "step": 1800 + }, + { + "epoch": 0.1, + "grad_norm": 5.06433629989624, + "learning_rate": 1.9718994974874372e-05, + "loss": 1.4951, + "step": 1900 + }, + { + "epoch": 0.1, + "grad_norm": 5.1144022941589355, + "learning_rate": 1.969889447236181e-05, + "loss": 1.5256, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 6.515092849731445, + "learning_rate": 1.9678793969849248e-05, + "loss": 1.4677, + "step": 2100 + }, + { + "epoch": 0.11, + "grad_norm": 5.787613868713379, + "learning_rate": 1.9658693467336686e-05, + "loss": 1.4841, + "step": 2200 + }, + { + "epoch": 0.12, + "grad_norm": 7.798993110656738, + "learning_rate": 1.963859296482412e-05, + "loss": 1.4941, + "step": 2300 + }, + { + "epoch": 0.12, + "grad_norm": 4.808990955352783, + "learning_rate": 1.9618492462311562e-05, + "loss": 1.4775, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 6.113214015960693, + "learning_rate": 1.9598391959798996e-05, + "loss": 1.4757, + "step": 2500 + }, + { + "epoch": 0.13, + "grad_norm": 6.038852214813232, + "learning_rate": 1.9578291457286434e-05, + "loss": 1.4413, + "step": 2600 + }, + { + "epoch": 0.14, + "grad_norm": 7.736110687255859, + "learning_rate": 1.955819095477387e-05, + "loss": 1.5001, + "step": 2700 + }, + { + "epoch": 0.14, + "grad_norm": 6.173422336578369, + "learning_rate": 1.953809045226131e-05, + "loss": 1.4183, + "step": 2800 + }, + { + "epoch": 0.14, + "grad_norm": 5.368058681488037, + "learning_rate": 1.9517989949748745e-05, + "loss": 1.4877, + "step": 2900 + }, + { + "epoch": 0.15, + "grad_norm": 5.35443639755249, + "learning_rate": 1.9497889447236183e-05, + "loss": 1.4079, + "step": 3000 + }, + { + "epoch": 0.15, + "grad_norm": 8.716644287109375, + "learning_rate": 1.9477788944723618e-05, + "loss": 1.4386, + "step": 3100 + }, + { + "epoch": 0.16, + "grad_norm": 5.639494895935059, + "learning_rate": 1.945768844221106e-05, + "loss": 1.4524, + "step": 3200 + }, + { + "epoch": 0.17, + "grad_norm": 3.3629064559936523, + "learning_rate": 1.9437587939698493e-05, + "loss": 1.4218, + "step": 3300 + }, + { + "epoch": 0.17, + "grad_norm": 4.7631402015686035, + "learning_rate": 1.941748743718593e-05, + "loss": 1.4357, + "step": 3400 + }, + { + "epoch": 0.17, + "grad_norm": 6.286344528198242, + "learning_rate": 1.939738693467337e-05, + "loss": 1.4025, + "step": 3500 + }, + { + "epoch": 0.18, + "grad_norm": 4.501611232757568, + "learning_rate": 1.9377286432160804e-05, + "loss": 1.4002, + "step": 3600 + }, + { + "epoch": 0.18, + "grad_norm": 6.302520275115967, + "learning_rate": 1.9357185929648242e-05, + "loss": 1.4128, + "step": 3700 + }, + { + "epoch": 0.19, + "grad_norm": 6.156075477600098, + "learning_rate": 1.933708542713568e-05, + "loss": 1.4136, + "step": 3800 + }, + { + "epoch": 0.2, + "grad_norm": 5.4391913414001465, + "learning_rate": 1.9316984924623118e-05, + "loss": 1.4307, + "step": 3900 + }, + { + "epoch": 0.2, + "grad_norm": 6.862305641174316, + "learning_rate": 1.9296884422110552e-05, + "loss": 1.3605, + "step": 4000 + }, + { + "epoch": 0.2, + "grad_norm": 5.392678737640381, + "learning_rate": 1.9276783919597994e-05, + "loss": 1.4059, + "step": 4100 + }, + { + "epoch": 0.21, + "grad_norm": 5.686226844787598, + "learning_rate": 1.925668341708543e-05, + "loss": 1.3474, + "step": 4200 + }, + { + "epoch": 0.21, + "grad_norm": 4.506126403808594, + "learning_rate": 1.9236582914572866e-05, + "loss": 1.3708, + "step": 4300 + }, + { + "epoch": 0.22, + "grad_norm": 7.255539894104004, + "learning_rate": 1.92164824120603e-05, + "loss": 1.3803, + "step": 4400 + }, + { + "epoch": 0.23, + "grad_norm": 6.463212966918945, + "learning_rate": 1.9196381909547742e-05, + "loss": 1.3371, + "step": 4500 + }, + { + "epoch": 0.23, + "grad_norm": 7.1397294998168945, + "learning_rate": 1.9176281407035177e-05, + "loss": 1.3787, + "step": 4600 + }, + { + "epoch": 0.23, + "grad_norm": 7.188973426818848, + "learning_rate": 1.9156180904522615e-05, + "loss": 1.3699, + "step": 4700 + }, + { + "epoch": 0.24, + "grad_norm": 4.161841869354248, + "learning_rate": 1.913608040201005e-05, + "loss": 1.3819, + "step": 4800 + }, + { + "epoch": 0.24, + "grad_norm": 3.420564889907837, + "learning_rate": 1.911597989949749e-05, + "loss": 1.3719, + "step": 4900 + }, + { + "epoch": 0.25, + "grad_norm": 5.769357681274414, + "learning_rate": 1.9095879396984925e-05, + "loss": 1.366, + "step": 5000 + }, + { + "epoch": 0.26, + "grad_norm": 6.374185562133789, + "learning_rate": 1.9075778894472363e-05, + "loss": 1.3377, + "step": 5100 + }, + { + "epoch": 0.26, + "grad_norm": 6.3521575927734375, + "learning_rate": 1.90556783919598e-05, + "loss": 1.3632, + "step": 5200 + }, + { + "epoch": 0.27, + "grad_norm": 4.51761531829834, + "learning_rate": 1.903557788944724e-05, + "loss": 1.3505, + "step": 5300 + }, + { + "epoch": 0.27, + "grad_norm": 6.074390411376953, + "learning_rate": 1.9015477386934674e-05, + "loss": 1.3644, + "step": 5400 + }, + { + "epoch": 0.28, + "grad_norm": 4.369632244110107, + "learning_rate": 1.8995376884422112e-05, + "loss": 1.3807, + "step": 5500 + }, + { + "epoch": 0.28, + "grad_norm": 7.657780170440674, + "learning_rate": 1.897527638190955e-05, + "loss": 1.3125, + "step": 5600 + }, + { + "epoch": 0.28, + "grad_norm": 9.048200607299805, + "learning_rate": 1.8955175879396988e-05, + "loss": 1.3216, + "step": 5700 + }, + { + "epoch": 0.29, + "grad_norm": 5.997036933898926, + "learning_rate": 1.8935075376884426e-05, + "loss": 1.3262, + "step": 5800 + }, + { + "epoch": 0.29, + "grad_norm": 4.751107692718506, + "learning_rate": 1.891497487437186e-05, + "loss": 1.3566, + "step": 5900 + }, + { + "epoch": 0.3, + "grad_norm": 5.662681579589844, + "learning_rate": 1.88948743718593e-05, + "loss": 1.3645, + "step": 6000 + }, + { + "epoch": 0.3, + "grad_norm": 5.755290508270264, + "learning_rate": 1.887497487437186e-05, + "loss": 1.2714, + "step": 6100 + }, + { + "epoch": 0.31, + "grad_norm": 5.199550151824951, + "learning_rate": 1.88548743718593e-05, + "loss": 1.3427, + "step": 6200 + }, + { + "epoch": 0.32, + "grad_norm": 7.531371116638184, + "learning_rate": 1.8834773869346733e-05, + "loss": 1.3198, + "step": 6300 + }, + { + "epoch": 0.32, + "grad_norm": 4.267923831939697, + "learning_rate": 1.881467336683417e-05, + "loss": 1.334, + "step": 6400 + }, + { + "epoch": 0.33, + "grad_norm": 5.429295063018799, + "learning_rate": 1.879457286432161e-05, + "loss": 1.2949, + "step": 6500 + }, + { + "epoch": 0.33, + "grad_norm": 4.842006206512451, + "learning_rate": 1.8774472361809047e-05, + "loss": 1.3123, + "step": 6600 + }, + { + "epoch": 0.34, + "grad_norm": 4.693381309509277, + "learning_rate": 1.8754371859296482e-05, + "loss": 1.3218, + "step": 6700 + }, + { + "epoch": 0.34, + "grad_norm": 3.555487632751465, + "learning_rate": 1.8734271356783923e-05, + "loss": 1.3077, + "step": 6800 + }, + { + "epoch": 0.34, + "grad_norm": 7.314678192138672, + "learning_rate": 1.8714170854271358e-05, + "loss": 1.2855, + "step": 6900 + }, + { + "epoch": 0.35, + "grad_norm": 6.160294532775879, + "learning_rate": 1.8694070351758796e-05, + "loss": 1.2901, + "step": 7000 + }, + { + "epoch": 0.35, + "grad_norm": 7.399959087371826, + "learning_rate": 1.867396984924623e-05, + "loss": 1.264, + "step": 7100 + }, + { + "epoch": 0.36, + "grad_norm": 4.204007625579834, + "learning_rate": 1.8653869346733672e-05, + "loss": 1.323, + "step": 7200 + }, + { + "epoch": 0.36, + "grad_norm": 5.531479358673096, + "learning_rate": 1.8633768844221106e-05, + "loss": 1.3211, + "step": 7300 + }, + { + "epoch": 0.37, + "grad_norm": 4.645538806915283, + "learning_rate": 1.8613668341708544e-05, + "loss": 1.2941, + "step": 7400 + }, + { + "epoch": 0.38, + "grad_norm": 6.326472282409668, + "learning_rate": 1.8593567839195982e-05, + "loss": 1.3025, + "step": 7500 + }, + { + "epoch": 0.38, + "grad_norm": 6.338307857513428, + "learning_rate": 1.857346733668342e-05, + "loss": 1.2924, + "step": 7600 + }, + { + "epoch": 0.39, + "grad_norm": 7.802080154418945, + "learning_rate": 1.8553366834170855e-05, + "loss": 1.3061, + "step": 7700 + }, + { + "epoch": 0.39, + "grad_norm": 4.98875093460083, + "learning_rate": 1.8533266331658293e-05, + "loss": 1.321, + "step": 7800 + }, + { + "epoch": 0.4, + "grad_norm": 5.888318061828613, + "learning_rate": 1.851316582914573e-05, + "loss": 1.2746, + "step": 7900 + }, + { + "epoch": 0.4, + "grad_norm": 6.636387825012207, + "learning_rate": 1.849306532663317e-05, + "loss": 1.2653, + "step": 8000 + }, + { + "epoch": 0.41, + "grad_norm": 6.1142449378967285, + "learning_rate": 1.8473165829145728e-05, + "loss": 1.2347, + "step": 8100 + }, + { + "epoch": 0.41, + "grad_norm": 5.41117525100708, + "learning_rate": 1.845306532663317e-05, + "loss": 1.3062, + "step": 8200 + }, + { + "epoch": 0.41, + "grad_norm": 5.025302886962891, + "learning_rate": 1.8432964824120604e-05, + "loss": 1.3162, + "step": 8300 + }, + { + "epoch": 0.42, + "grad_norm": 7.1088972091674805, + "learning_rate": 1.8412864321608042e-05, + "loss": 1.2573, + "step": 8400 + }, + { + "epoch": 0.42, + "grad_norm": 5.86447811126709, + "learning_rate": 1.839276381909548e-05, + "loss": 1.2855, + "step": 8500 + }, + { + "epoch": 0.43, + "grad_norm": 4.323820114135742, + "learning_rate": 1.8372663316582918e-05, + "loss": 1.2272, + "step": 8600 + }, + { + "epoch": 0.43, + "grad_norm": 7.335355758666992, + "learning_rate": 1.8352562814070352e-05, + "loss": 1.2718, + "step": 8700 + }, + { + "epoch": 0.44, + "grad_norm": 5.308874130249023, + "learning_rate": 1.833246231155779e-05, + "loss": 1.2727, + "step": 8800 + }, + { + "epoch": 0.45, + "grad_norm": 3.919790506362915, + "learning_rate": 1.8312361809045228e-05, + "loss": 1.28, + "step": 8900 + }, + { + "epoch": 0.45, + "grad_norm": 7.291688442230225, + "learning_rate": 1.8292261306532663e-05, + "loss": 1.2768, + "step": 9000 + }, + { + "epoch": 0.46, + "grad_norm": 5.098793029785156, + "learning_rate": 1.8272160804020104e-05, + "loss": 1.2441, + "step": 9100 + }, + { + "epoch": 0.46, + "grad_norm": 5.242636203765869, + "learning_rate": 1.825206030150754e-05, + "loss": 1.2534, + "step": 9200 + }, + { + "epoch": 0.47, + "grad_norm": 5.310051918029785, + "learning_rate": 1.8231959798994977e-05, + "loss": 1.2878, + "step": 9300 + }, + { + "epoch": 0.47, + "grad_norm": 6.058734893798828, + "learning_rate": 1.821185929648241e-05, + "loss": 1.2964, + "step": 9400 + }, + { + "epoch": 0.47, + "grad_norm": 6.912698745727539, + "learning_rate": 1.8191758793969853e-05, + "loss": 1.2511, + "step": 9500 + }, + { + "epoch": 0.48, + "grad_norm": 6.428102016448975, + "learning_rate": 1.8171658291457287e-05, + "loss": 1.2605, + "step": 9600 + }, + { + "epoch": 0.48, + "grad_norm": 5.642975807189941, + "learning_rate": 1.8151557788944725e-05, + "loss": 1.264, + "step": 9700 + }, + { + "epoch": 0.49, + "grad_norm": 6.23274040222168, + "learning_rate": 1.813145728643216e-05, + "loss": 1.2583, + "step": 9800 + }, + { + "epoch": 0.49, + "grad_norm": 7.3280792236328125, + "learning_rate": 1.81113567839196e-05, + "loss": 1.2324, + "step": 9900 + }, + { + "epoch": 0.5, + "grad_norm": 6.048460483551025, + "learning_rate": 1.8091256281407036e-05, + "loss": 1.2477, + "step": 10000 + }, + { + "epoch": 0.5, + "eval_loss": 1.2569069862365723, + "eval_runtime": 21.5797, + "eval_samples_per_second": 46.34, + "eval_steps_per_second": 5.792, + "step": 10000 + }, + { + "epoch": 0.51, + "grad_norm": 5.294989109039307, + "learning_rate": 1.80713567839196e-05, + "loss": 1.3038, + "step": 10100 + }, + { + "epoch": 0.51, + "grad_norm": 6.7187981605529785, + "learning_rate": 1.8051256281407036e-05, + "loss": 1.2584, + "step": 10200 + }, + { + "epoch": 0.52, + "grad_norm": 7.11021089553833, + "learning_rate": 1.8031155778894474e-05, + "loss": 1.2612, + "step": 10300 + }, + { + "epoch": 0.52, + "grad_norm": 6.111474990844727, + "learning_rate": 1.801105527638191e-05, + "loss": 1.2638, + "step": 10400 + }, + { + "epoch": 0.53, + "grad_norm": 6.04983377456665, + "learning_rate": 1.799095477386935e-05, + "loss": 1.2381, + "step": 10500 + }, + { + "epoch": 0.53, + "grad_norm": 5.682928562164307, + "learning_rate": 1.7970854271356785e-05, + "loss": 1.233, + "step": 10600 + }, + { + "epoch": 0.54, + "grad_norm": 6.028292179107666, + "learning_rate": 1.7950753768844223e-05, + "loss": 1.2572, + "step": 10700 + }, + { + "epoch": 0.54, + "grad_norm": 4.738650798797607, + "learning_rate": 1.793065326633166e-05, + "loss": 1.2125, + "step": 10800 + }, + { + "epoch": 0.55, + "grad_norm": 5.227931976318359, + "learning_rate": 1.7910753768844223e-05, + "loss": 1.2862, + "step": 10900 + }, + { + "epoch": 0.55, + "grad_norm": 6.476836204528809, + "learning_rate": 1.7890653266331658e-05, + "loss": 1.243, + "step": 11000 + }, + { + "epoch": 0.56, + "grad_norm": 4.261963844299316, + "learning_rate": 1.78705527638191e-05, + "loss": 1.2118, + "step": 11100 + }, + { + "epoch": 0.56, + "grad_norm": 6.414599418640137, + "learning_rate": 1.7850452261306534e-05, + "loss": 1.222, + "step": 11200 + }, + { + "epoch": 0.56, + "grad_norm": 5.642942905426025, + "learning_rate": 1.783035175879397e-05, + "loss": 1.1809, + "step": 11300 + }, + { + "epoch": 0.57, + "grad_norm": 4.094428539276123, + "learning_rate": 1.781025125628141e-05, + "loss": 1.2362, + "step": 11400 + }, + { + "epoch": 0.57, + "grad_norm": 5.5772881507873535, + "learning_rate": 1.7790150753768847e-05, + "loss": 1.2005, + "step": 11500 + }, + { + "epoch": 0.58, + "grad_norm": 4.420604705810547, + "learning_rate": 1.7770050251256282e-05, + "loss": 1.2138, + "step": 11600 + }, + { + "epoch": 0.58, + "grad_norm": 5.298806667327881, + "learning_rate": 1.774994974874372e-05, + "loss": 1.1693, + "step": 11700 + }, + { + "epoch": 0.59, + "grad_norm": 5.862612247467041, + "learning_rate": 1.7729849246231158e-05, + "loss": 1.1728, + "step": 11800 + }, + { + "epoch": 0.59, + "grad_norm": 3.835301637649536, + "learning_rate": 1.7709748743718593e-05, + "loss": 1.2159, + "step": 11900 + }, + { + "epoch": 0.6, + "grad_norm": 5.67401123046875, + "learning_rate": 1.768964824120603e-05, + "loss": 1.2393, + "step": 12000 + }, + { + "epoch": 0.6, + "grad_norm": 5.424498558044434, + "learning_rate": 1.766954773869347e-05, + "loss": 1.2255, + "step": 12100 + }, + { + "epoch": 0.61, + "grad_norm": 5.532503604888916, + "learning_rate": 1.7649447236180907e-05, + "loss": 1.2024, + "step": 12200 + }, + { + "epoch": 0.61, + "grad_norm": 5.404232501983643, + "learning_rate": 1.762934673366834e-05, + "loss": 1.2202, + "step": 12300 + }, + { + "epoch": 0.62, + "grad_norm": 3.9564428329467773, + "learning_rate": 1.7609246231155782e-05, + "loss": 1.1655, + "step": 12400 + }, + { + "epoch": 0.62, + "grad_norm": 3.2090141773223877, + "learning_rate": 1.7589145728643217e-05, + "loss": 1.1563, + "step": 12500 + }, + { + "epoch": 0.63, + "grad_norm": 6.341458320617676, + "learning_rate": 1.7569045226130655e-05, + "loss": 1.1982, + "step": 12600 + }, + { + "epoch": 0.64, + "grad_norm": 7.190246105194092, + "learning_rate": 1.754894472361809e-05, + "loss": 1.1817, + "step": 12700 + }, + { + "epoch": 0.64, + "grad_norm": 6.108299255371094, + "learning_rate": 1.752884422110553e-05, + "loss": 1.2123, + "step": 12800 + }, + { + "epoch": 0.65, + "grad_norm": 5.769379615783691, + "learning_rate": 1.7508743718592966e-05, + "loss": 1.1964, + "step": 12900 + }, + { + "epoch": 0.65, + "grad_norm": 5.177648067474365, + "learning_rate": 1.7488643216080404e-05, + "loss": 1.2103, + "step": 13000 + }, + { + "epoch": 0.66, + "grad_norm": 5.531684875488281, + "learning_rate": 1.7468542713567838e-05, + "loss": 1.1801, + "step": 13100 + }, + { + "epoch": 0.66, + "grad_norm": 5.700603008270264, + "learning_rate": 1.744844221105528e-05, + "loss": 1.1943, + "step": 13200 + }, + { + "epoch": 0.67, + "grad_norm": 9.25114917755127, + "learning_rate": 1.7428341708542714e-05, + "loss": 1.2286, + "step": 13300 + }, + { + "epoch": 0.67, + "grad_norm": 4.238541126251221, + "learning_rate": 1.7408241206030152e-05, + "loss": 1.1869, + "step": 13400 + }, + { + "epoch": 0.68, + "grad_norm": 5.6147260665893555, + "learning_rate": 1.738814070351759e-05, + "loss": 1.1854, + "step": 13500 + }, + { + "epoch": 0.68, + "grad_norm": 4.879734039306641, + "learning_rate": 1.7368040201005028e-05, + "loss": 1.1941, + "step": 13600 + }, + { + "epoch": 0.69, + "grad_norm": 3.612379312515259, + "learning_rate": 1.7347939698492463e-05, + "loss": 1.1649, + "step": 13700 + }, + { + "epoch": 0.69, + "grad_norm": 4.583663463592529, + "learning_rate": 1.73278391959799e-05, + "loss": 1.1796, + "step": 13800 + }, + { + "epoch": 0.69, + "grad_norm": 4.3080339431762695, + "learning_rate": 1.7307939698492463e-05, + "loss": 1.2092, + "step": 13900 + }, + { + "epoch": 0.7, + "grad_norm": 5.9151506423950195, + "learning_rate": 1.72878391959799e-05, + "loss": 1.1809, + "step": 14000 + }, + { + "epoch": 0.7, + "grad_norm": 5.167910575866699, + "learning_rate": 1.726773869346734e-05, + "loss": 1.2063, + "step": 14100 + }, + { + "epoch": 0.71, + "grad_norm": 7.372837543487549, + "learning_rate": 1.7247638190954777e-05, + "loss": 1.147, + "step": 14200 + }, + { + "epoch": 0.71, + "grad_norm": 3.6992413997650146, + "learning_rate": 1.722753768844221e-05, + "loss": 1.2312, + "step": 14300 + }, + { + "epoch": 0.72, + "grad_norm": 6.654348850250244, + "learning_rate": 1.720743718592965e-05, + "loss": 1.1956, + "step": 14400 + }, + { + "epoch": 0.72, + "grad_norm": 4.683749675750732, + "learning_rate": 1.7187336683417087e-05, + "loss": 1.1598, + "step": 14500 + }, + { + "epoch": 0.73, + "grad_norm": 5.769094467163086, + "learning_rate": 1.7167236180904522e-05, + "loss": 1.1387, + "step": 14600 + }, + { + "epoch": 0.73, + "grad_norm": 7.586219310760498, + "learning_rate": 1.7147135678391963e-05, + "loss": 1.1994, + "step": 14700 + }, + { + "epoch": 0.74, + "grad_norm": 5.573954105377197, + "learning_rate": 1.7127035175879398e-05, + "loss": 1.1887, + "step": 14800 + }, + { + "epoch": 0.74, + "grad_norm": 6.4866251945495605, + "learning_rate": 1.7106934673366836e-05, + "loss": 1.1892, + "step": 14900 + }, + { + "epoch": 0.75, + "grad_norm": 4.954825401306152, + "learning_rate": 1.708683417085427e-05, + "loss": 1.1742, + "step": 15000 + }, + { + "epoch": 0.76, + "grad_norm": 3.952847480773926, + "learning_rate": 1.7066733668341712e-05, + "loss": 1.143, + "step": 15100 + }, + { + "epoch": 0.76, + "grad_norm": 5.170006275177002, + "learning_rate": 1.7046633165829146e-05, + "loss": 1.1881, + "step": 15200 + }, + { + "epoch": 0.77, + "grad_norm": 4.910400390625, + "learning_rate": 1.7026532663316584e-05, + "loss": 1.131, + "step": 15300 + }, + { + "epoch": 0.77, + "grad_norm": 4.728166580200195, + "learning_rate": 1.700643216080402e-05, + "loss": 1.1854, + "step": 15400 + }, + { + "epoch": 0.78, + "grad_norm": 6.516223430633545, + "learning_rate": 1.698633165829146e-05, + "loss": 1.2069, + "step": 15500 + }, + { + "epoch": 0.78, + "grad_norm": 5.914300918579102, + "learning_rate": 1.6966231155778895e-05, + "loss": 1.1663, + "step": 15600 + }, + { + "epoch": 0.79, + "grad_norm": 4.6894378662109375, + "learning_rate": 1.6946130653266333e-05, + "loss": 1.145, + "step": 15700 + }, + { + "epoch": 0.79, + "grad_norm": 4.994019031524658, + "learning_rate": 1.692603015075377e-05, + "loss": 1.1156, + "step": 15800 + }, + { + "epoch": 0.8, + "grad_norm": 5.994630813598633, + "learning_rate": 1.690592964824121e-05, + "loss": 1.1583, + "step": 15900 + }, + { + "epoch": 0.8, + "grad_norm": 6.7444562911987305, + "learning_rate": 1.6885829145728643e-05, + "loss": 1.1821, + "step": 16000 + }, + { + "epoch": 0.81, + "grad_norm": 5.461032867431641, + "learning_rate": 1.686572864321608e-05, + "loss": 1.1388, + "step": 16100 + }, + { + "epoch": 0.81, + "grad_norm": 5.0775251388549805, + "learning_rate": 1.684562814070352e-05, + "loss": 1.1576, + "step": 16200 + }, + { + "epoch": 0.81, + "grad_norm": 4.469027042388916, + "learning_rate": 1.6825527638190957e-05, + "loss": 1.1792, + "step": 16300 + }, + { + "epoch": 0.82, + "grad_norm": 6.780773639678955, + "learning_rate": 1.6805427135678395e-05, + "loss": 1.1441, + "step": 16400 + }, + { + "epoch": 0.82, + "grad_norm": 6.338268756866455, + "learning_rate": 1.678532663316583e-05, + "loss": 1.1087, + "step": 16500 + }, + { + "epoch": 0.83, + "grad_norm": 4.28759241104126, + "learning_rate": 1.6765226130653268e-05, + "loss": 1.1616, + "step": 16600 + }, + { + "epoch": 0.83, + "grad_norm": 4.656599998474121, + "learning_rate": 1.6745125628140706e-05, + "loss": 1.1086, + "step": 16700 + }, + { + "epoch": 0.84, + "grad_norm": 4.506341457366943, + "learning_rate": 1.6725025125628144e-05, + "loss": 1.1821, + "step": 16800 + }, + { + "epoch": 0.84, + "grad_norm": 5.074087142944336, + "learning_rate": 1.670492462311558e-05, + "loss": 1.1376, + "step": 16900 + }, + { + "epoch": 0.85, + "grad_norm": 4.427557468414307, + "learning_rate": 1.6684824120603016e-05, + "loss": 1.1608, + "step": 17000 + }, + { + "epoch": 0.85, + "grad_norm": 4.684313774108887, + "learning_rate": 1.666472361809045e-05, + "loss": 1.1374, + "step": 17100 + }, + { + "epoch": 0.86, + "grad_norm": 4.981125354766846, + "learning_rate": 1.6644623115577892e-05, + "loss": 1.1157, + "step": 17200 + }, + { + "epoch": 0.86, + "grad_norm": 6.36452579498291, + "learning_rate": 1.6624522613065327e-05, + "loss": 1.1547, + "step": 17300 + }, + { + "epoch": 0.87, + "grad_norm": 4.986701011657715, + "learning_rate": 1.6604422110552765e-05, + "loss": 1.147, + "step": 17400 + }, + { + "epoch": 0.88, + "grad_norm": 6.206230640411377, + "learning_rate": 1.6584321608040203e-05, + "loss": 1.1235, + "step": 17500 + }, + { + "epoch": 0.88, + "grad_norm": 5.597214221954346, + "learning_rate": 1.656422110552764e-05, + "loss": 1.1472, + "step": 17600 + }, + { + "epoch": 0.89, + "grad_norm": 5.753964424133301, + "learning_rate": 1.6544120603015076e-05, + "loss": 1.0838, + "step": 17700 + }, + { + "epoch": 0.89, + "grad_norm": 5.263125896453857, + "learning_rate": 1.6524020100502513e-05, + "loss": 1.1149, + "step": 17800 + }, + { + "epoch": 0.9, + "grad_norm": 2.9451704025268555, + "learning_rate": 1.6504120603015076e-05, + "loss": 1.1162, + "step": 17900 + }, + { + "epoch": 0.9, + "grad_norm": 6.694633960723877, + "learning_rate": 1.6484020100502514e-05, + "loss": 1.1268, + "step": 18000 + }, + { + "epoch": 0.91, + "grad_norm": 5.449553489685059, + "learning_rate": 1.6463919597989952e-05, + "loss": 1.1307, + "step": 18100 + }, + { + "epoch": 0.91, + "grad_norm": 5.502272129058838, + "learning_rate": 1.644381909547739e-05, + "loss": 1.1031, + "step": 18200 + }, + { + "epoch": 0.92, + "grad_norm": 6.899608612060547, + "learning_rate": 1.6423718592964824e-05, + "loss": 1.1389, + "step": 18300 + }, + { + "epoch": 0.92, + "grad_norm": 4.675032615661621, + "learning_rate": 1.6403618090452262e-05, + "loss": 1.1541, + "step": 18400 + }, + { + "epoch": 0.93, + "grad_norm": 7.353012561798096, + "learning_rate": 1.63835175879397e-05, + "loss": 1.1213, + "step": 18500 + }, + { + "epoch": 0.93, + "grad_norm": 4.253681659698486, + "learning_rate": 1.636341708542714e-05, + "loss": 1.1672, + "step": 18600 + }, + { + "epoch": 0.94, + "grad_norm": 6.5902018547058105, + "learning_rate": 1.6343316582914573e-05, + "loss": 1.1349, + "step": 18700 + }, + { + "epoch": 0.94, + "grad_norm": 5.40578556060791, + "learning_rate": 1.632321608040201e-05, + "loss": 1.1283, + "step": 18800 + }, + { + "epoch": 0.94, + "grad_norm": 3.9744160175323486, + "learning_rate": 1.630311557788945e-05, + "loss": 1.1463, + "step": 18900 + }, + { + "epoch": 0.95, + "grad_norm": 6.115358352661133, + "learning_rate": 1.6283015075376887e-05, + "loss": 1.1443, + "step": 19000 + }, + { + "epoch": 0.95, + "grad_norm": 2.9785940647125244, + "learning_rate": 1.6262914572864325e-05, + "loss": 1.1409, + "step": 19100 + }, + { + "epoch": 0.96, + "grad_norm": 5.200758934020996, + "learning_rate": 1.6243015075376887e-05, + "loss": 1.1629, + "step": 19200 + }, + { + "epoch": 0.96, + "grad_norm": 5.975739479064941, + "learning_rate": 1.6222914572864322e-05, + "loss": 1.083, + "step": 19300 + }, + { + "epoch": 0.97, + "grad_norm": 6.220870018005371, + "learning_rate": 1.620281407035176e-05, + "loss": 1.1305, + "step": 19400 + }, + { + "epoch": 0.97, + "grad_norm": 4.187997341156006, + "learning_rate": 1.6182713567839198e-05, + "loss": 1.1028, + "step": 19500 + }, + { + "epoch": 0.98, + "grad_norm": 5.540648937225342, + "learning_rate": 1.6162613065326636e-05, + "loss": 1.1176, + "step": 19600 + }, + { + "epoch": 0.98, + "grad_norm": 5.99765157699585, + "learning_rate": 1.6142512562814074e-05, + "loss": 1.0932, + "step": 19700 + }, + { + "epoch": 0.99, + "grad_norm": 4.647700786590576, + "learning_rate": 1.6122412060301508e-05, + "loss": 1.1294, + "step": 19800 + }, + { + "epoch": 0.99, + "grad_norm": 6.05048131942749, + "learning_rate": 1.6102311557788946e-05, + "loss": 1.0828, + "step": 19900 + }, + { + "epoch": 1.0, + "grad_norm": 4.912966251373291, + "learning_rate": 1.608221105527638e-05, + "loss": 1.0975, + "step": 20000 + }, + { + "epoch": 1.0, + "eval_loss": 1.132000207901001, + "eval_runtime": 21.5853, + "eval_samples_per_second": 46.328, + "eval_steps_per_second": 5.791, + "step": 20000 + }, + { + "epoch": 1.0, + "grad_norm": 5.5869340896606445, + "learning_rate": 1.6062110552763822e-05, + "loss": 1.1428, + "step": 20100 + }, + { + "epoch": 1.01, + "grad_norm": 4.5555739402771, + "learning_rate": 1.6042010050251257e-05, + "loss": 1.0939, + "step": 20200 + }, + { + "epoch": 1.01, + "grad_norm": 3.527172803878784, + "learning_rate": 1.6021909547738695e-05, + "loss": 1.1184, + "step": 20300 + }, + { + "epoch": 1.02, + "grad_norm": 2.7429285049438477, + "learning_rate": 1.600180904522613e-05, + "loss": 1.1028, + "step": 20400 + }, + { + "epoch": 1.02, + "grad_norm": 3.6536190509796143, + "learning_rate": 1.598170854271357e-05, + "loss": 1.0954, + "step": 20500 + }, + { + "epoch": 1.03, + "grad_norm": 4.48521089553833, + "learning_rate": 1.5961608040201005e-05, + "loss": 1.1001, + "step": 20600 + }, + { + "epoch": 1.03, + "grad_norm": 7.937503814697266, + "learning_rate": 1.5941507537688443e-05, + "loss": 1.0676, + "step": 20700 + }, + { + "epoch": 1.04, + "grad_norm": 7.802252769470215, + "learning_rate": 1.592140703517588e-05, + "loss": 1.1007, + "step": 20800 + }, + { + "epoch": 1.04, + "grad_norm": 6.141603469848633, + "learning_rate": 1.590130653266332e-05, + "loss": 1.0749, + "step": 20900 + }, + { + "epoch": 1.05, + "grad_norm": 5.166286945343018, + "learning_rate": 1.5881206030150754e-05, + "loss": 1.0704, + "step": 21000 + }, + { + "epoch": 1.05, + "grad_norm": 5.407045364379883, + "learning_rate": 1.5861105527638192e-05, + "loss": 1.0852, + "step": 21100 + }, + { + "epoch": 1.06, + "grad_norm": 5.4536967277526855, + "learning_rate": 1.584100502512563e-05, + "loss": 1.1152, + "step": 21200 + }, + { + "epoch": 1.06, + "grad_norm": 5.464064121246338, + "learning_rate": 1.5820904522613068e-05, + "loss": 1.0546, + "step": 21300 + }, + { + "epoch": 1.07, + "grad_norm": 3.853875160217285, + "learning_rate": 1.580100502512563e-05, + "loss": 1.0858, + "step": 21400 + }, + { + "epoch": 1.07, + "grad_norm": 4.8497443199157715, + "learning_rate": 1.5780904522613068e-05, + "loss": 1.0973, + "step": 21500 + }, + { + "epoch": 1.08, + "grad_norm": 4.255434513092041, + "learning_rate": 1.5760804020100503e-05, + "loss": 1.0872, + "step": 21600 + }, + { + "epoch": 1.08, + "grad_norm": 4.134657382965088, + "learning_rate": 1.574070351758794e-05, + "loss": 1.1127, + "step": 21700 + }, + { + "epoch": 1.09, + "grad_norm": 9.425840377807617, + "learning_rate": 1.572060301507538e-05, + "loss": 1.1147, + "step": 21800 + }, + { + "epoch": 1.09, + "grad_norm": 5.42075777053833, + "learning_rate": 1.5700502512562817e-05, + "loss": 1.0719, + "step": 21900 + }, + { + "epoch": 1.1, + "grad_norm": 5.076992988586426, + "learning_rate": 1.5680402010050255e-05, + "loss": 1.095, + "step": 22000 + }, + { + "epoch": 1.1, + "grad_norm": 5.668195724487305, + "learning_rate": 1.566030150753769e-05, + "loss": 1.0799, + "step": 22100 + }, + { + "epoch": 1.11, + "grad_norm": 5.9342474937438965, + "learning_rate": 1.5640201005025127e-05, + "loss": 1.0965, + "step": 22200 + }, + { + "epoch": 1.11, + "grad_norm": 5.112601280212402, + "learning_rate": 1.5620100502512565e-05, + "loss": 1.0951, + "step": 22300 + }, + { + "epoch": 1.12, + "grad_norm": 10.206339836120605, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.0837, + "step": 22400 + }, + { + "epoch": 1.12, + "grad_norm": 3.8015480041503906, + "learning_rate": 1.5579899497487438e-05, + "loss": 1.0871, + "step": 22500 + }, + { + "epoch": 1.13, + "grad_norm": 4.524369239807129, + "learning_rate": 1.5559798994974876e-05, + "loss": 1.1263, + "step": 22600 + }, + { + "epoch": 1.14, + "grad_norm": 5.1671671867370605, + "learning_rate": 1.553969849246231e-05, + "loss": 1.085, + "step": 22700 + }, + { + "epoch": 1.14, + "grad_norm": 4.96006965637207, + "learning_rate": 1.551959798994975e-05, + "loss": 1.0893, + "step": 22800 + }, + { + "epoch": 1.15, + "grad_norm": 6.482675075531006, + "learning_rate": 1.5499497487437186e-05, + "loss": 1.0667, + "step": 22900 + }, + { + "epoch": 1.15, + "grad_norm": 4.591585636138916, + "learning_rate": 1.5479396984924624e-05, + "loss": 1.0861, + "step": 23000 + }, + { + "epoch": 1.16, + "grad_norm": 4.026520729064941, + "learning_rate": 1.5459296482412062e-05, + "loss": 1.0772, + "step": 23100 + }, + { + "epoch": 1.16, + "grad_norm": 5.972117900848389, + "learning_rate": 1.54391959798995e-05, + "loss": 1.0818, + "step": 23200 + }, + { + "epoch": 1.17, + "grad_norm": 4.737887382507324, + "learning_rate": 1.5419095477386935e-05, + "loss": 1.0752, + "step": 23300 + }, + { + "epoch": 1.17, + "grad_norm": 4.748262882232666, + "learning_rate": 1.5398994974874373e-05, + "loss": 1.0803, + "step": 23400 + }, + { + "epoch": 1.18, + "grad_norm": 4.94175386428833, + "learning_rate": 1.537889447236181e-05, + "loss": 1.0754, + "step": 23500 + }, + { + "epoch": 1.18, + "grad_norm": 4.3259172439575195, + "learning_rate": 1.535879396984925e-05, + "loss": 1.0463, + "step": 23600 + }, + { + "epoch": 1.19, + "grad_norm": 5.240546703338623, + "learning_rate": 1.5338693467336687e-05, + "loss": 1.0547, + "step": 23700 + }, + { + "epoch": 1.19, + "grad_norm": 6.120886325836182, + "learning_rate": 1.531859296482412e-05, + "loss": 1.0861, + "step": 23800 + }, + { + "epoch": 1.2, + "grad_norm": 5.634921550750732, + "learning_rate": 1.529849246231156e-05, + "loss": 1.0722, + "step": 23900 + }, + { + "epoch": 1.2, + "grad_norm": 5.39201021194458, + "learning_rate": 1.5278391959798997e-05, + "loss": 1.07, + "step": 24000 + }, + { + "epoch": 1.21, + "grad_norm": 6.85221004486084, + "learning_rate": 1.5258291457286433e-05, + "loss": 1.0578, + "step": 24100 + }, + { + "epoch": 1.21, + "grad_norm": 4.522882461547852, + "learning_rate": 1.523819095477387e-05, + "loss": 1.0895, + "step": 24200 + }, + { + "epoch": 1.22, + "grad_norm": 4.020057201385498, + "learning_rate": 1.5218090452261308e-05, + "loss": 1.0377, + "step": 24300 + }, + { + "epoch": 1.22, + "grad_norm": 4.188474655151367, + "learning_rate": 1.5197989949748746e-05, + "loss": 1.0469, + "step": 24400 + }, + { + "epoch": 1.23, + "grad_norm": 6.872804164886475, + "learning_rate": 1.5177889447236182e-05, + "loss": 1.0795, + "step": 24500 + }, + { + "epoch": 1.23, + "grad_norm": 5.834617614746094, + "learning_rate": 1.515778894472362e-05, + "loss": 1.0827, + "step": 24600 + }, + { + "epoch": 1.23, + "grad_norm": 4.008932590484619, + "learning_rate": 1.5137688442211056e-05, + "loss": 1.069, + "step": 24700 + }, + { + "epoch": 1.24, + "grad_norm": 5.309475898742676, + "learning_rate": 1.5117587939698494e-05, + "loss": 1.0668, + "step": 24800 + }, + { + "epoch": 1.25, + "grad_norm": 6.02021598815918, + "learning_rate": 1.5097487437185932e-05, + "loss": 1.0611, + "step": 24900 + }, + { + "epoch": 1.25, + "grad_norm": 4.143280029296875, + "learning_rate": 1.5077587939698495e-05, + "loss": 1.0526, + "step": 25000 + }, + { + "epoch": 1.25, + "grad_norm": 4.231622695922852, + "learning_rate": 1.505748743718593e-05, + "loss": 1.0706, + "step": 25100 + }, + { + "epoch": 1.26, + "grad_norm": 4.0399322509765625, + "learning_rate": 1.5037386934673369e-05, + "loss": 1.0878, + "step": 25200 + }, + { + "epoch": 1.27, + "grad_norm": 4.2283759117126465, + "learning_rate": 1.5017286432160805e-05, + "loss": 1.0903, + "step": 25300 + }, + { + "epoch": 1.27, + "grad_norm": 6.159567356109619, + "learning_rate": 1.4997185929648241e-05, + "loss": 1.069, + "step": 25400 + }, + { + "epoch": 1.27, + "grad_norm": 5.181605815887451, + "learning_rate": 1.4977085427135681e-05, + "loss": 1.0712, + "step": 25500 + }, + { + "epoch": 1.28, + "grad_norm": 4.90966796875, + "learning_rate": 1.4956984924623117e-05, + "loss": 1.0672, + "step": 25600 + }, + { + "epoch": 1.28, + "grad_norm": 4.765697479248047, + "learning_rate": 1.4936884422110554e-05, + "loss": 1.0338, + "step": 25700 + }, + { + "epoch": 1.29, + "grad_norm": 4.3462018966674805, + "learning_rate": 1.491678391959799e-05, + "loss": 1.0408, + "step": 25800 + }, + { + "epoch": 1.29, + "grad_norm": 5.249480247497559, + "learning_rate": 1.489668341708543e-05, + "loss": 1.0576, + "step": 25900 + }, + { + "epoch": 1.3, + "grad_norm": 5.543900489807129, + "learning_rate": 1.4876582914572866e-05, + "loss": 1.0651, + "step": 26000 + }, + { + "epoch": 1.3, + "grad_norm": 6.526113033294678, + "learning_rate": 1.4856482412060302e-05, + "loss": 1.0596, + "step": 26100 + }, + { + "epoch": 1.31, + "grad_norm": 4.725895404815674, + "learning_rate": 1.4836381909547738e-05, + "loss": 1.0969, + "step": 26200 + }, + { + "epoch": 1.31, + "grad_norm": 6.068490028381348, + "learning_rate": 1.4816281407035178e-05, + "loss": 1.0284, + "step": 26300 + }, + { + "epoch": 1.32, + "grad_norm": 4.363389015197754, + "learning_rate": 1.4796180904522614e-05, + "loss": 1.0589, + "step": 26400 + }, + { + "epoch": 1.32, + "grad_norm": 6.8659257888793945, + "learning_rate": 1.477608040201005e-05, + "loss": 1.0803, + "step": 26500 + }, + { + "epoch": 1.33, + "grad_norm": 5.061355113983154, + "learning_rate": 1.4755979899497489e-05, + "loss": 1.066, + "step": 26600 + }, + { + "epoch": 1.33, + "grad_norm": 4.511940956115723, + "learning_rate": 1.4735879396984927e-05, + "loss": 1.0447, + "step": 26700 + }, + { + "epoch": 1.34, + "grad_norm": 4.449003219604492, + "learning_rate": 1.4715778894472363e-05, + "loss": 1.0532, + "step": 26800 + }, + { + "epoch": 1.34, + "grad_norm": 5.1782307624816895, + "learning_rate": 1.46956783919598e-05, + "loss": 1.0608, + "step": 26900 + }, + { + "epoch": 1.35, + "grad_norm": 5.087260723114014, + "learning_rate": 1.4675577889447237e-05, + "loss": 1.0371, + "step": 27000 + }, + { + "epoch": 1.35, + "grad_norm": 4.387496471405029, + "learning_rate": 1.4655477386934675e-05, + "loss": 1.055, + "step": 27100 + }, + { + "epoch": 1.36, + "grad_norm": 4.9253010749816895, + "learning_rate": 1.4635376884422113e-05, + "loss": 1.0385, + "step": 27200 + }, + { + "epoch": 1.36, + "grad_norm": 4.611992835998535, + "learning_rate": 1.461527638190955e-05, + "loss": 1.0338, + "step": 27300 + }, + { + "epoch": 1.37, + "grad_norm": 2.981304168701172, + "learning_rate": 1.4595175879396986e-05, + "loss": 1.0516, + "step": 27400 + }, + { + "epoch": 1.38, + "grad_norm": 5.678966045379639, + "learning_rate": 1.4575075376884422e-05, + "loss": 1.0788, + "step": 27500 + }, + { + "epoch": 1.38, + "grad_norm": 5.3079752922058105, + "learning_rate": 1.4554974874371862e-05, + "loss": 1.0853, + "step": 27600 + }, + { + "epoch": 1.39, + "grad_norm": 5.990561485290527, + "learning_rate": 1.4534874371859298e-05, + "loss": 1.0187, + "step": 27700 + }, + { + "epoch": 1.39, + "grad_norm": 7.396142482757568, + "learning_rate": 1.4514773869346734e-05, + "loss": 1.0694, + "step": 27800 + }, + { + "epoch": 1.4, + "grad_norm": 4.319200038909912, + "learning_rate": 1.449467336683417e-05, + "loss": 1.0668, + "step": 27900 + }, + { + "epoch": 1.4, + "grad_norm": 2.7691450119018555, + "learning_rate": 1.447457286432161e-05, + "loss": 1.0652, + "step": 28000 + }, + { + "epoch": 1.41, + "grad_norm": 8.814241409301758, + "learning_rate": 1.4454472361809046e-05, + "loss": 1.0423, + "step": 28100 + }, + { + "epoch": 1.41, + "grad_norm": 5.264801979064941, + "learning_rate": 1.4434371859296483e-05, + "loss": 1.0918, + "step": 28200 + }, + { + "epoch": 1.42, + "grad_norm": 4.573727130889893, + "learning_rate": 1.441427135678392e-05, + "loss": 1.0822, + "step": 28300 + }, + { + "epoch": 1.42, + "grad_norm": 3.6568844318389893, + "learning_rate": 1.4394170854271359e-05, + "loss": 1.0492, + "step": 28400 + }, + { + "epoch": 1.43, + "grad_norm": 4.999285697937012, + "learning_rate": 1.437427135678392e-05, + "loss": 1.0583, + "step": 28500 + }, + { + "epoch": 1.43, + "grad_norm": 4.125443458557129, + "learning_rate": 1.4354170854271359e-05, + "loss": 1.0422, + "step": 28600 + }, + { + "epoch": 1.44, + "grad_norm": 6.014279365539551, + "learning_rate": 1.4334070351758795e-05, + "loss": 1.0347, + "step": 28700 + }, + { + "epoch": 1.44, + "grad_norm": 8.18229866027832, + "learning_rate": 1.4313969849246232e-05, + "loss": 1.0133, + "step": 28800 + }, + { + "epoch": 1.45, + "grad_norm": 3.3756470680236816, + "learning_rate": 1.4294070351758796e-05, + "loss": 1.0684, + "step": 28900 + }, + { + "epoch": 1.45, + "grad_norm": 5.568530559539795, + "learning_rate": 1.4273969849246232e-05, + "loss": 1.0666, + "step": 29000 + }, + { + "epoch": 1.46, + "grad_norm": 4.440110683441162, + "learning_rate": 1.4253869346733668e-05, + "loss": 1.057, + "step": 29100 + }, + { + "epoch": 1.46, + "grad_norm": 6.835775852203369, + "learning_rate": 1.4233768844221108e-05, + "loss": 1.0176, + "step": 29200 + }, + { + "epoch": 1.47, + "grad_norm": 5.715722560882568, + "learning_rate": 1.4213668341708544e-05, + "loss": 1.0996, + "step": 29300 + }, + { + "epoch": 1.47, + "grad_norm": 6.401480674743652, + "learning_rate": 1.419356783919598e-05, + "loss": 1.0459, + "step": 29400 + }, + { + "epoch": 1.48, + "grad_norm": 7.125598430633545, + "learning_rate": 1.4173467336683417e-05, + "loss": 1.0067, + "step": 29500 + }, + { + "epoch": 1.48, + "grad_norm": 5.287647724151611, + "learning_rate": 1.4153366834170856e-05, + "loss": 1.0475, + "step": 29600 + }, + { + "epoch": 1.48, + "grad_norm": 5.175357818603516, + "learning_rate": 1.4133266331658293e-05, + "loss": 1.0361, + "step": 29700 + }, + { + "epoch": 1.49, + "grad_norm": 4.676697731018066, + "learning_rate": 1.4113165829145729e-05, + "loss": 0.9925, + "step": 29800 + }, + { + "epoch": 1.5, + "grad_norm": 4.375120162963867, + "learning_rate": 1.4093065326633167e-05, + "loss": 1.0145, + "step": 29900 + }, + { + "epoch": 1.5, + "grad_norm": 4.380770683288574, + "learning_rate": 1.4072964824120605e-05, + "loss": 1.0763, + "step": 30000 + }, + { + "epoch": 1.5, + "eval_loss": 1.0519436597824097, + "eval_runtime": 21.613, + "eval_samples_per_second": 46.269, + "eval_steps_per_second": 5.784, + "step": 30000 + }, + { + "epoch": 1.5, + "grad_norm": 5.796531677246094, + "learning_rate": 1.4052864321608041e-05, + "loss": 1.0563, + "step": 30100 + }, + { + "epoch": 1.51, + "grad_norm": 2.713714361190796, + "learning_rate": 1.4032763819095479e-05, + "loss": 1.0549, + "step": 30200 + }, + { + "epoch": 1.52, + "grad_norm": 6.333755016326904, + "learning_rate": 1.4012663316582915e-05, + "loss": 1.042, + "step": 30300 + }, + { + "epoch": 1.52, + "grad_norm": 3.8109474182128906, + "learning_rate": 1.3992562814070353e-05, + "loss": 1.0773, + "step": 30400 + }, + { + "epoch": 1.52, + "grad_norm": 6.425621509552002, + "learning_rate": 1.3972462311557791e-05, + "loss": 1.0066, + "step": 30500 + }, + { + "epoch": 1.53, + "grad_norm": 4.9127607345581055, + "learning_rate": 1.3952361809045228e-05, + "loss": 1.0022, + "step": 30600 + }, + { + "epoch": 1.54, + "grad_norm": 4.212081432342529, + "learning_rate": 1.3932261306532664e-05, + "loss": 1.0358, + "step": 30700 + }, + { + "epoch": 1.54, + "grad_norm": 7.6413187980651855, + "learning_rate": 1.39121608040201e-05, + "loss": 1.0413, + "step": 30800 + }, + { + "epoch": 1.54, + "grad_norm": 4.2576494216918945, + "learning_rate": 1.389206030150754e-05, + "loss": 1.0332, + "step": 30900 + }, + { + "epoch": 1.55, + "grad_norm": 4.797669887542725, + "learning_rate": 1.3871959798994976e-05, + "loss": 1.0396, + "step": 31000 + }, + { + "epoch": 1.56, + "grad_norm": 5.891973972320557, + "learning_rate": 1.3851859296482412e-05, + "loss": 1.0281, + "step": 31100 + }, + { + "epoch": 1.56, + "grad_norm": 5.9344964027404785, + "learning_rate": 1.3831758793969849e-05, + "loss": 1.024, + "step": 31200 + }, + { + "epoch": 1.56, + "grad_norm": 4.902309417724609, + "learning_rate": 1.3811658291457288e-05, + "loss": 1.027, + "step": 31300 + }, + { + "epoch": 1.57, + "grad_norm": 6.387609958648682, + "learning_rate": 1.3791557788944725e-05, + "loss": 1.0207, + "step": 31400 + }, + { + "epoch": 1.57, + "grad_norm": 5.870815277099609, + "learning_rate": 1.3771457286432161e-05, + "loss": 1.0128, + "step": 31500 + }, + { + "epoch": 1.58, + "grad_norm": 6.101361274719238, + "learning_rate": 1.3751356783919599e-05, + "loss": 1.0412, + "step": 31600 + }, + { + "epoch": 1.58, + "grad_norm": 5.250607967376709, + "learning_rate": 1.3731256281407037e-05, + "loss": 1.0146, + "step": 31700 + }, + { + "epoch": 1.59, + "grad_norm": 5.449378967285156, + "learning_rate": 1.3711155778894473e-05, + "loss": 1.03, + "step": 31800 + }, + { + "epoch": 1.59, + "grad_norm": 4.564045429229736, + "learning_rate": 1.3691055276381911e-05, + "loss": 1.0567, + "step": 31900 + }, + { + "epoch": 1.6, + "grad_norm": 5.84417200088501, + "learning_rate": 1.3670954773869347e-05, + "loss": 1.0782, + "step": 32000 + }, + { + "epoch": 1.6, + "grad_norm": 4.725462436676025, + "learning_rate": 1.3650854271356785e-05, + "loss": 1.0327, + "step": 32100 + }, + { + "epoch": 1.61, + "grad_norm": 6.999115943908691, + "learning_rate": 1.3630753768844223e-05, + "loss": 1.0405, + "step": 32200 + }, + { + "epoch": 1.61, + "grad_norm": 4.241363525390625, + "learning_rate": 1.361065326633166e-05, + "loss": 1.02, + "step": 32300 + }, + { + "epoch": 1.62, + "grad_norm": 5.884255886077881, + "learning_rate": 1.3590552763819096e-05, + "loss": 1.0634, + "step": 32400 + }, + { + "epoch": 1.62, + "grad_norm": 3.674698829650879, + "learning_rate": 1.3570452261306536e-05, + "loss": 1.0389, + "step": 32500 + }, + { + "epoch": 1.63, + "grad_norm": 4.227616310119629, + "learning_rate": 1.3550351758793972e-05, + "loss": 0.9992, + "step": 32600 + }, + { + "epoch": 1.64, + "grad_norm": 4.682816982269287, + "learning_rate": 1.3530251256281408e-05, + "loss": 1.0111, + "step": 32700 + }, + { + "epoch": 1.64, + "grad_norm": 4.632464408874512, + "learning_rate": 1.3510150753768844e-05, + "loss": 1.0223, + "step": 32800 + }, + { + "epoch": 1.65, + "grad_norm": 6.061766147613525, + "learning_rate": 1.349005025125628e-05, + "loss": 0.9837, + "step": 32900 + }, + { + "epoch": 1.65, + "grad_norm": 5.4998908042907715, + "learning_rate": 1.346994974874372e-05, + "loss": 1.041, + "step": 33000 + }, + { + "epoch": 1.66, + "grad_norm": 6.294175624847412, + "learning_rate": 1.3449849246231157e-05, + "loss": 1.0311, + "step": 33100 + }, + { + "epoch": 1.66, + "grad_norm": 5.177206039428711, + "learning_rate": 1.3429748743718593e-05, + "loss": 1.0435, + "step": 33200 + }, + { + "epoch": 1.67, + "grad_norm": 4.389501571655273, + "learning_rate": 1.3409648241206031e-05, + "loss": 1.0104, + "step": 33300 + }, + { + "epoch": 1.67, + "grad_norm": 5.105901718139648, + "learning_rate": 1.3389547738693469e-05, + "loss": 0.9868, + "step": 33400 + }, + { + "epoch": 1.68, + "grad_norm": 3.407482147216797, + "learning_rate": 1.3369447236180905e-05, + "loss": 1.0559, + "step": 33500 + }, + { + "epoch": 1.68, + "grad_norm": 6.496652126312256, + "learning_rate": 1.3349346733668343e-05, + "loss": 0.9849, + "step": 33600 + }, + { + "epoch": 1.69, + "grad_norm": 6.241397857666016, + "learning_rate": 1.332924623115578e-05, + "loss": 0.9995, + "step": 33700 + }, + { + "epoch": 1.69, + "grad_norm": 5.998499870300293, + "learning_rate": 1.3309145728643217e-05, + "loss": 1.0355, + "step": 33800 + }, + { + "epoch": 1.69, + "grad_norm": 5.380569934844971, + "learning_rate": 1.3289045226130655e-05, + "loss": 1.0082, + "step": 33900 + }, + { + "epoch": 1.7, + "grad_norm": 5.168824195861816, + "learning_rate": 1.3268944723618092e-05, + "loss": 1.052, + "step": 34000 + }, + { + "epoch": 1.71, + "grad_norm": 5.691008567810059, + "learning_rate": 1.3248844221105528e-05, + "loss": 1.0424, + "step": 34100 + }, + { + "epoch": 1.71, + "grad_norm": 5.678094387054443, + "learning_rate": 1.3228743718592968e-05, + "loss": 1.0083, + "step": 34200 + }, + { + "epoch": 1.71, + "grad_norm": 6.432235240936279, + "learning_rate": 1.3208643216080404e-05, + "loss": 0.9766, + "step": 34300 + }, + { + "epoch": 1.72, + "grad_norm": 6.016462326049805, + "learning_rate": 1.318854271356784e-05, + "loss": 1.0059, + "step": 34400 + }, + { + "epoch": 1.73, + "grad_norm": 4.596778392791748, + "learning_rate": 1.3168442211055276e-05, + "loss": 0.9462, + "step": 34500 + }, + { + "epoch": 1.73, + "grad_norm": 7.2965850830078125, + "learning_rate": 1.3148341708542716e-05, + "loss": 0.972, + "step": 34600 + }, + { + "epoch": 1.73, + "grad_norm": 5.232773780822754, + "learning_rate": 1.3128241206030152e-05, + "loss": 1.0532, + "step": 34700 + }, + { + "epoch": 1.74, + "grad_norm": 5.5057783126831055, + "learning_rate": 1.3108140703517589e-05, + "loss": 0.9835, + "step": 34800 + }, + { + "epoch": 1.75, + "grad_norm": 3.0561375617980957, + "learning_rate": 1.3088040201005025e-05, + "loss": 1.0293, + "step": 34900 + }, + { + "epoch": 1.75, + "grad_norm": 4.761837959289551, + "learning_rate": 1.3068140703517589e-05, + "loss": 1.0232, + "step": 35000 + }, + { + "epoch": 1.75, + "grad_norm": 7.006007671356201, + "learning_rate": 1.3048040201005025e-05, + "loss": 0.9945, + "step": 35100 + }, + { + "epoch": 1.76, + "grad_norm": 4.829462051391602, + "learning_rate": 1.3027939698492465e-05, + "loss": 1.0589, + "step": 35200 + }, + { + "epoch": 1.77, + "grad_norm": 3.8825013637542725, + "learning_rate": 1.3007839195979901e-05, + "loss": 0.9984, + "step": 35300 + }, + { + "epoch": 1.77, + "grad_norm": 5.655978202819824, + "learning_rate": 1.2987738693467338e-05, + "loss": 1.0004, + "step": 35400 + }, + { + "epoch": 1.77, + "grad_norm": 5.612642765045166, + "learning_rate": 1.2967638190954774e-05, + "loss": 0.9874, + "step": 35500 + }, + { + "epoch": 1.78, + "grad_norm": 17.78661346435547, + "learning_rate": 1.2947537688442212e-05, + "loss": 1.0322, + "step": 35600 + }, + { + "epoch": 1.79, + "grad_norm": 4.723743915557861, + "learning_rate": 1.292743718592965e-05, + "loss": 0.9984, + "step": 35700 + }, + { + "epoch": 1.79, + "grad_norm": 5.048336982727051, + "learning_rate": 1.2907336683417086e-05, + "loss": 1.0588, + "step": 35800 + }, + { + "epoch": 1.79, + "grad_norm": 6.086093425750732, + "learning_rate": 1.2887236180904524e-05, + "loss": 1.0075, + "step": 35900 + }, + { + "epoch": 1.8, + "grad_norm": 6.542403697967529, + "learning_rate": 1.286713567839196e-05, + "loss": 1.0219, + "step": 36000 + }, + { + "epoch": 1.81, + "grad_norm": 5.013860702514648, + "learning_rate": 1.2847035175879398e-05, + "loss": 1.0307, + "step": 36100 + }, + { + "epoch": 1.81, + "grad_norm": 5.978675365447998, + "learning_rate": 1.2826934673366835e-05, + "loss": 1.0026, + "step": 36200 + }, + { + "epoch": 1.81, + "grad_norm": 6.217547416687012, + "learning_rate": 1.2806834170854273e-05, + "loss": 1.0196, + "step": 36300 + }, + { + "epoch": 1.82, + "grad_norm": 4.577905654907227, + "learning_rate": 1.2786733668341709e-05, + "loss": 0.9767, + "step": 36400 + }, + { + "epoch": 1.82, + "grad_norm": 4.999172210693359, + "learning_rate": 1.2766633165829147e-05, + "loss": 1.0261, + "step": 36500 + }, + { + "epoch": 1.83, + "grad_norm": 3.3435771465301514, + "learning_rate": 1.2746532663316585e-05, + "loss": 0.9751, + "step": 36600 + }, + { + "epoch": 1.83, + "grad_norm": 6.218837261199951, + "learning_rate": 1.2726432160804021e-05, + "loss": 0.9887, + "step": 36700 + }, + { + "epoch": 1.84, + "grad_norm": 2.914499044418335, + "learning_rate": 1.2706331658291457e-05, + "loss": 1.0172, + "step": 36800 + }, + { + "epoch": 1.84, + "grad_norm": 4.287944793701172, + "learning_rate": 1.2686231155778897e-05, + "loss": 1.0336, + "step": 36900 + }, + { + "epoch": 1.85, + "grad_norm": 9.045112609863281, + "learning_rate": 1.2666331658291458e-05, + "loss": 0.9966, + "step": 37000 + }, + { + "epoch": 1.85, + "grad_norm": 3.9664063453674316, + "learning_rate": 1.2646231155778896e-05, + "loss": 1.0315, + "step": 37100 + }, + { + "epoch": 1.86, + "grad_norm": 5.26336145401001, + "learning_rate": 1.2626130653266334e-05, + "loss": 1.031, + "step": 37200 + }, + { + "epoch": 1.86, + "grad_norm": 5.820954322814941, + "learning_rate": 1.260603015075377e-05, + "loss": 0.9786, + "step": 37300 + }, + { + "epoch": 1.87, + "grad_norm": 3.7999236583709717, + "learning_rate": 1.2585929648241206e-05, + "loss": 1.0008, + "step": 37400 + }, + { + "epoch": 1.88, + "grad_norm": 4.96231746673584, + "learning_rate": 1.2565829145728646e-05, + "loss": 0.9823, + "step": 37500 + }, + { + "epoch": 1.88, + "grad_norm": 5.442008018493652, + "learning_rate": 1.2545728643216082e-05, + "loss": 0.9993, + "step": 37600 + }, + { + "epoch": 1.89, + "grad_norm": 3.0178353786468506, + "learning_rate": 1.2525628140703518e-05, + "loss": 1.009, + "step": 37700 + }, + { + "epoch": 1.89, + "grad_norm": 4.0404052734375, + "learning_rate": 1.2505527638190955e-05, + "loss": 1.0047, + "step": 37800 + }, + { + "epoch": 1.9, + "grad_norm": 3.924924850463867, + "learning_rate": 1.2485427135678394e-05, + "loss": 0.9681, + "step": 37900 + }, + { + "epoch": 1.9, + "grad_norm": 6.560153961181641, + "learning_rate": 1.246532663316583e-05, + "loss": 0.9346, + "step": 38000 + }, + { + "epoch": 1.91, + "grad_norm": 4.826027870178223, + "learning_rate": 1.2445226130653267e-05, + "loss": 0.9878, + "step": 38100 + }, + { + "epoch": 1.91, + "grad_norm": 3.489680767059326, + "learning_rate": 1.2425125628140703e-05, + "loss": 0.9943, + "step": 38200 + }, + { + "epoch": 1.92, + "grad_norm": 4.7767014503479, + "learning_rate": 1.2405025125628141e-05, + "loss": 1.02, + "step": 38300 + }, + { + "epoch": 1.92, + "grad_norm": 7.311853408813477, + "learning_rate": 1.238492462311558e-05, + "loss": 0.946, + "step": 38400 + }, + { + "epoch": 1.93, + "grad_norm": 4.217949390411377, + "learning_rate": 1.236502512562814e-05, + "loss": 0.974, + "step": 38500 + }, + { + "epoch": 1.93, + "grad_norm": 8.919093132019043, + "learning_rate": 1.234492462311558e-05, + "loss": 0.9628, + "step": 38600 + }, + { + "epoch": 1.94, + "grad_norm": 4.355369567871094, + "learning_rate": 1.2324824120603016e-05, + "loss": 0.9325, + "step": 38700 + }, + { + "epoch": 1.94, + "grad_norm": 5.474518775939941, + "learning_rate": 1.2304723618090452e-05, + "loss": 0.9505, + "step": 38800 + }, + { + "epoch": 1.94, + "grad_norm": 6.389540195465088, + "learning_rate": 1.228462311557789e-05, + "loss": 0.9574, + "step": 38900 + }, + { + "epoch": 1.95, + "grad_norm": 6.9164719581604, + "learning_rate": 1.2264522613065328e-05, + "loss": 0.9644, + "step": 39000 + }, + { + "epoch": 1.96, + "grad_norm": 4.559136390686035, + "learning_rate": 1.2244422110552764e-05, + "loss": 1.0306, + "step": 39100 + }, + { + "epoch": 1.96, + "grad_norm": 6.381926536560059, + "learning_rate": 1.2224321608040202e-05, + "loss": 0.9542, + "step": 39200 + }, + { + "epoch": 1.96, + "grad_norm": 7.826279163360596, + "learning_rate": 1.2204221105527639e-05, + "loss": 0.9818, + "step": 39300 + }, + { + "epoch": 1.97, + "grad_norm": 5.7296929359436035, + "learning_rate": 1.2184120603015077e-05, + "loss": 0.9591, + "step": 39400 + }, + { + "epoch": 1.98, + "grad_norm": 6.479053974151611, + "learning_rate": 1.2164020100502515e-05, + "loss": 1.0083, + "step": 39500 + }, + { + "epoch": 1.98, + "grad_norm": 5.9377241134643555, + "learning_rate": 1.2144120603015077e-05, + "loss": 0.9969, + "step": 39600 + }, + { + "epoch": 1.98, + "grad_norm": 4.59481143951416, + "learning_rate": 1.2124020100502513e-05, + "loss": 1.015, + "step": 39700 + }, + { + "epoch": 1.99, + "grad_norm": 4.979703903198242, + "learning_rate": 1.2103919597989951e-05, + "loss": 0.977, + "step": 39800 + }, + { + "epoch": 2.0, + "grad_norm": 6.539973735809326, + "learning_rate": 1.2083819095477388e-05, + "loss": 0.9938, + "step": 39900 + }, + { + "epoch": 2.0, + "grad_norm": 5.971490383148193, + "learning_rate": 1.2063718592964825e-05, + "loss": 0.9848, + "step": 40000 + }, + { + "epoch": 2.0, + "eval_loss": 0.9915822744369507, + "eval_runtime": 21.5957, + "eval_samples_per_second": 46.305, + "eval_steps_per_second": 5.788, + "step": 40000 + }, + { + "epoch": 2.0, + "grad_norm": 5.639512538909912, + "learning_rate": 1.2043618090452262e-05, + "loss": 0.9401, + "step": 40100 + }, + { + "epoch": 2.01, + "grad_norm": 3.0007824897766113, + "learning_rate": 1.20235175879397e-05, + "loss": 0.9769, + "step": 40200 + }, + { + "epoch": 2.02, + "grad_norm": 4.346365451812744, + "learning_rate": 1.2003417085427136e-05, + "loss": 0.9247, + "step": 40300 + }, + { + "epoch": 2.02, + "grad_norm": 6.308602809906006, + "learning_rate": 1.1983316582914574e-05, + "loss": 0.9685, + "step": 40400 + }, + { + "epoch": 2.02, + "grad_norm": 4.597143173217773, + "learning_rate": 1.1963216080402012e-05, + "loss": 0.907, + "step": 40500 + }, + { + "epoch": 2.03, + "grad_norm": 6.000264644622803, + "learning_rate": 1.1943115577889448e-05, + "loss": 0.9311, + "step": 40600 + }, + { + "epoch": 2.04, + "grad_norm": 4.718263149261475, + "learning_rate": 1.1923015075376885e-05, + "loss": 0.9707, + "step": 40700 + }, + { + "epoch": 2.04, + "grad_norm": 3.7472355365753174, + "learning_rate": 1.1902914572864324e-05, + "loss": 0.9812, + "step": 40800 + }, + { + "epoch": 2.04, + "grad_norm": 4.8061017990112305, + "learning_rate": 1.188281407035176e-05, + "loss": 0.9461, + "step": 40900 + }, + { + "epoch": 2.05, + "grad_norm": 5.2381391525268555, + "learning_rate": 1.1862713567839197e-05, + "loss": 0.9972, + "step": 41000 + }, + { + "epoch": 2.06, + "grad_norm": 6.1567583084106445, + "learning_rate": 1.1842613065326633e-05, + "loss": 0.9611, + "step": 41100 + }, + { + "epoch": 2.06, + "grad_norm": 5.496160984039307, + "learning_rate": 1.1822512562814071e-05, + "loss": 0.9612, + "step": 41200 + }, + { + "epoch": 2.06, + "grad_norm": 6.659996509552002, + "learning_rate": 1.1802412060301509e-05, + "loss": 0.9593, + "step": 41300 + }, + { + "epoch": 2.07, + "grad_norm": 7.010763645172119, + "learning_rate": 1.1782311557788945e-05, + "loss": 0.9079, + "step": 41400 + }, + { + "epoch": 2.08, + "grad_norm": 5.539340496063232, + "learning_rate": 1.1762211055276383e-05, + "loss": 0.946, + "step": 41500 + }, + { + "epoch": 2.08, + "grad_norm": 4.7269368171691895, + "learning_rate": 1.174211055276382e-05, + "loss": 0.9702, + "step": 41600 + }, + { + "epoch": 2.08, + "grad_norm": 6.573697090148926, + "learning_rate": 1.1722010050251257e-05, + "loss": 0.9166, + "step": 41700 + }, + { + "epoch": 2.09, + "grad_norm": 5.467616558074951, + "learning_rate": 1.1701909547738694e-05, + "loss": 0.9479, + "step": 41800 + }, + { + "epoch": 2.1, + "grad_norm": 7.292219638824463, + "learning_rate": 1.1681809045226132e-05, + "loss": 0.9694, + "step": 41900 + }, + { + "epoch": 2.1, + "grad_norm": 5.9063849449157715, + "learning_rate": 1.1661708542713568e-05, + "loss": 0.9467, + "step": 42000 + }, + { + "epoch": 2.1, + "grad_norm": 7.106956958770752, + "learning_rate": 1.1641608040201006e-05, + "loss": 0.9344, + "step": 42100 + }, + { + "epoch": 2.11, + "grad_norm": 2.7898268699645996, + "learning_rate": 1.1621507537688444e-05, + "loss": 0.9174, + "step": 42200 + }, + { + "epoch": 2.12, + "grad_norm": 5.543144226074219, + "learning_rate": 1.160140703517588e-05, + "loss": 0.9399, + "step": 42300 + }, + { + "epoch": 2.12, + "grad_norm": 4.507541656494141, + "learning_rate": 1.1581306532663317e-05, + "loss": 0.8989, + "step": 42400 + }, + { + "epoch": 2.12, + "grad_norm": 7.4493937492370605, + "learning_rate": 1.1561206030150756e-05, + "loss": 0.9663, + "step": 42500 + }, + { + "epoch": 2.13, + "grad_norm": 5.758662700653076, + "learning_rate": 1.1541105527638192e-05, + "loss": 0.983, + "step": 42600 + }, + { + "epoch": 2.13, + "grad_norm": 4.6601386070251465, + "learning_rate": 1.1521005025125629e-05, + "loss": 0.936, + "step": 42700 + }, + { + "epoch": 2.14, + "grad_norm": 6.687641620635986, + "learning_rate": 1.1500904522613065e-05, + "loss": 0.9452, + "step": 42800 + }, + { + "epoch": 2.15, + "grad_norm": 6.454759120941162, + "learning_rate": 1.1480804020100505e-05, + "loss": 0.9494, + "step": 42900 + }, + { + "epoch": 2.15, + "grad_norm": 6.235274314880371, + "learning_rate": 1.1460703517587941e-05, + "loss": 0.9107, + "step": 43000 + }, + { + "epoch": 2.15, + "grad_norm": 6.445216655731201, + "learning_rate": 1.1440603015075377e-05, + "loss": 0.9448, + "step": 43100 + }, + { + "epoch": 2.16, + "grad_norm": 4.565326690673828, + "learning_rate": 1.1420502512562814e-05, + "loss": 0.9435, + "step": 43200 + }, + { + "epoch": 2.17, + "grad_norm": 4.653913497924805, + "learning_rate": 1.1400402010050253e-05, + "loss": 0.9492, + "step": 43300 + }, + { + "epoch": 2.17, + "grad_norm": 4.022702693939209, + "learning_rate": 1.138030150753769e-05, + "loss": 0.9365, + "step": 43400 + }, + { + "epoch": 2.17, + "grad_norm": 6.998848915100098, + "learning_rate": 1.1360201005025126e-05, + "loss": 0.9215, + "step": 43500 + }, + { + "epoch": 2.18, + "grad_norm": 3.925429344177246, + "learning_rate": 1.1340100502512564e-05, + "loss": 0.9408, + "step": 43600 + }, + { + "epoch": 2.19, + "grad_norm": 5.22701358795166, + "learning_rate": 1.132e-05, + "loss": 0.9755, + "step": 43700 + }, + { + "epoch": 2.19, + "grad_norm": 5.142667293548584, + "learning_rate": 1.1299899497487438e-05, + "loss": 0.8938, + "step": 43800 + }, + { + "epoch": 2.19, + "grad_norm": 5.2655158042907715, + "learning_rate": 1.1279798994974876e-05, + "loss": 0.9751, + "step": 43900 + }, + { + "epoch": 2.2, + "grad_norm": 5.084207057952881, + "learning_rate": 1.1259698492462312e-05, + "loss": 0.9141, + "step": 44000 + }, + { + "epoch": 2.21, + "grad_norm": 4.578594207763672, + "learning_rate": 1.1239597989949749e-05, + "loss": 0.9403, + "step": 44100 + }, + { + "epoch": 2.21, + "grad_norm": 3.3010849952697754, + "learning_rate": 1.1219497487437188e-05, + "loss": 0.9657, + "step": 44200 + }, + { + "epoch": 2.21, + "grad_norm": 6.451618194580078, + "learning_rate": 1.1199396984924624e-05, + "loss": 0.9297, + "step": 44300 + }, + { + "epoch": 2.22, + "grad_norm": 5.8492655754089355, + "learning_rate": 1.117929648241206e-05, + "loss": 0.9157, + "step": 44400 + }, + { + "epoch": 2.23, + "grad_norm": 5.015758037567139, + "learning_rate": 1.1159195979899497e-05, + "loss": 0.9508, + "step": 44500 + }, + { + "epoch": 2.23, + "grad_norm": 4.814078330993652, + "learning_rate": 1.1139095477386937e-05, + "loss": 0.9361, + "step": 44600 + }, + { + "epoch": 2.23, + "grad_norm": 5.004156112670898, + "learning_rate": 1.1118994974874373e-05, + "loss": 0.958, + "step": 44700 + }, + { + "epoch": 2.24, + "grad_norm": 5.016057968139648, + "learning_rate": 1.109889447236181e-05, + "loss": 0.9755, + "step": 44800 + }, + { + "epoch": 2.25, + "grad_norm": 5.041826248168945, + "learning_rate": 1.1078793969849246e-05, + "loss": 0.9082, + "step": 44900 + }, + { + "epoch": 2.25, + "grad_norm": 5.787368297576904, + "learning_rate": 1.1058693467336685e-05, + "loss": 0.9076, + "step": 45000 + }, + { + "epoch": 2.25, + "grad_norm": 5.170538902282715, + "learning_rate": 1.1038592964824122e-05, + "loss": 0.9117, + "step": 45100 + }, + { + "epoch": 2.26, + "grad_norm": 7.477475643157959, + "learning_rate": 1.1018492462311558e-05, + "loss": 0.8987, + "step": 45200 + }, + { + "epoch": 2.27, + "grad_norm": 4.626328945159912, + "learning_rate": 1.0998391959798996e-05, + "loss": 0.9197, + "step": 45300 + }, + { + "epoch": 2.27, + "grad_norm": 5.800539016723633, + "learning_rate": 1.0978291457286434e-05, + "loss": 0.9025, + "step": 45400 + }, + { + "epoch": 2.27, + "grad_norm": 4.291562080383301, + "learning_rate": 1.0958391959798994e-05, + "loss": 0.9348, + "step": 45500 + }, + { + "epoch": 2.28, + "grad_norm": 5.439847946166992, + "learning_rate": 1.0938291457286434e-05, + "loss": 0.9416, + "step": 45600 + }, + { + "epoch": 2.29, + "grad_norm": 5.728611946105957, + "learning_rate": 1.091819095477387e-05, + "loss": 0.9124, + "step": 45700 + }, + { + "epoch": 2.29, + "grad_norm": 3.7975008487701416, + "learning_rate": 1.0898090452261307e-05, + "loss": 0.9345, + "step": 45800 + }, + { + "epoch": 2.29, + "grad_norm": 7.625438690185547, + "learning_rate": 1.0877989949748745e-05, + "loss": 0.8925, + "step": 45900 + }, + { + "epoch": 2.3, + "grad_norm": 4.858023643493652, + "learning_rate": 1.0857889447236183e-05, + "loss": 0.9103, + "step": 46000 + }, + { + "epoch": 2.31, + "grad_norm": 6.363548755645752, + "learning_rate": 1.0837788944723619e-05, + "loss": 0.9523, + "step": 46100 + }, + { + "epoch": 2.31, + "grad_norm": 4.639822959899902, + "learning_rate": 1.0817688442211057e-05, + "loss": 0.9322, + "step": 46200 + }, + { + "epoch": 2.31, + "grad_norm": 4.796472072601318, + "learning_rate": 1.0797587939698493e-05, + "loss": 0.9242, + "step": 46300 + }, + { + "epoch": 2.32, + "grad_norm": 3.8870980739593506, + "learning_rate": 1.077748743718593e-05, + "loss": 0.9048, + "step": 46400 + }, + { + "epoch": 2.33, + "grad_norm": 6.010646343231201, + "learning_rate": 1.0757386934673369e-05, + "loss": 0.9566, + "step": 46500 + }, + { + "epoch": 2.33, + "grad_norm": 3.925715684890747, + "learning_rate": 1.0737286432160805e-05, + "loss": 0.9274, + "step": 46600 + }, + { + "epoch": 2.33, + "grad_norm": 5.222326278686523, + "learning_rate": 1.0717185929648242e-05, + "loss": 0.8914, + "step": 46700 + }, + { + "epoch": 2.34, + "grad_norm": 5.363781929016113, + "learning_rate": 1.0697085427135678e-05, + "loss": 0.922, + "step": 46800 + }, + { + "epoch": 2.34, + "grad_norm": 6.332427024841309, + "learning_rate": 1.0676984924623118e-05, + "loss": 0.9017, + "step": 46900 + }, + { + "epoch": 2.35, + "grad_norm": 4.68159818649292, + "learning_rate": 1.0656884422110554e-05, + "loss": 0.9089, + "step": 47000 + }, + { + "epoch": 2.35, + "grad_norm": 4.770488739013672, + "learning_rate": 1.063678391959799e-05, + "loss": 0.9738, + "step": 47100 + }, + { + "epoch": 2.36, + "grad_norm": 6.209041595458984, + "learning_rate": 1.0616683417085426e-05, + "loss": 0.9301, + "step": 47200 + }, + { + "epoch": 2.37, + "grad_norm": 5.330206394195557, + "learning_rate": 1.0596582914572866e-05, + "loss": 0.9515, + "step": 47300 + }, + { + "epoch": 2.37, + "grad_norm": 7.701655387878418, + "learning_rate": 1.0576482412060302e-05, + "loss": 0.9072, + "step": 47400 + }, + { + "epoch": 2.38, + "grad_norm": 4.921889305114746, + "learning_rate": 1.0556381909547739e-05, + "loss": 0.9326, + "step": 47500 + }, + { + "epoch": 2.38, + "grad_norm": 5.353864669799805, + "learning_rate": 1.0536281407035177e-05, + "loss": 0.902, + "step": 47600 + }, + { + "epoch": 2.38, + "grad_norm": 4.63252592086792, + "learning_rate": 1.0516180904522615e-05, + "loss": 0.9357, + "step": 47700 + }, + { + "epoch": 2.39, + "grad_norm": 5.968425750732422, + "learning_rate": 1.0496281407035175e-05, + "loss": 0.9416, + "step": 47800 + }, + { + "epoch": 2.4, + "grad_norm": 5.979503154754639, + "learning_rate": 1.0476180904522615e-05, + "loss": 0.9461, + "step": 47900 + }, + { + "epoch": 2.4, + "grad_norm": 5.928488731384277, + "learning_rate": 1.0456080402010051e-05, + "loss": 0.9045, + "step": 48000 + }, + { + "epoch": 2.41, + "grad_norm": 12.569512367248535, + "learning_rate": 1.0435979899497488e-05, + "loss": 0.9205, + "step": 48100 + }, + { + "epoch": 2.41, + "grad_norm": 4.5606865882873535, + "learning_rate": 1.0415879396984926e-05, + "loss": 0.9005, + "step": 48200 + }, + { + "epoch": 2.42, + "grad_norm": 5.511040210723877, + "learning_rate": 1.0395778894472364e-05, + "loss": 0.9105, + "step": 48300 + }, + { + "epoch": 2.42, + "grad_norm": 5.660979747772217, + "learning_rate": 1.03756783919598e-05, + "loss": 0.8911, + "step": 48400 + }, + { + "epoch": 2.42, + "grad_norm": 3.5720648765563965, + "learning_rate": 1.0355577889447238e-05, + "loss": 0.9468, + "step": 48500 + }, + { + "epoch": 2.43, + "grad_norm": 5.770594120025635, + "learning_rate": 1.0335477386934674e-05, + "loss": 0.9296, + "step": 48600 + }, + { + "epoch": 2.44, + "grad_norm": 4.0545477867126465, + "learning_rate": 1.0315376884422112e-05, + "loss": 0.9133, + "step": 48700 + }, + { + "epoch": 2.44, + "grad_norm": 4.586203575134277, + "learning_rate": 1.0295276381909548e-05, + "loss": 0.906, + "step": 48800 + }, + { + "epoch": 2.44, + "grad_norm": 5.315196514129639, + "learning_rate": 1.0275175879396986e-05, + "loss": 0.9065, + "step": 48900 + }, + { + "epoch": 2.45, + "grad_norm": 5.344489574432373, + "learning_rate": 1.0255075376884423e-05, + "loss": 0.9363, + "step": 49000 + }, + { + "epoch": 2.46, + "grad_norm": 6.762577533721924, + "learning_rate": 1.0234974874371859e-05, + "loss": 0.9366, + "step": 49100 + }, + { + "epoch": 2.46, + "grad_norm": 4.087870121002197, + "learning_rate": 1.0214874371859299e-05, + "loss": 0.8812, + "step": 49200 + }, + { + "epoch": 2.46, + "grad_norm": 5.586741924285889, + "learning_rate": 1.0194773869346735e-05, + "loss": 0.9341, + "step": 49300 + }, + { + "epoch": 2.47, + "grad_norm": 8.180070877075195, + "learning_rate": 1.0174673366834171e-05, + "loss": 0.9381, + "step": 49400 + }, + { + "epoch": 2.48, + "grad_norm": 4.389576435089111, + "learning_rate": 1.0154572864321607e-05, + "loss": 0.9288, + "step": 49500 + }, + { + "epoch": 2.48, + "grad_norm": 4.339807033538818, + "learning_rate": 1.0134472361809047e-05, + "loss": 0.9282, + "step": 49600 + }, + { + "epoch": 2.48, + "grad_norm": 7.801273345947266, + "learning_rate": 1.0114371859296483e-05, + "loss": 0.9657, + "step": 49700 + }, + { + "epoch": 2.49, + "grad_norm": 6.016520977020264, + "learning_rate": 1.009427135678392e-05, + "loss": 0.8704, + "step": 49800 + }, + { + "epoch": 2.5, + "grad_norm": 5.2764506340026855, + "learning_rate": 1.0074170854271358e-05, + "loss": 0.9226, + "step": 49900 + }, + { + "epoch": 2.5, + "grad_norm": 4.923444747924805, + "learning_rate": 1.0054070351758796e-05, + "loss": 0.9084, + "step": 50000 + }, + { + "epoch": 2.5, + "eval_loss": 0.9846327900886536, + "eval_runtime": 21.5925, + "eval_samples_per_second": 46.312, + "eval_steps_per_second": 5.789, + "step": 50000 + }, + { + "epoch": 2.5, + "grad_norm": 6.061006546020508, + "learning_rate": 1.0033969849246232e-05, + "loss": 0.9218, + "step": 50100 + }, + { + "epoch": 2.51, + "grad_norm": 4.1440348625183105, + "learning_rate": 1.0013869346733668e-05, + "loss": 0.9324, + "step": 50200 + }, + { + "epoch": 2.52, + "grad_norm": 4.084045886993408, + "learning_rate": 9.993768844221106e-06, + "loss": 0.8859, + "step": 50300 + }, + { + "epoch": 2.52, + "grad_norm": 3.723971366882324, + "learning_rate": 9.973668341708544e-06, + "loss": 0.9128, + "step": 50400 + }, + { + "epoch": 2.52, + "grad_norm": 3.9887030124664307, + "learning_rate": 9.95356783919598e-06, + "loss": 0.8987, + "step": 50500 + }, + { + "epoch": 2.53, + "grad_norm": 5.572610855102539, + "learning_rate": 9.933467336683418e-06, + "loss": 0.9287, + "step": 50600 + }, + { + "epoch": 2.54, + "grad_norm": 5.956911087036133, + "learning_rate": 9.913366834170856e-06, + "loss": 0.8808, + "step": 50700 + }, + { + "epoch": 2.54, + "grad_norm": 3.948564052581787, + "learning_rate": 9.893266331658293e-06, + "loss": 0.9243, + "step": 50800 + }, + { + "epoch": 2.54, + "grad_norm": 5.561892509460449, + "learning_rate": 9.87316582914573e-06, + "loss": 0.9174, + "step": 50900 + }, + { + "epoch": 2.55, + "grad_norm": 5.9155755043029785, + "learning_rate": 9.853065326633167e-06, + "loss": 0.8951, + "step": 51000 + }, + { + "epoch": 2.56, + "grad_norm": 5.4488348960876465, + "learning_rate": 9.832964824120603e-06, + "loss": 0.8864, + "step": 51100 + }, + { + "epoch": 2.56, + "grad_norm": 4.52565860748291, + "learning_rate": 9.812864321608041e-06, + "loss": 0.9103, + "step": 51200 + }, + { + "epoch": 2.56, + "grad_norm": 3.995807647705078, + "learning_rate": 9.792763819095477e-06, + "loss": 0.8999, + "step": 51300 + }, + { + "epoch": 2.57, + "grad_norm": 9.156529426574707, + "learning_rate": 9.772663316582915e-06, + "loss": 0.9383, + "step": 51400 + }, + { + "epoch": 2.58, + "grad_norm": 6.388377666473389, + "learning_rate": 9.752562814070352e-06, + "loss": 0.908, + "step": 51500 + }, + { + "epoch": 2.58, + "grad_norm": 3.975545644760132, + "learning_rate": 9.73246231155779e-06, + "loss": 0.9006, + "step": 51600 + }, + { + "epoch": 2.58, + "grad_norm": 4.579479694366455, + "learning_rate": 9.712361809045226e-06, + "loss": 0.9443, + "step": 51700 + }, + { + "epoch": 2.59, + "grad_norm": 5.22560977935791, + "learning_rate": 9.69246231155779e-06, + "loss": 0.909, + "step": 51800 + }, + { + "epoch": 2.59, + "grad_norm": 5.2606587409973145, + "learning_rate": 9.672361809045226e-06, + "loss": 0.9255, + "step": 51900 + }, + { + "epoch": 2.6, + "grad_norm": 4.772227764129639, + "learning_rate": 9.652261306532664e-06, + "loss": 0.9161, + "step": 52000 + }, + { + "epoch": 2.6, + "grad_norm": 4.636828899383545, + "learning_rate": 9.6321608040201e-06, + "loss": 0.874, + "step": 52100 + }, + { + "epoch": 2.61, + "grad_norm": 4.5946784019470215, + "learning_rate": 9.612060301507538e-06, + "loss": 0.902, + "step": 52200 + }, + { + "epoch": 2.62, + "grad_norm": 4.0993266105651855, + "learning_rate": 9.591959798994975e-06, + "loss": 0.9369, + "step": 52300 + }, + { + "epoch": 2.62, + "grad_norm": 5.468399524688721, + "learning_rate": 9.571859296482413e-06, + "loss": 0.9359, + "step": 52400 + }, + { + "epoch": 2.62, + "grad_norm": 10.92428970336914, + "learning_rate": 9.551758793969849e-06, + "loss": 0.8889, + "step": 52500 + }, + { + "epoch": 2.63, + "grad_norm": 6.2350029945373535, + "learning_rate": 9.531658291457287e-06, + "loss": 0.9304, + "step": 52600 + }, + { + "epoch": 2.63, + "grad_norm": 4.780547618865967, + "learning_rate": 9.511557788944725e-06, + "loss": 0.9396, + "step": 52700 + }, + { + "epoch": 2.64, + "grad_norm": 3.1009738445281982, + "learning_rate": 9.491457286432161e-06, + "loss": 0.9077, + "step": 52800 + }, + { + "epoch": 2.65, + "grad_norm": 7.036947727203369, + "learning_rate": 9.4713567839196e-06, + "loss": 0.8753, + "step": 52900 + }, + { + "epoch": 2.65, + "grad_norm": 4.945110321044922, + "learning_rate": 9.451256281407035e-06, + "loss": 0.9157, + "step": 53000 + }, + { + "epoch": 2.66, + "grad_norm": 5.361321926116943, + "learning_rate": 9.431155778894473e-06, + "loss": 0.8929, + "step": 53100 + }, + { + "epoch": 2.66, + "grad_norm": 3.351379632949829, + "learning_rate": 9.411055276381911e-06, + "loss": 0.8636, + "step": 53200 + }, + { + "epoch": 2.67, + "grad_norm": 5.34309720993042, + "learning_rate": 9.390954773869348e-06, + "loss": 0.8865, + "step": 53300 + }, + { + "epoch": 2.67, + "grad_norm": 5.316425800323486, + "learning_rate": 9.370854271356786e-06, + "loss": 0.9178, + "step": 53400 + }, + { + "epoch": 2.67, + "grad_norm": 4.478712558746338, + "learning_rate": 9.350753768844222e-06, + "loss": 0.9181, + "step": 53500 + }, + { + "epoch": 2.68, + "grad_norm": 5.095877647399902, + "learning_rate": 9.33065326633166e-06, + "loss": 0.902, + "step": 53600 + }, + { + "epoch": 2.69, + "grad_norm": 4.4164862632751465, + "learning_rate": 9.310552763819096e-06, + "loss": 0.887, + "step": 53700 + }, + { + "epoch": 2.69, + "grad_norm": 6.3961591720581055, + "learning_rate": 9.290452261306533e-06, + "loss": 0.8778, + "step": 53800 + }, + { + "epoch": 2.69, + "grad_norm": 7.141729354858398, + "learning_rate": 9.270552763819097e-06, + "loss": 0.9144, + "step": 53900 + }, + { + "epoch": 2.7, + "grad_norm": 5.858211040496826, + "learning_rate": 9.250452261306535e-06, + "loss": 0.8889, + "step": 54000 + }, + { + "epoch": 2.71, + "grad_norm": 5.192725658416748, + "learning_rate": 9.230351758793971e-06, + "loss": 0.8928, + "step": 54100 + }, + { + "epoch": 2.71, + "grad_norm": 6.190788745880127, + "learning_rate": 9.210251256281407e-06, + "loss": 0.8683, + "step": 54200 + }, + { + "epoch": 2.71, + "grad_norm": 4.610683441162109, + "learning_rate": 9.190150753768845e-06, + "loss": 0.9473, + "step": 54300 + }, + { + "epoch": 2.72, + "grad_norm": 5.043734550476074, + "learning_rate": 9.170050251256281e-06, + "loss": 0.9142, + "step": 54400 + }, + { + "epoch": 2.73, + "grad_norm": 5.166931629180908, + "learning_rate": 9.14994974874372e-06, + "loss": 0.8894, + "step": 54500 + }, + { + "epoch": 2.73, + "grad_norm": 5.05250358581543, + "learning_rate": 9.129849246231156e-06, + "loss": 0.8799, + "step": 54600 + }, + { + "epoch": 2.73, + "grad_norm": 5.468914031982422, + "learning_rate": 9.109748743718594e-06, + "loss": 0.9099, + "step": 54700 + }, + { + "epoch": 2.74, + "grad_norm": 4.162414073944092, + "learning_rate": 9.08964824120603e-06, + "loss": 0.8859, + "step": 54800 + }, + { + "epoch": 2.75, + "grad_norm": 5.149291515350342, + "learning_rate": 9.069547738693468e-06, + "loss": 0.9096, + "step": 54900 + }, + { + "epoch": 2.75, + "grad_norm": 4.889472961425781, + "learning_rate": 9.049447236180904e-06, + "loss": 0.8953, + "step": 55000 + }, + { + "epoch": 2.75, + "grad_norm": 4.146818161010742, + "learning_rate": 9.029346733668342e-06, + "loss": 0.8917, + "step": 55100 + }, + { + "epoch": 2.76, + "grad_norm": 5.937385559082031, + "learning_rate": 9.00924623115578e-06, + "loss": 0.9295, + "step": 55200 + }, + { + "epoch": 2.77, + "grad_norm": 4.749314785003662, + "learning_rate": 8.989145728643216e-06, + "loss": 0.8776, + "step": 55300 + }, + { + "epoch": 2.77, + "grad_norm": 6.271254539489746, + "learning_rate": 8.969045226130654e-06, + "loss": 0.8593, + "step": 55400 + }, + { + "epoch": 2.77, + "grad_norm": 5.769760608673096, + "learning_rate": 8.948944723618092e-06, + "loss": 0.891, + "step": 55500 + }, + { + "epoch": 2.78, + "grad_norm": 4.185112476348877, + "learning_rate": 8.928844221105529e-06, + "loss": 0.8869, + "step": 55600 + }, + { + "epoch": 2.79, + "grad_norm": 3.2164394855499268, + "learning_rate": 8.908743718592967e-06, + "loss": 0.8992, + "step": 55700 + }, + { + "epoch": 2.79, + "grad_norm": 4.406613349914551, + "learning_rate": 8.888643216080403e-06, + "loss": 0.8971, + "step": 55800 + }, + { + "epoch": 2.79, + "grad_norm": 5.101110458374023, + "learning_rate": 8.868542713567841e-06, + "loss": 0.9066, + "step": 55900 + }, + { + "epoch": 2.8, + "grad_norm": 4.963405132293701, + "learning_rate": 8.848643216080403e-06, + "loss": 0.881, + "step": 56000 + }, + { + "epoch": 2.81, + "grad_norm": 7.5268683433532715, + "learning_rate": 8.82854271356784e-06, + "loss": 0.8692, + "step": 56100 + }, + { + "epoch": 2.81, + "grad_norm": 5.325132369995117, + "learning_rate": 8.808442211055278e-06, + "loss": 0.895, + "step": 56200 + }, + { + "epoch": 2.81, + "grad_norm": 4.687073707580566, + "learning_rate": 8.788341708542715e-06, + "loss": 0.9007, + "step": 56300 + }, + { + "epoch": 2.82, + "grad_norm": 4.215831279754639, + "learning_rate": 8.768241206030152e-06, + "loss": 0.8783, + "step": 56400 + }, + { + "epoch": 2.83, + "grad_norm": 6.363833427429199, + "learning_rate": 8.74814070351759e-06, + "loss": 0.9276, + "step": 56500 + }, + { + "epoch": 2.83, + "grad_norm": 4.2875518798828125, + "learning_rate": 8.728040201005026e-06, + "loss": 0.8758, + "step": 56600 + }, + { + "epoch": 2.83, + "grad_norm": 4.461952209472656, + "learning_rate": 8.707939698492464e-06, + "loss": 0.8789, + "step": 56700 + }, + { + "epoch": 2.84, + "grad_norm": 7.590397834777832, + "learning_rate": 8.6878391959799e-06, + "loss": 0.8726, + "step": 56800 + }, + { + "epoch": 2.84, + "grad_norm": 5.754077911376953, + "learning_rate": 8.667738693467337e-06, + "loss": 0.9022, + "step": 56900 + }, + { + "epoch": 2.85, + "grad_norm": 4.305074214935303, + "learning_rate": 8.647638190954775e-06, + "loss": 0.8933, + "step": 57000 + }, + { + "epoch": 2.85, + "grad_norm": 4.451827526092529, + "learning_rate": 8.627738693467337e-06, + "loss": 0.934, + "step": 57100 + }, + { + "epoch": 2.86, + "grad_norm": 6.323834419250488, + "learning_rate": 8.607638190954775e-06, + "loss": 0.8858, + "step": 57200 + }, + { + "epoch": 2.87, + "grad_norm": 6.937102317810059, + "learning_rate": 8.587537688442211e-06, + "loss": 0.9263, + "step": 57300 + }, + { + "epoch": 2.87, + "grad_norm": 7.153318881988525, + "learning_rate": 8.56743718592965e-06, + "loss": 0.8868, + "step": 57400 + }, + { + "epoch": 2.88, + "grad_norm": 4.7994842529296875, + "learning_rate": 8.547336683417085e-06, + "loss": 0.877, + "step": 57500 + }, + { + "epoch": 2.88, + "grad_norm": 6.6480793952941895, + "learning_rate": 8.527236180904523e-06, + "loss": 0.8774, + "step": 57600 + }, + { + "epoch": 2.88, + "grad_norm": 7.170138835906982, + "learning_rate": 8.50713567839196e-06, + "loss": 0.8704, + "step": 57700 + }, + { + "epoch": 2.89, + "grad_norm": 4.006447792053223, + "learning_rate": 8.487035175879398e-06, + "loss": 0.8736, + "step": 57800 + }, + { + "epoch": 2.9, + "grad_norm": 4.998128414154053, + "learning_rate": 8.466934673366834e-06, + "loss": 0.8653, + "step": 57900 + }, + { + "epoch": 2.9, + "grad_norm": 5.756192207336426, + "learning_rate": 8.446834170854272e-06, + "loss": 0.8756, + "step": 58000 + }, + { + "epoch": 2.91, + "grad_norm": 5.486929893493652, + "learning_rate": 8.426733668341708e-06, + "loss": 0.9047, + "step": 58100 + }, + { + "epoch": 2.91, + "grad_norm": 4.589926242828369, + "learning_rate": 8.406633165829146e-06, + "loss": 0.8766, + "step": 58200 + }, + { + "epoch": 2.92, + "grad_norm": 4.535083293914795, + "learning_rate": 8.386532663316584e-06, + "loss": 0.8758, + "step": 58300 + }, + { + "epoch": 2.92, + "grad_norm": 3.2254798412323, + "learning_rate": 8.36643216080402e-06, + "loss": 0.8813, + "step": 58400 + }, + { + "epoch": 2.92, + "grad_norm": 6.055229187011719, + "learning_rate": 8.346331658291458e-06, + "loss": 0.8779, + "step": 58500 + }, + { + "epoch": 2.93, + "grad_norm": 4.221169471740723, + "learning_rate": 8.326231155778895e-06, + "loss": 0.9325, + "step": 58600 + }, + { + "epoch": 2.94, + "grad_norm": 5.035799026489258, + "learning_rate": 8.306130653266333e-06, + "loss": 0.8896, + "step": 58700 + }, + { + "epoch": 2.94, + "grad_norm": 6.551968574523926, + "learning_rate": 8.28603015075377e-06, + "loss": 0.8644, + "step": 58800 + }, + { + "epoch": 2.94, + "grad_norm": 4.297557353973389, + "learning_rate": 8.265929648241207e-06, + "loss": 0.8853, + "step": 58900 + }, + { + "epoch": 2.95, + "grad_norm": 6.603255271911621, + "learning_rate": 8.245829145728645e-06, + "loss": 0.9237, + "step": 59000 + }, + { + "epoch": 2.96, + "grad_norm": 6.272432804107666, + "learning_rate": 8.225728643216081e-06, + "loss": 0.8708, + "step": 59100 + }, + { + "epoch": 2.96, + "grad_norm": 5.919680595397949, + "learning_rate": 8.20562814070352e-06, + "loss": 0.8525, + "step": 59200 + }, + { + "epoch": 2.96, + "grad_norm": 4.834166049957275, + "learning_rate": 8.185527638190955e-06, + "loss": 0.8576, + "step": 59300 + }, + { + "epoch": 2.97, + "grad_norm": 5.948410987854004, + "learning_rate": 8.165427135678393e-06, + "loss": 0.9017, + "step": 59400 + }, + { + "epoch": 2.98, + "grad_norm": 7.001020431518555, + "learning_rate": 8.14532663316583e-06, + "loss": 0.891, + "step": 59500 + }, + { + "epoch": 2.98, + "grad_norm": 5.623896598815918, + "learning_rate": 8.125226130653266e-06, + "loss": 0.8255, + "step": 59600 + }, + { + "epoch": 2.98, + "grad_norm": 5.0935468673706055, + "learning_rate": 8.105125628140704e-06, + "loss": 0.8709, + "step": 59700 + }, + { + "epoch": 2.99, + "grad_norm": 6.403896808624268, + "learning_rate": 8.08502512562814e-06, + "loss": 0.8957, + "step": 59800 + }, + { + "epoch": 3.0, + "grad_norm": 5.92683744430542, + "learning_rate": 8.064924623115578e-06, + "loss": 0.9102, + "step": 59900 + }, + { + "epoch": 3.0, + "grad_norm": 3.4657108783721924, + "learning_rate": 8.04502512562814e-06, + "loss": 0.916, + "step": 60000 + }, + { + "epoch": 3.0, + "eval_loss": 0.9292559623718262, + "eval_runtime": 21.5879, + "eval_samples_per_second": 46.322, + "eval_steps_per_second": 5.79, + "step": 60000 + }, + { + "epoch": 3.0, + "grad_norm": 5.614874839782715, + "learning_rate": 8.024924623115579e-06, + "loss": 0.8151, + "step": 60100 + }, + { + "epoch": 3.01, + "grad_norm": 6.177361011505127, + "learning_rate": 8.004824120603015e-06, + "loss": 0.8266, + "step": 60200 + }, + { + "epoch": 3.02, + "grad_norm": 5.4862213134765625, + "learning_rate": 7.984723618090453e-06, + "loss": 0.7741, + "step": 60300 + }, + { + "epoch": 3.02, + "grad_norm": 6.674380779266357, + "learning_rate": 7.964623115577889e-06, + "loss": 0.8134, + "step": 60400 + }, + { + "epoch": 3.02, + "grad_norm": 6.712404251098633, + "learning_rate": 7.944522613065327e-06, + "loss": 0.8332, + "step": 60500 + }, + { + "epoch": 3.03, + "grad_norm": 4.442228317260742, + "learning_rate": 7.924422110552763e-06, + "loss": 0.832, + "step": 60600 + }, + { + "epoch": 3.04, + "grad_norm": 5.503748416900635, + "learning_rate": 7.904321608040201e-06, + "loss": 0.844, + "step": 60700 + }, + { + "epoch": 3.04, + "grad_norm": 4.290737628936768, + "learning_rate": 7.88422110552764e-06, + "loss": 0.8593, + "step": 60800 + }, + { + "epoch": 3.04, + "grad_norm": 4.687915802001953, + "learning_rate": 7.864120603015076e-06, + "loss": 0.8506, + "step": 60900 + }, + { + "epoch": 3.05, + "grad_norm": 5.838376998901367, + "learning_rate": 7.844020100502514e-06, + "loss": 0.8297, + "step": 61000 + }, + { + "epoch": 3.06, + "grad_norm": 7.26198148727417, + "learning_rate": 7.823919597989952e-06, + "loss": 0.8463, + "step": 61100 + }, + { + "epoch": 3.06, + "grad_norm": 5.693443298339844, + "learning_rate": 7.803819095477388e-06, + "loss": 0.8405, + "step": 61200 + }, + { + "epoch": 3.06, + "grad_norm": 5.379219055175781, + "learning_rate": 7.783718592964826e-06, + "loss": 0.8431, + "step": 61300 + }, + { + "epoch": 3.07, + "grad_norm": 5.703670501708984, + "learning_rate": 7.763618090452262e-06, + "loss": 0.8484, + "step": 61400 + }, + { + "epoch": 3.08, + "grad_norm": 5.679072380065918, + "learning_rate": 7.7435175879397e-06, + "loss": 0.8272, + "step": 61500 + }, + { + "epoch": 3.08, + "grad_norm": 4.1109113693237305, + "learning_rate": 7.723417085427136e-06, + "loss": 0.83, + "step": 61600 + }, + { + "epoch": 3.08, + "grad_norm": 5.94366979598999, + "learning_rate": 7.703316582914574e-06, + "loss": 0.8102, + "step": 61700 + }, + { + "epoch": 3.09, + "grad_norm": 8.418631553649902, + "learning_rate": 7.68321608040201e-06, + "loss": 0.8396, + "step": 61800 + }, + { + "epoch": 3.1, + "grad_norm": 5.8859100341796875, + "learning_rate": 7.663115577889449e-06, + "loss": 0.8142, + "step": 61900 + }, + { + "epoch": 3.1, + "grad_norm": 5.267168045043945, + "learning_rate": 7.643015075376885e-06, + "loss": 0.8087, + "step": 62000 + }, + { + "epoch": 3.1, + "grad_norm": 5.58022403717041, + "learning_rate": 7.622914572864322e-06, + "loss": 0.8407, + "step": 62100 + }, + { + "epoch": 3.11, + "grad_norm": 5.999646186828613, + "learning_rate": 7.602814070351759e-06, + "loss": 0.8214, + "step": 62200 + }, + { + "epoch": 3.12, + "grad_norm": 4.449764251708984, + "learning_rate": 7.582713567839196e-06, + "loss": 0.8562, + "step": 62300 + }, + { + "epoch": 3.12, + "grad_norm": 6.2914137840271, + "learning_rate": 7.562613065326634e-06, + "loss": 0.8359, + "step": 62400 + }, + { + "epoch": 3.12, + "grad_norm": 5.262882709503174, + "learning_rate": 7.5425125628140705e-06, + "loss": 0.8368, + "step": 62500 + }, + { + "epoch": 3.13, + "grad_norm": 4.981582164764404, + "learning_rate": 7.5224120603015085e-06, + "loss": 0.8138, + "step": 62600 + }, + { + "epoch": 3.13, + "grad_norm": 5.330999374389648, + "learning_rate": 7.502311557788945e-06, + "loss": 0.8292, + "step": 62700 + }, + { + "epoch": 3.14, + "grad_norm": 5.176852226257324, + "learning_rate": 7.482211055276383e-06, + "loss": 0.8108, + "step": 62800 + }, + { + "epoch": 3.15, + "grad_norm": 8.819506645202637, + "learning_rate": 7.462110552763819e-06, + "loss": 0.854, + "step": 62900 + }, + { + "epoch": 3.15, + "grad_norm": 5.1937642097473145, + "learning_rate": 7.442010050251257e-06, + "loss": 0.7943, + "step": 63000 + }, + { + "epoch": 3.15, + "grad_norm": 4.399514675140381, + "learning_rate": 7.421909547738694e-06, + "loss": 0.7815, + "step": 63100 + }, + { + "epoch": 3.16, + "grad_norm": 5.575798034667969, + "learning_rate": 7.402010050251257e-06, + "loss": 0.8481, + "step": 63200 + }, + { + "epoch": 3.17, + "grad_norm": 5.097688674926758, + "learning_rate": 7.381909547738694e-06, + "loss": 0.8412, + "step": 63300 + }, + { + "epoch": 3.17, + "grad_norm": 4.748641490936279, + "learning_rate": 7.361809045226132e-06, + "loss": 0.8058, + "step": 63400 + }, + { + "epoch": 3.17, + "grad_norm": 7.151881694793701, + "learning_rate": 7.341708542713568e-06, + "loss": 0.7944, + "step": 63500 + }, + { + "epoch": 3.18, + "grad_norm": 4.642664909362793, + "learning_rate": 7.321608040201006e-06, + "loss": 0.8185, + "step": 63600 + }, + { + "epoch": 3.19, + "grad_norm": 5.354043483734131, + "learning_rate": 7.301507537688442e-06, + "loss": 0.7833, + "step": 63700 + }, + { + "epoch": 3.19, + "grad_norm": 5.168720245361328, + "learning_rate": 7.28140703517588e-06, + "loss": 0.7966, + "step": 63800 + }, + { + "epoch": 3.19, + "grad_norm": 4.343645095825195, + "learning_rate": 7.261306532663317e-06, + "loss": 0.7851, + "step": 63900 + }, + { + "epoch": 3.2, + "grad_norm": 4.882009506225586, + "learning_rate": 7.241206030150754e-06, + "loss": 0.8069, + "step": 64000 + }, + { + "epoch": 3.21, + "grad_norm": 4.954422473907471, + "learning_rate": 7.2211055276381915e-06, + "loss": 0.8029, + "step": 64100 + }, + { + "epoch": 3.21, + "grad_norm": 3.5329108238220215, + "learning_rate": 7.2010050251256295e-06, + "loss": 0.8262, + "step": 64200 + }, + { + "epoch": 3.21, + "grad_norm": 4.995691776275635, + "learning_rate": 7.180904522613066e-06, + "loss": 0.7899, + "step": 64300 + }, + { + "epoch": 3.22, + "grad_norm": 4.367786884307861, + "learning_rate": 7.160804020100504e-06, + "loss": 0.8014, + "step": 64400 + }, + { + "epoch": 3.23, + "grad_norm": 3.8841774463653564, + "learning_rate": 7.14070351758794e-06, + "loss": 0.8207, + "step": 64500 + }, + { + "epoch": 3.23, + "grad_norm": 4.118581295013428, + "learning_rate": 7.120603015075378e-06, + "loss": 0.8399, + "step": 64600 + }, + { + "epoch": 3.23, + "grad_norm": 5.320229530334473, + "learning_rate": 7.100502512562814e-06, + "loss": 0.8407, + "step": 64700 + }, + { + "epoch": 3.24, + "grad_norm": 4.324894428253174, + "learning_rate": 7.080402010050251e-06, + "loss": 0.7897, + "step": 64800 + }, + { + "epoch": 3.25, + "grad_norm": 6.917771816253662, + "learning_rate": 7.060301507537689e-06, + "loss": 0.8019, + "step": 64900 + }, + { + "epoch": 3.25, + "grad_norm": 7.098691463470459, + "learning_rate": 7.040201005025126e-06, + "loss": 0.8058, + "step": 65000 + }, + { + "epoch": 3.25, + "grad_norm": 5.166707992553711, + "learning_rate": 7.020100502512564e-06, + "loss": 0.7839, + "step": 65100 + }, + { + "epoch": 3.26, + "grad_norm": 5.616134166717529, + "learning_rate": 7e-06, + "loss": 0.7821, + "step": 65200 + }, + { + "epoch": 3.27, + "grad_norm": 7.216468334197998, + "learning_rate": 6.979899497487438e-06, + "loss": 0.7974, + "step": 65300 + }, + { + "epoch": 3.27, + "grad_norm": 7.116774082183838, + "learning_rate": 6.959798994974874e-06, + "loss": 0.8446, + "step": 65400 + }, + { + "epoch": 3.27, + "grad_norm": 6.275495529174805, + "learning_rate": 6.939698492462312e-06, + "loss": 0.8185, + "step": 65500 + }, + { + "epoch": 3.28, + "grad_norm": 4.431950092315674, + "learning_rate": 6.919597989949749e-06, + "loss": 0.8203, + "step": 65600 + }, + { + "epoch": 3.29, + "grad_norm": 6.8355302810668945, + "learning_rate": 6.899497487437186e-06, + "loss": 0.789, + "step": 65700 + }, + { + "epoch": 3.29, + "grad_norm": 4.217498779296875, + "learning_rate": 6.8793969849246235e-06, + "loss": 0.7909, + "step": 65800 + }, + { + "epoch": 3.29, + "grad_norm": 9.218932151794434, + "learning_rate": 6.859899497487438e-06, + "loss": 0.8387, + "step": 65900 + }, + { + "epoch": 3.3, + "grad_norm": 5.607006072998047, + "learning_rate": 6.8397989949748745e-06, + "loss": 0.787, + "step": 66000 + }, + { + "epoch": 3.31, + "grad_norm": 5.220907688140869, + "learning_rate": 6.8196984924623124e-06, + "loss": 0.8274, + "step": 66100 + }, + { + "epoch": 3.31, + "grad_norm": 4.795065402984619, + "learning_rate": 6.799597989949749e-06, + "loss": 0.7833, + "step": 66200 + }, + { + "epoch": 3.31, + "grad_norm": 5.653503894805908, + "learning_rate": 6.779497487437187e-06, + "loss": 0.7919, + "step": 66300 + }, + { + "epoch": 3.32, + "grad_norm": 5.359546184539795, + "learning_rate": 6.759396984924623e-06, + "loss": 0.8, + "step": 66400 + }, + { + "epoch": 3.33, + "grad_norm": 3.9278500080108643, + "learning_rate": 6.739296482412061e-06, + "loss": 0.816, + "step": 66500 + }, + { + "epoch": 3.33, + "grad_norm": 6.889082908630371, + "learning_rate": 6.719195979899498e-06, + "loss": 0.8559, + "step": 66600 + }, + { + "epoch": 3.33, + "grad_norm": 6.555418491363525, + "learning_rate": 6.699095477386935e-06, + "loss": 0.8084, + "step": 66700 + }, + { + "epoch": 3.34, + "grad_norm": 5.0188798904418945, + "learning_rate": 6.678994974874372e-06, + "loss": 0.8199, + "step": 66800 + }, + { + "epoch": 3.34, + "grad_norm": 5.341757297515869, + "learning_rate": 6.6588944723618094e-06, + "loss": 0.8301, + "step": 66900 + }, + { + "epoch": 3.35, + "grad_norm": 7.638245105743408, + "learning_rate": 6.6387939698492466e-06, + "loss": 0.8156, + "step": 67000 + }, + { + "epoch": 3.35, + "grad_norm": 4.004561424255371, + "learning_rate": 6.6186934673366845e-06, + "loss": 0.779, + "step": 67100 + }, + { + "epoch": 3.36, + "grad_norm": 5.197673320770264, + "learning_rate": 6.598592964824121e-06, + "loss": 0.8086, + "step": 67200 + }, + { + "epoch": 3.37, + "grad_norm": 5.757644176483154, + "learning_rate": 6.578492462311559e-06, + "loss": 0.8609, + "step": 67300 + }, + { + "epoch": 3.37, + "grad_norm": 3.9802143573760986, + "learning_rate": 6.558391959798995e-06, + "loss": 0.814, + "step": 67400 + }, + { + "epoch": 3.38, + "grad_norm": 4.6707892417907715, + "learning_rate": 6.538291457286433e-06, + "loss": 0.809, + "step": 67500 + }, + { + "epoch": 3.38, + "grad_norm": 6.902073860168457, + "learning_rate": 6.51819095477387e-06, + "loss": 0.7862, + "step": 67600 + }, + { + "epoch": 3.38, + "grad_norm": 4.793231010437012, + "learning_rate": 6.498090452261307e-06, + "loss": 0.8139, + "step": 67700 + }, + { + "epoch": 3.39, + "grad_norm": 10.16287612915039, + "learning_rate": 6.4779899497487444e-06, + "loss": 0.78, + "step": 67800 + }, + { + "epoch": 3.4, + "grad_norm": 5.308049201965332, + "learning_rate": 6.4578894472361816e-06, + "loss": 0.8235, + "step": 67900 + }, + { + "epoch": 3.4, + "grad_norm": 5.0899271965026855, + "learning_rate": 6.437788944723619e-06, + "loss": 0.8222, + "step": 68000 + }, + { + "epoch": 3.41, + "grad_norm": 5.174381732940674, + "learning_rate": 6.417688442211055e-06, + "loss": 0.7985, + "step": 68100 + }, + { + "epoch": 3.41, + "grad_norm": 4.858529090881348, + "learning_rate": 6.397587939698493e-06, + "loss": 0.8224, + "step": 68200 + }, + { + "epoch": 3.42, + "grad_norm": 8.091994285583496, + "learning_rate": 6.37748743718593e-06, + "loss": 0.8078, + "step": 68300 + }, + { + "epoch": 3.42, + "grad_norm": 5.269526481628418, + "learning_rate": 6.357386934673367e-06, + "loss": 0.8006, + "step": 68400 + }, + { + "epoch": 3.42, + "grad_norm": 5.161372184753418, + "learning_rate": 6.337286432160804e-06, + "loss": 0.814, + "step": 68500 + }, + { + "epoch": 3.43, + "grad_norm": 4.547713279724121, + "learning_rate": 6.3171859296482415e-06, + "loss": 0.8024, + "step": 68600 + }, + { + "epoch": 3.44, + "grad_norm": 5.171160697937012, + "learning_rate": 6.297085427135679e-06, + "loss": 0.7936, + "step": 68700 + }, + { + "epoch": 3.44, + "grad_norm": 6.406951904296875, + "learning_rate": 6.2769849246231166e-06, + "loss": 0.7627, + "step": 68800 + }, + { + "epoch": 3.44, + "grad_norm": 6.404531955718994, + "learning_rate": 6.256884422110553e-06, + "loss": 0.8081, + "step": 68900 + }, + { + "epoch": 3.45, + "grad_norm": 4.409193992614746, + "learning_rate": 6.236783919597991e-06, + "loss": 0.8284, + "step": 69000 + }, + { + "epoch": 3.46, + "grad_norm": 6.643680572509766, + "learning_rate": 6.216683417085427e-06, + "loss": 0.7908, + "step": 69100 + }, + { + "epoch": 3.46, + "grad_norm": 6.344150543212891, + "learning_rate": 6.196582914572865e-06, + "loss": 0.8028, + "step": 69200 + }, + { + "epoch": 3.46, + "grad_norm": 4.83349609375, + "learning_rate": 6.176482412060301e-06, + "loss": 0.7946, + "step": 69300 + }, + { + "epoch": 3.47, + "grad_norm": 4.103985786437988, + "learning_rate": 6.156381909547739e-06, + "loss": 0.8089, + "step": 69400 + }, + { + "epoch": 3.48, + "grad_norm": 4.681515693664551, + "learning_rate": 6.1362814070351764e-06, + "loss": 0.8104, + "step": 69500 + }, + { + "epoch": 3.48, + "grad_norm": 3.5046350955963135, + "learning_rate": 6.1161809045226136e-06, + "loss": 0.8024, + "step": 69600 + }, + { + "epoch": 3.48, + "grad_norm": 5.06920051574707, + "learning_rate": 6.096080402010051e-06, + "loss": 0.8043, + "step": 69700 + }, + { + "epoch": 3.49, + "grad_norm": 6.419402599334717, + "learning_rate": 6.075979899497489e-06, + "loss": 0.8218, + "step": 69800 + }, + { + "epoch": 3.5, + "grad_norm": 4.9620184898376465, + "learning_rate": 6.055879396984925e-06, + "loss": 0.7904, + "step": 69900 + }, + { + "epoch": 3.5, + "grad_norm": 6.6012349128723145, + "learning_rate": 6.035778894472363e-06, + "loss": 0.8021, + "step": 70000 + }, + { + "epoch": 3.5, + "eval_loss": 0.913910448551178, + "eval_runtime": 21.5858, + "eval_samples_per_second": 46.327, + "eval_steps_per_second": 5.791, + "step": 70000 + }, + { + "epoch": 3.5, + "grad_norm": 8.278429985046387, + "learning_rate": 6.015678391959799e-06, + "loss": 0.8255, + "step": 70100 + }, + { + "epoch": 3.51, + "grad_norm": 5.309919834136963, + "learning_rate": 5.995577889447237e-06, + "loss": 0.8048, + "step": 70200 + }, + { + "epoch": 3.52, + "grad_norm": 5.3151535987854, + "learning_rate": 5.9754773869346735e-06, + "loss": 0.796, + "step": 70300 + }, + { + "epoch": 3.52, + "grad_norm": 6.962722301483154, + "learning_rate": 5.9553768844221114e-06, + "loss": 0.8448, + "step": 70400 + }, + { + "epoch": 3.52, + "grad_norm": 6.564899444580078, + "learning_rate": 5.9352763819095486e-06, + "loss": 0.7782, + "step": 70500 + }, + { + "epoch": 3.53, + "grad_norm": 4.522327423095703, + "learning_rate": 5.915175879396985e-06, + "loss": 0.8306, + "step": 70600 + }, + { + "epoch": 3.54, + "grad_norm": 4.783290863037109, + "learning_rate": 5.895075376884423e-06, + "loss": 0.8448, + "step": 70700 + }, + { + "epoch": 3.54, + "grad_norm": 8.016778945922852, + "learning_rate": 5.874974874371859e-06, + "loss": 0.805, + "step": 70800 + }, + { + "epoch": 3.54, + "grad_norm": 6.962314605712891, + "learning_rate": 5.854874371859297e-06, + "loss": 0.7802, + "step": 70900 + }, + { + "epoch": 3.55, + "grad_norm": 4.056068420410156, + "learning_rate": 5.834773869346733e-06, + "loss": 0.8146, + "step": 71000 + }, + { + "epoch": 3.56, + "grad_norm": 4.548468589782715, + "learning_rate": 5.814673366834171e-06, + "loss": 0.7631, + "step": 71100 + }, + { + "epoch": 3.56, + "grad_norm": 4.344750881195068, + "learning_rate": 5.7945728643216085e-06, + "loss": 0.8032, + "step": 71200 + }, + { + "epoch": 3.56, + "grad_norm": 6.746843338012695, + "learning_rate": 5.774472361809046e-06, + "loss": 0.7622, + "step": 71300 + }, + { + "epoch": 3.57, + "grad_norm": 5.048290729522705, + "learning_rate": 5.754371859296483e-06, + "loss": 0.8133, + "step": 71400 + }, + { + "epoch": 3.58, + "grad_norm": 5.74857759475708, + "learning_rate": 5.734271356783921e-06, + "loss": 0.7834, + "step": 71500 + }, + { + "epoch": 3.58, + "grad_norm": 4.5277934074401855, + "learning_rate": 5.714170854271357e-06, + "loss": 0.789, + "step": 71600 + }, + { + "epoch": 3.58, + "grad_norm": 8.23270034790039, + "learning_rate": 5.694070351758795e-06, + "loss": 0.7613, + "step": 71700 + }, + { + "epoch": 3.59, + "grad_norm": 3.9528987407684326, + "learning_rate": 5.673969849246231e-06, + "loss": 0.8081, + "step": 71800 + }, + { + "epoch": 3.59, + "grad_norm": 5.704257965087891, + "learning_rate": 5.653869346733669e-06, + "loss": 0.8164, + "step": 71900 + }, + { + "epoch": 3.6, + "grad_norm": 4.676042079925537, + "learning_rate": 5.6337688442211055e-06, + "loss": 0.8202, + "step": 72000 + }, + { + "epoch": 3.6, + "grad_norm": 5.20451021194458, + "learning_rate": 5.6136683417085434e-06, + "loss": 0.7953, + "step": 72100 + }, + { + "epoch": 3.61, + "grad_norm": 7.501960277557373, + "learning_rate": 5.5935678391959806e-06, + "loss": 0.8168, + "step": 72200 + }, + { + "epoch": 3.62, + "grad_norm": 7.015203475952148, + "learning_rate": 5.573467336683418e-06, + "loss": 0.789, + "step": 72300 + }, + { + "epoch": 3.62, + "grad_norm": 4.428484916687012, + "learning_rate": 5.553366834170855e-06, + "loss": 0.8092, + "step": 72400 + }, + { + "epoch": 3.62, + "grad_norm": 4.477147102355957, + "learning_rate": 5.533266331658293e-06, + "loss": 0.7843, + "step": 72500 + }, + { + "epoch": 3.63, + "grad_norm": 5.1699748039245605, + "learning_rate": 5.513165829145729e-06, + "loss": 0.7996, + "step": 72600 + }, + { + "epoch": 3.63, + "grad_norm": 5.133453369140625, + "learning_rate": 5.493065326633167e-06, + "loss": 0.8233, + "step": 72700 + }, + { + "epoch": 3.64, + "grad_norm": 4.902942657470703, + "learning_rate": 5.472964824120603e-06, + "loss": 0.7586, + "step": 72800 + }, + { + "epoch": 3.65, + "grad_norm": 6.46637487411499, + "learning_rate": 5.4528643216080405e-06, + "loss": 0.7959, + "step": 72900 + }, + { + "epoch": 3.65, + "grad_norm": 7.144857406616211, + "learning_rate": 5.432763819095478e-06, + "loss": 0.8197, + "step": 73000 + }, + { + "epoch": 3.66, + "grad_norm": 6.084510326385498, + "learning_rate": 5.412663316582915e-06, + "loss": 0.8133, + "step": 73100 + }, + { + "epoch": 3.66, + "grad_norm": 5.132942199707031, + "learning_rate": 5.392562814070353e-06, + "loss": 0.7482, + "step": 73200 + }, + { + "epoch": 3.67, + "grad_norm": 6.69909143447876, + "learning_rate": 5.372462311557789e-06, + "loss": 0.7498, + "step": 73300 + }, + { + "epoch": 3.67, + "grad_norm": 7.99722146987915, + "learning_rate": 5.352361809045227e-06, + "loss": 0.7857, + "step": 73400 + }, + { + "epoch": 3.67, + "grad_norm": 7.380476951599121, + "learning_rate": 5.332261306532663e-06, + "loss": 0.8081, + "step": 73500 + }, + { + "epoch": 3.68, + "grad_norm": 6.441634178161621, + "learning_rate": 5.312160804020101e-06, + "loss": 0.7737, + "step": 73600 + }, + { + "epoch": 3.69, + "grad_norm": 5.027355194091797, + "learning_rate": 5.2920603015075375e-06, + "loss": 0.7991, + "step": 73700 + }, + { + "epoch": 3.69, + "grad_norm": 8.128876686096191, + "learning_rate": 5.2719597989949755e-06, + "loss": 0.8271, + "step": 73800 + }, + { + "epoch": 3.69, + "grad_norm": 4.09487247467041, + "learning_rate": 5.251859296482413e-06, + "loss": 0.775, + "step": 73900 + }, + { + "epoch": 3.7, + "grad_norm": 6.368048667907715, + "learning_rate": 5.231959798994976e-06, + "loss": 0.7872, + "step": 74000 + }, + { + "epoch": 3.71, + "grad_norm": 4.72104549407959, + "learning_rate": 5.211859296482412e-06, + "loss": 0.8057, + "step": 74100 + }, + { + "epoch": 3.71, + "grad_norm": 5.083056926727295, + "learning_rate": 5.19175879396985e-06, + "loss": 0.7839, + "step": 74200 + }, + { + "epoch": 3.71, + "grad_norm": 5.289855003356934, + "learning_rate": 5.171658291457286e-06, + "loss": 0.7829, + "step": 74300 + }, + { + "epoch": 3.72, + "grad_norm": 5.842662811279297, + "learning_rate": 5.151557788944724e-06, + "loss": 0.7782, + "step": 74400 + }, + { + "epoch": 3.73, + "grad_norm": 6.445068836212158, + "learning_rate": 5.131457286432161e-06, + "loss": 0.8335, + "step": 74500 + }, + { + "epoch": 3.73, + "grad_norm": 4.2318220138549805, + "learning_rate": 5.111356783919599e-06, + "loss": 0.7942, + "step": 74600 + }, + { + "epoch": 3.73, + "grad_norm": 8.975232124328613, + "learning_rate": 5.091256281407036e-06, + "loss": 0.8284, + "step": 74700 + }, + { + "epoch": 3.74, + "grad_norm": 4.482039451599121, + "learning_rate": 5.071155778894473e-06, + "loss": 0.8281, + "step": 74800 + }, + { + "epoch": 3.75, + "grad_norm": 4.330044269561768, + "learning_rate": 5.05105527638191e-06, + "loss": 0.7737, + "step": 74900 + }, + { + "epoch": 3.75, + "grad_norm": 4.636693000793457, + "learning_rate": 5.030954773869348e-06, + "loss": 0.7882, + "step": 75000 + }, + { + "epoch": 3.75, + "grad_norm": 4.175960540771484, + "learning_rate": 5.010854271356784e-06, + "loss": 0.7417, + "step": 75100 + }, + { + "epoch": 3.76, + "grad_norm": 4.081864833831787, + "learning_rate": 4.990753768844221e-06, + "loss": 0.7579, + "step": 75200 + }, + { + "epoch": 3.77, + "grad_norm": 4.608290672302246, + "learning_rate": 4.9706532663316585e-06, + "loss": 0.799, + "step": 75300 + }, + { + "epoch": 3.77, + "grad_norm": 4.851296901702881, + "learning_rate": 4.950552763819096e-06, + "loss": 0.7998, + "step": 75400 + }, + { + "epoch": 3.77, + "grad_norm": 4.3285112380981445, + "learning_rate": 4.930452261306533e-06, + "loss": 0.8093, + "step": 75500 + }, + { + "epoch": 3.78, + "grad_norm": 4.927236080169678, + "learning_rate": 4.910552763819096e-06, + "loss": 0.7793, + "step": 75600 + }, + { + "epoch": 3.79, + "grad_norm": 6.193936824798584, + "learning_rate": 4.890452261306533e-06, + "loss": 0.8072, + "step": 75700 + }, + { + "epoch": 3.79, + "grad_norm": 4.687440872192383, + "learning_rate": 4.87035175879397e-06, + "loss": 0.8, + "step": 75800 + }, + { + "epoch": 3.79, + "grad_norm": 4.473381519317627, + "learning_rate": 4.850251256281407e-06, + "loss": 0.8027, + "step": 75900 + }, + { + "epoch": 3.8, + "grad_norm": 4.676540374755859, + "learning_rate": 4.8301507537688445e-06, + "loss": 0.8029, + "step": 76000 + }, + { + "epoch": 3.81, + "grad_norm": 4.967388153076172, + "learning_rate": 4.810050251256282e-06, + "loss": 0.7539, + "step": 76100 + }, + { + "epoch": 3.81, + "grad_norm": 4.699183940887451, + "learning_rate": 4.789949748743719e-06, + "loss": 0.7651, + "step": 76200 + }, + { + "epoch": 3.81, + "grad_norm": 4.629420757293701, + "learning_rate": 4.769849246231156e-06, + "loss": 0.7803, + "step": 76300 + }, + { + "epoch": 3.82, + "grad_norm": 5.920188903808594, + "learning_rate": 4.749748743718594e-06, + "loss": 0.8017, + "step": 76400 + }, + { + "epoch": 3.83, + "grad_norm": 6.677817344665527, + "learning_rate": 4.729648241206031e-06, + "loss": 0.8216, + "step": 76500 + }, + { + "epoch": 3.83, + "grad_norm": 5.312260627746582, + "learning_rate": 4.709547738693468e-06, + "loss": 0.7827, + "step": 76600 + }, + { + "epoch": 3.83, + "grad_norm": 4.119052410125732, + "learning_rate": 4.689447236180905e-06, + "loss": 0.7483, + "step": 76700 + }, + { + "epoch": 3.84, + "grad_norm": 4.5976715087890625, + "learning_rate": 4.669346733668342e-06, + "loss": 0.7657, + "step": 76800 + }, + { + "epoch": 3.84, + "grad_norm": 5.721061706542969, + "learning_rate": 4.649246231155779e-06, + "loss": 0.7817, + "step": 76900 + }, + { + "epoch": 3.85, + "grad_norm": 7.369571208953857, + "learning_rate": 4.629145728643216e-06, + "loss": 0.7402, + "step": 77000 + }, + { + "epoch": 3.85, + "grad_norm": 5.615093231201172, + "learning_rate": 4.609045226130654e-06, + "loss": 0.811, + "step": 77100 + }, + { + "epoch": 3.86, + "grad_norm": 6.276815414428711, + "learning_rate": 4.588944723618091e-06, + "loss": 0.7909, + "step": 77200 + }, + { + "epoch": 3.87, + "grad_norm": 4.287708759307861, + "learning_rate": 4.568844221105528e-06, + "loss": 0.8012, + "step": 77300 + }, + { + "epoch": 3.87, + "grad_norm": 4.280378818511963, + "learning_rate": 4.548743718592965e-06, + "loss": 0.8205, + "step": 77400 + }, + { + "epoch": 3.88, + "grad_norm": 8.309846878051758, + "learning_rate": 4.528643216080402e-06, + "loss": 0.7785, + "step": 77500 + }, + { + "epoch": 3.88, + "grad_norm": 5.504384517669678, + "learning_rate": 4.508542713567839e-06, + "loss": 0.7678, + "step": 77600 + }, + { + "epoch": 3.88, + "grad_norm": 4.6738996505737305, + "learning_rate": 4.4884422110552765e-06, + "loss": 0.8207, + "step": 77700 + }, + { + "epoch": 3.89, + "grad_norm": 8.038127899169922, + "learning_rate": 4.468341708542714e-06, + "loss": 0.7788, + "step": 77800 + }, + { + "epoch": 3.9, + "grad_norm": 6.898759365081787, + "learning_rate": 4.448241206030151e-06, + "loss": 0.7575, + "step": 77900 + }, + { + "epoch": 3.9, + "grad_norm": 5.893388271331787, + "learning_rate": 4.428140703517588e-06, + "loss": 0.7842, + "step": 78000 + }, + { + "epoch": 3.91, + "grad_norm": 7.37433385848999, + "learning_rate": 4.408040201005026e-06, + "loss": 0.756, + "step": 78100 + }, + { + "epoch": 3.91, + "grad_norm": 6.226987838745117, + "learning_rate": 4.387939698492463e-06, + "loss": 0.7818, + "step": 78200 + }, + { + "epoch": 3.92, + "grad_norm": 6.20886754989624, + "learning_rate": 4.368040201005025e-06, + "loss": 0.8057, + "step": 78300 + }, + { + "epoch": 3.92, + "grad_norm": 3.9309849739074707, + "learning_rate": 4.3479396984924625e-06, + "loss": 0.8052, + "step": 78400 + }, + { + "epoch": 3.92, + "grad_norm": 4.972345352172852, + "learning_rate": 4.3278391959799e-06, + "loss": 0.7666, + "step": 78500 + }, + { + "epoch": 3.93, + "grad_norm": 8.730260848999023, + "learning_rate": 4.307738693467337e-06, + "loss": 0.7897, + "step": 78600 + }, + { + "epoch": 3.94, + "grad_norm": 6.734485626220703, + "learning_rate": 4.287638190954774e-06, + "loss": 0.7595, + "step": 78700 + }, + { + "epoch": 3.94, + "grad_norm": 6.456557750701904, + "learning_rate": 4.267537688442212e-06, + "loss": 0.7924, + "step": 78800 + }, + { + "epoch": 3.94, + "grad_norm": 4.421884059906006, + "learning_rate": 4.247437185929649e-06, + "loss": 0.7821, + "step": 78900 + }, + { + "epoch": 3.95, + "grad_norm": 7.825852394104004, + "learning_rate": 4.227336683417086e-06, + "loss": 0.7834, + "step": 79000 + }, + { + "epoch": 3.96, + "grad_norm": 6.445671081542969, + "learning_rate": 4.207236180904523e-06, + "loss": 0.7794, + "step": 79100 + }, + { + "epoch": 3.96, + "grad_norm": 3.7435953617095947, + "learning_rate": 4.18713567839196e-06, + "loss": 0.7218, + "step": 79200 + }, + { + "epoch": 3.96, + "grad_norm": 10.594905853271484, + "learning_rate": 4.1670351758793975e-06, + "loss": 0.7957, + "step": 79300 + }, + { + "epoch": 3.97, + "grad_norm": 7.166194438934326, + "learning_rate": 4.146934673366835e-06, + "loss": 0.7936, + "step": 79400 + }, + { + "epoch": 3.98, + "grad_norm": 4.773101329803467, + "learning_rate": 4.126834170854272e-06, + "loss": 0.7721, + "step": 79500 + }, + { + "epoch": 3.98, + "grad_norm": 5.979006767272949, + "learning_rate": 4.106733668341709e-06, + "loss": 0.7899, + "step": 79600 + }, + { + "epoch": 3.98, + "grad_norm": 6.46978235244751, + "learning_rate": 4.086633165829146e-06, + "loss": 0.7874, + "step": 79700 + }, + { + "epoch": 3.99, + "grad_norm": 5.1106977462768555, + "learning_rate": 4.066532663316583e-06, + "loss": 0.7644, + "step": 79800 + }, + { + "epoch": 4.0, + "grad_norm": 7.125823974609375, + "learning_rate": 4.0466331658291464e-06, + "loss": 0.792, + "step": 79900 + }, + { + "epoch": 4.0, + "grad_norm": 5.539035797119141, + "learning_rate": 4.026532663316583e-06, + "loss": 0.7779, + "step": 80000 + }, + { + "epoch": 4.0, + "eval_loss": 0.8846080303192139, + "eval_runtime": 21.6073, + "eval_samples_per_second": 46.281, + "eval_steps_per_second": 5.785, + "step": 80000 + }, + { + "epoch": 4.0, + "grad_norm": 5.7579193115234375, + "learning_rate": 4.00643216080402e-06, + "loss": 0.6947, + "step": 80100 + }, + { + "epoch": 4.01, + "grad_norm": 5.583180904388428, + "learning_rate": 3.986331658291458e-06, + "loss": 0.6614, + "step": 80200 + }, + { + "epoch": 4.01, + "grad_norm": 5.107233047485352, + "learning_rate": 3.966231155778895e-06, + "loss": 0.6936, + "step": 80300 + }, + { + "epoch": 4.02, + "grad_norm": 5.804276466369629, + "learning_rate": 3.946130653266332e-06, + "loss": 0.6946, + "step": 80400 + }, + { + "epoch": 4.03, + "grad_norm": 6.738204479217529, + "learning_rate": 3.926030150753769e-06, + "loss": 0.6681, + "step": 80500 + }, + { + "epoch": 4.03, + "grad_norm": 6.331192970275879, + "learning_rate": 3.905929648241206e-06, + "loss": 0.6839, + "step": 80600 + }, + { + "epoch": 4.04, + "grad_norm": 5.382104873657227, + "learning_rate": 3.8858291457286434e-06, + "loss": 0.6566, + "step": 80700 + }, + { + "epoch": 4.04, + "grad_norm": 6.394933223724365, + "learning_rate": 3.8657286432160806e-06, + "loss": 0.7378, + "step": 80800 + }, + { + "epoch": 4.04, + "grad_norm": 5.813870429992676, + "learning_rate": 3.845628140703518e-06, + "loss": 0.7112, + "step": 80900 + }, + { + "epoch": 4.05, + "grad_norm": 6.095046520233154, + "learning_rate": 3.825527638190955e-06, + "loss": 0.6885, + "step": 81000 + }, + { + "epoch": 4.05, + "grad_norm": 6.212576866149902, + "learning_rate": 3.8054271356783924e-06, + "loss": 0.6658, + "step": 81100 + }, + { + "epoch": 4.06, + "grad_norm": 4.426722526550293, + "learning_rate": 3.7853266331658295e-06, + "loss": 0.6915, + "step": 81200 + }, + { + "epoch": 4.07, + "grad_norm": 7.474303722381592, + "learning_rate": 3.7652261306532666e-06, + "loss": 0.6486, + "step": 81300 + }, + { + "epoch": 4.07, + "grad_norm": 7.347512245178223, + "learning_rate": 3.7451256281407038e-06, + "loss": 0.7078, + "step": 81400 + }, + { + "epoch": 4.08, + "grad_norm": 9.426233291625977, + "learning_rate": 3.7250251256281413e-06, + "loss": 0.6951, + "step": 81500 + }, + { + "epoch": 4.08, + "grad_norm": 5.577968597412109, + "learning_rate": 3.7049246231155784e-06, + "loss": 0.6905, + "step": 81600 + }, + { + "epoch": 4.08, + "grad_norm": 6.477217197418213, + "learning_rate": 3.6848241206030156e-06, + "loss": 0.663, + "step": 81700 + }, + { + "epoch": 4.09, + "grad_norm": 6.228948593139648, + "learning_rate": 3.6647236180904527e-06, + "loss": 0.6677, + "step": 81800 + }, + { + "epoch": 4.09, + "grad_norm": 5.777594089508057, + "learning_rate": 3.64462311557789e-06, + "loss": 0.6905, + "step": 81900 + }, + { + "epoch": 4.1, + "grad_norm": 6.7552080154418945, + "learning_rate": 3.624522613065327e-06, + "loss": 0.7086, + "step": 82000 + }, + { + "epoch": 4.11, + "grad_norm": 5.3912553787231445, + "learning_rate": 3.6044221105527645e-06, + "loss": 0.6833, + "step": 82100 + }, + { + "epoch": 4.11, + "grad_norm": 7.366456508636475, + "learning_rate": 3.5843216080402016e-06, + "loss": 0.6618, + "step": 82200 + }, + { + "epoch": 4.12, + "grad_norm": 4.593729019165039, + "learning_rate": 3.5642211055276383e-06, + "loss": 0.6397, + "step": 82300 + }, + { + "epoch": 4.12, + "grad_norm": 6.743685722351074, + "learning_rate": 3.5441206030150755e-06, + "loss": 0.7233, + "step": 82400 + }, + { + "epoch": 4.12, + "grad_norm": 6.125808238983154, + "learning_rate": 3.5240201005025126e-06, + "loss": 0.6804, + "step": 82500 + }, + { + "epoch": 4.13, + "grad_norm": 7.0340752601623535, + "learning_rate": 3.5039195979899497e-06, + "loss": 0.699, + "step": 82600 + }, + { + "epoch": 4.13, + "grad_norm": 7.293619632720947, + "learning_rate": 3.4838190954773873e-06, + "loss": 0.6572, + "step": 82700 + }, + { + "epoch": 4.14, + "grad_norm": 6.3135552406311035, + "learning_rate": 3.4637185929648244e-06, + "loss": 0.6364, + "step": 82800 + }, + { + "epoch": 4.14, + "grad_norm": 5.138033390045166, + "learning_rate": 3.4436180904522615e-06, + "loss": 0.6815, + "step": 82900 + }, + { + "epoch": 4.15, + "grad_norm": 6.240560054779053, + "learning_rate": 3.4235175879396986e-06, + "loss": 0.6919, + "step": 83000 + }, + { + "epoch": 4.16, + "grad_norm": 4.19957971572876, + "learning_rate": 3.4034170854271358e-06, + "loss": 0.6845, + "step": 83100 + }, + { + "epoch": 4.16, + "grad_norm": 6.340314865112305, + "learning_rate": 3.383316582914573e-06, + "loss": 0.653, + "step": 83200 + }, + { + "epoch": 4.17, + "grad_norm": 3.309894323348999, + "learning_rate": 3.3632160804020104e-06, + "loss": 0.6612, + "step": 83300 + }, + { + "epoch": 4.17, + "grad_norm": 5.189826011657715, + "learning_rate": 3.3431155778894476e-06, + "loss": 0.6871, + "step": 83400 + }, + { + "epoch": 4.17, + "grad_norm": 6.599611759185791, + "learning_rate": 3.3230150753768847e-06, + "loss": 0.6743, + "step": 83500 + }, + { + "epoch": 4.18, + "grad_norm": 26.47356414794922, + "learning_rate": 3.302914572864322e-06, + "loss": 0.6312, + "step": 83600 + }, + { + "epoch": 4.18, + "grad_norm": 8.280220985412598, + "learning_rate": 3.282814070351759e-06, + "loss": 0.6276, + "step": 83700 + }, + { + "epoch": 4.19, + "grad_norm": 7.8088555335998535, + "learning_rate": 3.2627135678391965e-06, + "loss": 0.6514, + "step": 83800 + }, + { + "epoch": 4.2, + "grad_norm": 5.11159086227417, + "learning_rate": 3.2426130653266336e-06, + "loss": 0.6262, + "step": 83900 + }, + { + "epoch": 4.2, + "grad_norm": 6.656592845916748, + "learning_rate": 3.2225125628140708e-06, + "loss": 0.6889, + "step": 84000 + }, + { + "epoch": 4.21, + "grad_norm": 7.140279769897461, + "learning_rate": 3.202412060301508e-06, + "loss": 0.6435, + "step": 84100 + }, + { + "epoch": 4.21, + "grad_norm": 6.478577613830566, + "learning_rate": 3.182311557788945e-06, + "loss": 0.6593, + "step": 84200 + }, + { + "epoch": 4.21, + "grad_norm": 6.854846477508545, + "learning_rate": 3.1622110552763826e-06, + "loss": 0.7097, + "step": 84300 + }, + { + "epoch": 4.22, + "grad_norm": 5.070549488067627, + "learning_rate": 3.1421105527638197e-06, + "loss": 0.6736, + "step": 84400 + }, + { + "epoch": 4.22, + "grad_norm": 7.519010543823242, + "learning_rate": 3.122010050251257e-06, + "loss": 0.6518, + "step": 84500 + }, + { + "epoch": 4.23, + "grad_norm": 6.662156105041504, + "learning_rate": 3.1019095477386935e-06, + "loss": 0.675, + "step": 84600 + }, + { + "epoch": 4.24, + "grad_norm": 7.687413215637207, + "learning_rate": 3.0818090452261307e-06, + "loss": 0.6477, + "step": 84700 + }, + { + "epoch": 4.24, + "grad_norm": 5.934724807739258, + "learning_rate": 3.0617085427135678e-06, + "loss": 0.6492, + "step": 84800 + }, + { + "epoch": 4.25, + "grad_norm": 9.457836151123047, + "learning_rate": 3.041608040201005e-06, + "loss": 0.633, + "step": 84900 + }, + { + "epoch": 4.25, + "grad_norm": 6.666748523712158, + "learning_rate": 3.0215075376884425e-06, + "loss": 0.6693, + "step": 85000 + }, + { + "epoch": 4.25, + "grad_norm": 6.439404487609863, + "learning_rate": 3.0014070351758796e-06, + "loss": 0.6643, + "step": 85100 + }, + { + "epoch": 4.26, + "grad_norm": 7.257474422454834, + "learning_rate": 2.9813065326633167e-06, + "loss": 0.6623, + "step": 85200 + }, + { + "epoch": 4.26, + "grad_norm": 4.707270622253418, + "learning_rate": 2.961206030150754e-06, + "loss": 0.6471, + "step": 85300 + }, + { + "epoch": 4.27, + "grad_norm": 5.7160844802856445, + "learning_rate": 2.941105527638191e-06, + "loss": 0.683, + "step": 85400 + }, + { + "epoch": 4.28, + "grad_norm": 6.038240432739258, + "learning_rate": 2.9210050251256285e-06, + "loss": 0.6742, + "step": 85500 + }, + { + "epoch": 4.28, + "grad_norm": 6.851832866668701, + "learning_rate": 2.9009045226130656e-06, + "loss": 0.6748, + "step": 85600 + }, + { + "epoch": 4.29, + "grad_norm": 5.691901683807373, + "learning_rate": 2.8808040201005028e-06, + "loss": 0.6703, + "step": 85700 + }, + { + "epoch": 4.29, + "grad_norm": 6.378291130065918, + "learning_rate": 2.86070351758794e-06, + "loss": 0.6487, + "step": 85800 + }, + { + "epoch": 4.29, + "grad_norm": 4.439263343811035, + "learning_rate": 2.840603015075377e-06, + "loss": 0.6598, + "step": 85900 + }, + { + "epoch": 4.3, + "grad_norm": 6.466790199279785, + "learning_rate": 2.8205025125628146e-06, + "loss": 0.6914, + "step": 86000 + }, + { + "epoch": 4.3, + "grad_norm": 6.0331902503967285, + "learning_rate": 2.8004020100502517e-06, + "loss": 0.6929, + "step": 86100 + }, + { + "epoch": 4.31, + "grad_norm": 4.750064849853516, + "learning_rate": 2.780301507537689e-06, + "loss": 0.6715, + "step": 86200 + }, + { + "epoch": 4.32, + "grad_norm": 8.289958953857422, + "learning_rate": 2.760201005025126e-06, + "loss": 0.6975, + "step": 86300 + }, + { + "epoch": 4.32, + "grad_norm": 10.746756553649902, + "learning_rate": 2.740100502512563e-06, + "loss": 0.6454, + "step": 86400 + }, + { + "epoch": 4.33, + "grad_norm": 6.792548656463623, + "learning_rate": 2.720201005025126e-06, + "loss": 0.7056, + "step": 86500 + }, + { + "epoch": 4.33, + "grad_norm": 5.030031204223633, + "learning_rate": 2.700100502512563e-06, + "loss": 0.6711, + "step": 86600 + }, + { + "epoch": 4.33, + "grad_norm": 4.626148223876953, + "learning_rate": 2.680201005025126e-06, + "loss": 0.676, + "step": 86700 + }, + { + "epoch": 4.34, + "grad_norm": 8.56241512298584, + "learning_rate": 2.660100502512563e-06, + "loss": 0.6548, + "step": 86800 + }, + { + "epoch": 4.34, + "grad_norm": 9.747623443603516, + "learning_rate": 2.64e-06, + "loss": 0.6883, + "step": 86900 + }, + { + "epoch": 4.35, + "grad_norm": 8.002108573913574, + "learning_rate": 2.6198994974874377e-06, + "loss": 0.7166, + "step": 87000 + }, + { + "epoch": 4.36, + "grad_norm": 6.09249210357666, + "learning_rate": 2.599798994974875e-06, + "loss": 0.6841, + "step": 87100 + }, + { + "epoch": 4.36, + "grad_norm": 5.512220859527588, + "learning_rate": 2.579698492462312e-06, + "loss": 0.6816, + "step": 87200 + }, + { + "epoch": 4.37, + "grad_norm": 5.139577388763428, + "learning_rate": 2.559597989949749e-06, + "loss": 0.6475, + "step": 87300 + }, + { + "epoch": 4.37, + "grad_norm": 11.360005378723145, + "learning_rate": 2.539497487437186e-06, + "loss": 0.7434, + "step": 87400 + }, + { + "epoch": 4.38, + "grad_norm": 5.06545877456665, + "learning_rate": 2.5193969849246237e-06, + "loss": 0.6626, + "step": 87500 + }, + { + "epoch": 4.38, + "grad_norm": 4.432734966278076, + "learning_rate": 2.4992964824120604e-06, + "loss": 0.6357, + "step": 87600 + }, + { + "epoch": 4.38, + "grad_norm": 7.90862512588501, + "learning_rate": 2.4791959798994976e-06, + "loss": 0.6039, + "step": 87700 + }, + { + "epoch": 4.39, + "grad_norm": 4.959092617034912, + "learning_rate": 2.459095477386935e-06, + "loss": 0.6699, + "step": 87800 + }, + { + "epoch": 4.39, + "grad_norm": 7.495928764343262, + "learning_rate": 2.4389949748743723e-06, + "loss": 0.6648, + "step": 87900 + }, + { + "epoch": 4.4, + "grad_norm": 10.80557918548584, + "learning_rate": 2.4188944723618094e-06, + "loss": 0.6532, + "step": 88000 + }, + { + "epoch": 4.41, + "grad_norm": 7.1374006271362305, + "learning_rate": 2.3987939698492465e-06, + "loss": 0.6903, + "step": 88100 + }, + { + "epoch": 4.41, + "grad_norm": 12.275821685791016, + "learning_rate": 2.3786934673366836e-06, + "loss": 0.6433, + "step": 88200 + }, + { + "epoch": 4.42, + "grad_norm": 8.747936248779297, + "learning_rate": 2.3585929648241208e-06, + "loss": 0.62, + "step": 88300 + }, + { + "epoch": 4.42, + "grad_norm": 5.3552985191345215, + "learning_rate": 2.338492462311558e-06, + "loss": 0.6525, + "step": 88400 + }, + { + "epoch": 4.42, + "grad_norm": 7.049367427825928, + "learning_rate": 2.318391959798995e-06, + "loss": 0.6742, + "step": 88500 + }, + { + "epoch": 4.43, + "grad_norm": 8.841930389404297, + "learning_rate": 2.298291457286432e-06, + "loss": 0.6806, + "step": 88600 + }, + { + "epoch": 4.43, + "grad_norm": 4.58371114730835, + "learning_rate": 2.2781909547738697e-06, + "loss": 0.6469, + "step": 88700 + }, + { + "epoch": 4.44, + "grad_norm": 8.08278751373291, + "learning_rate": 2.258090452261307e-06, + "loss": 0.6918, + "step": 88800 + }, + { + "epoch": 4.45, + "grad_norm": 5.989361763000488, + "learning_rate": 2.237989949748744e-06, + "loss": 0.7048, + "step": 88900 + }, + { + "epoch": 4.45, + "grad_norm": 8.200750350952148, + "learning_rate": 2.217889447236181e-06, + "loss": 0.6222, + "step": 89000 + }, + { + "epoch": 4.46, + "grad_norm": 7.658218860626221, + "learning_rate": 2.197788944723618e-06, + "loss": 0.653, + "step": 89100 + }, + { + "epoch": 4.46, + "grad_norm": 6.744418621063232, + "learning_rate": 2.177889447236181e-06, + "loss": 0.6698, + "step": 89200 + }, + { + "epoch": 4.46, + "grad_norm": 4.423871994018555, + "learning_rate": 2.157788944723618e-06, + "loss": 0.6665, + "step": 89300 + }, + { + "epoch": 4.47, + "grad_norm": 7.368816375732422, + "learning_rate": 2.1376884422110557e-06, + "loss": 0.6766, + "step": 89400 + }, + { + "epoch": 4.47, + "grad_norm": 4.649584770202637, + "learning_rate": 2.117587939698493e-06, + "loss": 0.6464, + "step": 89500 + }, + { + "epoch": 4.48, + "grad_norm": 7.77773904800415, + "learning_rate": 2.09748743718593e-06, + "loss": 0.6721, + "step": 89600 + }, + { + "epoch": 4.49, + "grad_norm": 6.5589280128479, + "learning_rate": 2.0773869346733667e-06, + "loss": 0.6817, + "step": 89700 + }, + { + "epoch": 4.49, + "grad_norm": 10.153287887573242, + "learning_rate": 2.0572864321608042e-06, + "loss": 0.645, + "step": 89800 + }, + { + "epoch": 4.5, + "grad_norm": 8.705924987792969, + "learning_rate": 2.0371859296482414e-06, + "loss": 0.707, + "step": 89900 + }, + { + "epoch": 4.5, + "grad_norm": 5.7329511642456055, + "learning_rate": 2.0170854271356785e-06, + "loss": 0.6834, + "step": 90000 + }, + { + "epoch": 4.5, + "eval_loss": 0.9503761529922485, + "eval_runtime": 21.641, + "eval_samples_per_second": 46.209, + "eval_steps_per_second": 5.776, + "step": 90000 + }, + { + "epoch": 4.5, + "grad_norm": 6.902284622192383, + "learning_rate": 1.9969849246231156e-06, + "loss": 0.6237, + "step": 90100 + }, + { + "epoch": 4.51, + "grad_norm": 5.6710710525512695, + "learning_rate": 1.9768844221105527e-06, + "loss": 0.6638, + "step": 90200 + }, + { + "epoch": 4.51, + "grad_norm": 6.364370346069336, + "learning_rate": 1.9567839195979903e-06, + "loss": 0.6537, + "step": 90300 + }, + { + "epoch": 4.52, + "grad_norm": 5.928137302398682, + "learning_rate": 1.9366834170854274e-06, + "loss": 0.6266, + "step": 90400 + }, + { + "epoch": 4.53, + "grad_norm": 8.740313529968262, + "learning_rate": 1.9165829145728645e-06, + "loss": 0.6198, + "step": 90500 + }, + { + "epoch": 4.53, + "grad_norm": 8.339399337768555, + "learning_rate": 1.8964824120603017e-06, + "loss": 0.6482, + "step": 90600 + }, + { + "epoch": 4.54, + "grad_norm": 8.13129997253418, + "learning_rate": 1.876381909547739e-06, + "loss": 0.6521, + "step": 90700 + }, + { + "epoch": 4.54, + "grad_norm": 10.06900405883789, + "learning_rate": 1.856281407035176e-06, + "loss": 0.6472, + "step": 90800 + }, + { + "epoch": 4.54, + "grad_norm": 6.953003406524658, + "learning_rate": 1.836180904522613e-06, + "loss": 0.6185, + "step": 90900 + }, + { + "epoch": 4.55, + "grad_norm": 7.572219371795654, + "learning_rate": 1.8160804020100504e-06, + "loss": 0.664, + "step": 91000 + }, + { + "epoch": 4.55, + "grad_norm": 8.318469047546387, + "learning_rate": 1.7959798994974875e-06, + "loss": 0.6442, + "step": 91100 + }, + { + "epoch": 4.56, + "grad_norm": 6.608754634857178, + "learning_rate": 1.7758793969849246e-06, + "loss": 0.6398, + "step": 91200 + }, + { + "epoch": 4.56, + "grad_norm": 7.397676467895508, + "learning_rate": 1.755778894472362e-06, + "loss": 0.6689, + "step": 91300 + }, + { + "epoch": 4.57, + "grad_norm": 10.482325553894043, + "learning_rate": 1.7356783919597991e-06, + "loss": 0.6792, + "step": 91400 + }, + { + "epoch": 4.58, + "grad_norm": 5.926417827606201, + "learning_rate": 1.7155778894472364e-06, + "loss": 0.6774, + "step": 91500 + }, + { + "epoch": 4.58, + "grad_norm": 8.223274230957031, + "learning_rate": 1.6954773869346736e-06, + "loss": 0.6528, + "step": 91600 + }, + { + "epoch": 4.58, + "grad_norm": 7.564822196960449, + "learning_rate": 1.6753768844221107e-06, + "loss": 0.6224, + "step": 91700 + }, + { + "epoch": 4.59, + "grad_norm": 6.845765113830566, + "learning_rate": 1.655276381909548e-06, + "loss": 0.6984, + "step": 91800 + }, + { + "epoch": 4.59, + "grad_norm": 6.044042587280273, + "learning_rate": 1.6353768844221107e-06, + "loss": 0.6211, + "step": 91900 + }, + { + "epoch": 4.6, + "grad_norm": 12.825979232788086, + "learning_rate": 1.615276381909548e-06, + "loss": 0.6851, + "step": 92000 + }, + { + "epoch": 4.61, + "grad_norm": 6.73763370513916, + "learning_rate": 1.5951758793969851e-06, + "loss": 0.6161, + "step": 92100 + }, + { + "epoch": 4.61, + "grad_norm": 6.827399730682373, + "learning_rate": 1.5750753768844223e-06, + "loss": 0.6525, + "step": 92200 + }, + { + "epoch": 4.62, + "grad_norm": 6.6664228439331055, + "learning_rate": 1.5549748743718594e-06, + "loss": 0.6617, + "step": 92300 + }, + { + "epoch": 4.62, + "grad_norm": 9.772034645080566, + "learning_rate": 1.5348743718592965e-06, + "loss": 0.6687, + "step": 92400 + }, + { + "epoch": 4.62, + "grad_norm": 6.625182151794434, + "learning_rate": 1.5147738693467336e-06, + "loss": 0.6545, + "step": 92500 + }, + { + "epoch": 4.63, + "grad_norm": 10.207441329956055, + "learning_rate": 1.494673366834171e-06, + "loss": 0.6332, + "step": 92600 + }, + { + "epoch": 4.63, + "grad_norm": 9.929265975952148, + "learning_rate": 1.474572864321608e-06, + "loss": 0.6391, + "step": 92700 + }, + { + "epoch": 4.64, + "grad_norm": 6.050763130187988, + "learning_rate": 1.4544723618090452e-06, + "loss": 0.6708, + "step": 92800 + }, + { + "epoch": 4.64, + "grad_norm": 5.504277229309082, + "learning_rate": 1.4343718592964826e-06, + "loss": 0.6578, + "step": 92900 + }, + { + "epoch": 4.65, + "grad_norm": 7.113737106323242, + "learning_rate": 1.4142713567839197e-06, + "loss": 0.6419, + "step": 93000 + }, + { + "epoch": 4.66, + "grad_norm": 7.181005001068115, + "learning_rate": 1.394170854271357e-06, + "loss": 0.6298, + "step": 93100 + }, + { + "epoch": 4.66, + "grad_norm": 8.930741310119629, + "learning_rate": 1.3740703517587942e-06, + "loss": 0.6734, + "step": 93200 + }, + { + "epoch": 4.67, + "grad_norm": 6.288244724273682, + "learning_rate": 1.3539698492462313e-06, + "loss": 0.6307, + "step": 93300 + }, + { + "epoch": 4.67, + "grad_norm": 6.91972017288208, + "learning_rate": 1.3338693467336686e-06, + "loss": 0.676, + "step": 93400 + }, + { + "epoch": 4.67, + "grad_norm": 8.017012596130371, + "learning_rate": 1.3137688442211055e-06, + "loss": 0.6157, + "step": 93500 + }, + { + "epoch": 4.68, + "grad_norm": 4.738548755645752, + "learning_rate": 1.2936683417085427e-06, + "loss": 0.679, + "step": 93600 + }, + { + "epoch": 4.69, + "grad_norm": 6.201863765716553, + "learning_rate": 1.27356783919598e-06, + "loss": 0.6542, + "step": 93700 + }, + { + "epoch": 4.69, + "grad_norm": 7.595000267028809, + "learning_rate": 1.2534673366834171e-06, + "loss": 0.6659, + "step": 93800 + }, + { + "epoch": 4.7, + "grad_norm": 5.57780647277832, + "learning_rate": 1.2333668341708543e-06, + "loss": 0.6381, + "step": 93900 + }, + { + "epoch": 4.7, + "grad_norm": 8.426780700683594, + "learning_rate": 1.2132663316582916e-06, + "loss": 0.6705, + "step": 94000 + }, + { + "epoch": 4.71, + "grad_norm": 7.012176990509033, + "learning_rate": 1.1931658291457287e-06, + "loss": 0.6874, + "step": 94100 + }, + { + "epoch": 4.71, + "grad_norm": 7.747401237487793, + "learning_rate": 1.173065326633166e-06, + "loss": 0.6317, + "step": 94200 + }, + { + "epoch": 4.71, + "grad_norm": 4.817531108856201, + "learning_rate": 1.1529648241206032e-06, + "loss": 0.6083, + "step": 94300 + }, + { + "epoch": 4.72, + "grad_norm": 6.916783332824707, + "learning_rate": 1.1328643216080403e-06, + "loss": 0.6619, + "step": 94400 + }, + { + "epoch": 4.72, + "grad_norm": 7.570366382598877, + "learning_rate": 1.1127638190954775e-06, + "loss": 0.6471, + "step": 94500 + }, + { + "epoch": 4.73, + "grad_norm": 8.70361328125, + "learning_rate": 1.0926633165829146e-06, + "loss": 0.6483, + "step": 94600 + }, + { + "epoch": 4.74, + "grad_norm": 9.341569900512695, + "learning_rate": 1.072562814070352e-06, + "loss": 0.6194, + "step": 94700 + }, + { + "epoch": 4.74, + "grad_norm": 4.283209800720215, + "learning_rate": 1.052462311557789e-06, + "loss": 0.6111, + "step": 94800 + }, + { + "epoch": 4.75, + "grad_norm": 8.134038925170898, + "learning_rate": 1.0323618090452262e-06, + "loss": 0.632, + "step": 94900 + }, + { + "epoch": 4.75, + "grad_norm": 8.605172157287598, + "learning_rate": 1.0122613065326633e-06, + "loss": 0.6341, + "step": 95000 + }, + { + "epoch": 4.75, + "grad_norm": 8.067020416259766, + "learning_rate": 9.921608040201006e-07, + "loss": 0.6694, + "step": 95100 + }, + { + "epoch": 4.76, + "grad_norm": 6.967876434326172, + "learning_rate": 9.720603015075378e-07, + "loss": 0.648, + "step": 95200 + }, + { + "epoch": 4.76, + "grad_norm": 8.443940162658691, + "learning_rate": 9.51959798994975e-07, + "loss": 0.6174, + "step": 95300 + }, + { + "epoch": 4.77, + "grad_norm": 8.791583061218262, + "learning_rate": 9.318592964824122e-07, + "loss": 0.6463, + "step": 95400 + }, + { + "epoch": 4.78, + "grad_norm": 8.055484771728516, + "learning_rate": 9.117587939698493e-07, + "loss": 0.5966, + "step": 95500 + }, + { + "epoch": 4.78, + "grad_norm": 5.009509563446045, + "learning_rate": 8.916582914572865e-07, + "loss": 0.6147, + "step": 95600 + }, + { + "epoch": 4.79, + "grad_norm": 5.755350589752197, + "learning_rate": 8.715577889447237e-07, + "loss": 0.6101, + "step": 95700 + }, + { + "epoch": 4.79, + "grad_norm": 8.774045944213867, + "learning_rate": 8.514572864321608e-07, + "loss": 0.6332, + "step": 95800 + }, + { + "epoch": 4.79, + "grad_norm": 6.463279724121094, + "learning_rate": 8.315577889447237e-07, + "loss": 0.6705, + "step": 95900 + }, + { + "epoch": 4.8, + "grad_norm": 5.299009323120117, + "learning_rate": 8.114572864321608e-07, + "loss": 0.6605, + "step": 96000 + }, + { + "epoch": 4.8, + "grad_norm": 6.5152130126953125, + "learning_rate": 7.91356783919598e-07, + "loss": 0.6456, + "step": 96100 + }, + { + "epoch": 4.81, + "grad_norm": 8.499478340148926, + "learning_rate": 7.712562814070353e-07, + "loss": 0.6454, + "step": 96200 + }, + { + "epoch": 4.81, + "grad_norm": 8.317819595336914, + "learning_rate": 7.511557788944725e-07, + "loss": 0.5961, + "step": 96300 + }, + { + "epoch": 4.82, + "grad_norm": 7.257504940032959, + "learning_rate": 7.310552763819095e-07, + "loss": 0.614, + "step": 96400 + }, + { + "epoch": 4.83, + "grad_norm": 3.862578868865967, + "learning_rate": 7.109547738693468e-07, + "loss": 0.6388, + "step": 96500 + }, + { + "epoch": 4.83, + "grad_norm": 8.748353958129883, + "learning_rate": 6.90854271356784e-07, + "loss": 0.6222, + "step": 96600 + }, + { + "epoch": 4.83, + "grad_norm": 8.883009910583496, + "learning_rate": 6.707537688442211e-07, + "loss": 0.639, + "step": 96700 + }, + { + "epoch": 4.84, + "grad_norm": 7.332880973815918, + "learning_rate": 6.506532663316584e-07, + "loss": 0.6341, + "step": 96800 + }, + { + "epoch": 4.84, + "grad_norm": 7.421239852905273, + "learning_rate": 6.305527638190956e-07, + "loss": 0.6378, + "step": 96900 + }, + { + "epoch": 4.85, + "grad_norm": 6.633522033691406, + "learning_rate": 6.104522613065327e-07, + "loss": 0.6587, + "step": 97000 + }, + { + "epoch": 4.86, + "grad_norm": 6.347668170928955, + "learning_rate": 5.903517587939699e-07, + "loss": 0.6355, + "step": 97100 + }, + { + "epoch": 4.86, + "grad_norm": 5.266615390777588, + "learning_rate": 5.702512562814071e-07, + "loss": 0.5976, + "step": 97200 + }, + { + "epoch": 4.87, + "grad_norm": 5.0562286376953125, + "learning_rate": 5.501507537688443e-07, + "loss": 0.6426, + "step": 97300 + }, + { + "epoch": 4.87, + "grad_norm": 9.852864265441895, + "learning_rate": 5.300502512562814e-07, + "loss": 0.6434, + "step": 97400 + }, + { + "epoch": 4.88, + "grad_norm": 5.227302551269531, + "learning_rate": 5.099497487437187e-07, + "loss": 0.674, + "step": 97500 + }, + { + "epoch": 4.88, + "grad_norm": 7.586268424987793, + "learning_rate": 4.900502512562814e-07, + "loss": 0.6826, + "step": 97600 + }, + { + "epoch": 4.88, + "grad_norm": 7.512186527252197, + "learning_rate": 4.699497487437187e-07, + "loss": 0.6428, + "step": 97700 + }, + { + "epoch": 4.89, + "grad_norm": 8.383907318115234, + "learning_rate": 4.498492462311558e-07, + "loss": 0.6215, + "step": 97800 + }, + { + "epoch": 4.89, + "grad_norm": 6.214056968688965, + "learning_rate": 4.29748743718593e-07, + "loss": 0.6066, + "step": 97900 + }, + { + "epoch": 4.9, + "grad_norm": 8.587347030639648, + "learning_rate": 4.096482412060302e-07, + "loss": 0.6213, + "step": 98000 + }, + { + "epoch": 4.91, + "grad_norm": 14.060787200927734, + "learning_rate": 3.8954773869346735e-07, + "loss": 0.6151, + "step": 98100 + }, + { + "epoch": 4.91, + "grad_norm": 11.65833568572998, + "learning_rate": 3.694472361809046e-07, + "loss": 0.6226, + "step": 98200 + }, + { + "epoch": 4.92, + "grad_norm": 5.729846477508545, + "learning_rate": 3.4934673366834176e-07, + "loss": 0.6265, + "step": 98300 + }, + { + "epoch": 4.92, + "grad_norm": 5.596776485443115, + "learning_rate": 3.292462311557789e-07, + "loss": 0.6048, + "step": 98400 + }, + { + "epoch": 4.92, + "grad_norm": 5.834877967834473, + "learning_rate": 3.091457286432161e-07, + "loss": 0.6358, + "step": 98500 + }, + { + "epoch": 4.93, + "grad_norm": 7.830298900604248, + "learning_rate": 2.890452261306533e-07, + "loss": 0.6381, + "step": 98600 + }, + { + "epoch": 4.94, + "grad_norm": 7.147890567779541, + "learning_rate": 2.689447236180905e-07, + "loss": 0.6428, + "step": 98700 + }, + { + "epoch": 4.94, + "grad_norm": 5.18765926361084, + "learning_rate": 2.4884422110552766e-07, + "loss": 0.6098, + "step": 98800 + }, + { + "epoch": 4.95, + "grad_norm": 7.276676654815674, + "learning_rate": 2.2874371859296484e-07, + "loss": 0.6329, + "step": 98900 + }, + { + "epoch": 4.95, + "grad_norm": 7.58540678024292, + "learning_rate": 2.0864321608040202e-07, + "loss": 0.6095, + "step": 99000 + }, + { + "epoch": 4.96, + "grad_norm": 5.402534008026123, + "learning_rate": 1.8854271356783923e-07, + "loss": 0.605, + "step": 99100 + }, + { + "epoch": 4.96, + "grad_norm": 7.289499282836914, + "learning_rate": 1.684422110552764e-07, + "loss": 0.6694, + "step": 99200 + }, + { + "epoch": 4.96, + "grad_norm": 7.618215560913086, + "learning_rate": 1.483417085427136e-07, + "loss": 0.6313, + "step": 99300 + }, + { + "epoch": 4.97, + "grad_norm": 7.560898780822754, + "learning_rate": 1.2824120603015077e-07, + "loss": 0.6073, + "step": 99400 + }, + { + "epoch": 4.97, + "grad_norm": 5.637300968170166, + "learning_rate": 1.0834170854271359e-07, + "loss": 0.6211, + "step": 99500 + }, + { + "epoch": 4.98, + "grad_norm": 8.691441535949707, + "learning_rate": 8.824120603015076e-08, + "loss": 0.6085, + "step": 99600 + }, + { + "epoch": 4.99, + "grad_norm": 4.510754585266113, + "learning_rate": 6.814070351758795e-08, + "loss": 0.6193, + "step": 99700 + }, + { + "epoch": 4.99, + "grad_norm": 7.4050703048706055, + "learning_rate": 4.804020100502513e-08, + "loss": 0.6642, + "step": 99800 + }, + { + "epoch": 5.0, + "grad_norm": 9.641931533813477, + "learning_rate": 2.7939698492462312e-08, + "loss": 0.6304, + "step": 99900 + }, + { + "epoch": 5.0, + "grad_norm": 7.846133232116699, + "learning_rate": 7.839195979899499e-09, + "loss": 0.6181, + "step": 100000 + }, + { + "epoch": 5.0, + "eval_loss": 0.9481298923492432, + "eval_runtime": 21.6157, + "eval_samples_per_second": 46.263, + "eval_steps_per_second": 5.783, + "step": 100000 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 10000, + "total_flos": 1.1800273747968e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}