| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 10000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 12.391390800476074, | |
| "learning_rate": 3.920000000000001e-06, | |
| "loss": 1.8027, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 7.110462665557861, | |
| "learning_rate": 7.92e-06, | |
| "loss": 1.6358, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 10.526795387268066, | |
| "learning_rate": 1.1920000000000001e-05, | |
| "loss": 1.603, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 9.175031661987305, | |
| "learning_rate": 1.5920000000000003e-05, | |
| "loss": 1.6249, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 4.193933486938477, | |
| "learning_rate": 1.9920000000000002e-05, | |
| "loss": 1.6364, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 8.9299955368042, | |
| "learning_rate": 1.998030150753769e-05, | |
| "loss": 1.6265, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 11.564770698547363, | |
| "learning_rate": 1.996020100502513e-05, | |
| "loss": 1.5935, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 9.529921531677246, | |
| "learning_rate": 1.9940100502512564e-05, | |
| "loss": 1.5959, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 5.335429668426514, | |
| "learning_rate": 1.9920000000000002e-05, | |
| "loss": 1.6342, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 8.102309226989746, | |
| "learning_rate": 1.9899899497487437e-05, | |
| "loss": 1.572, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 5.742166042327881, | |
| "learning_rate": 1.987979899497488e-05, | |
| "loss": 1.5645, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 5.3909735679626465, | |
| "learning_rate": 1.9859698492462313e-05, | |
| "loss": 1.547, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.765148639678955, | |
| "learning_rate": 1.983959798994975e-05, | |
| "loss": 1.5399, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.0268378257751465, | |
| "learning_rate": 1.9819497487437185e-05, | |
| "loss": 1.4985, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 7.305541515350342, | |
| "learning_rate": 1.9799396984924623e-05, | |
| "loss": 1.5076, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 8.56618595123291, | |
| "learning_rate": 1.977929648241206e-05, | |
| "loss": 1.52, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 5.847652435302734, | |
| "learning_rate": 1.97591959798995e-05, | |
| "loss": 1.4976, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 6.940663814544678, | |
| "learning_rate": 1.9739095477386937e-05, | |
| "loss": 1.4983, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.06433629989624, | |
| "learning_rate": 1.9718994974874372e-05, | |
| "loss": 1.4951, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.1144022941589355, | |
| "learning_rate": 1.969889447236181e-05, | |
| "loss": 1.5256, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 6.515092849731445, | |
| "learning_rate": 1.9678793969849248e-05, | |
| "loss": 1.4677, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.787613868713379, | |
| "learning_rate": 1.9658693467336686e-05, | |
| "loss": 1.4841, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 7.798993110656738, | |
| "learning_rate": 1.963859296482412e-05, | |
| "loss": 1.4941, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.808990955352783, | |
| "learning_rate": 1.9618492462311562e-05, | |
| "loss": 1.4775, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 6.113214015960693, | |
| "learning_rate": 1.9598391959798996e-05, | |
| "loss": 1.4757, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 6.038852214813232, | |
| "learning_rate": 1.9578291457286434e-05, | |
| "loss": 1.4413, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 7.736110687255859, | |
| "learning_rate": 1.955819095477387e-05, | |
| "loss": 1.5001, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 6.173422336578369, | |
| "learning_rate": 1.953809045226131e-05, | |
| "loss": 1.4183, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.368058681488037, | |
| "learning_rate": 1.9517989949748745e-05, | |
| "loss": 1.4877, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.35443639755249, | |
| "learning_rate": 1.9497889447236183e-05, | |
| "loss": 1.4079, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 8.716644287109375, | |
| "learning_rate": 1.9477788944723618e-05, | |
| "loss": 1.4386, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.639494895935059, | |
| "learning_rate": 1.945768844221106e-05, | |
| "loss": 1.4524, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 3.3629064559936523, | |
| "learning_rate": 1.9437587939698493e-05, | |
| "loss": 1.4218, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.7631402015686035, | |
| "learning_rate": 1.941748743718593e-05, | |
| "loss": 1.4357, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 6.286344528198242, | |
| "learning_rate": 1.939738693467337e-05, | |
| "loss": 1.4025, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.501611232757568, | |
| "learning_rate": 1.9377286432160804e-05, | |
| "loss": 1.4002, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.302520275115967, | |
| "learning_rate": 1.9357185929648242e-05, | |
| "loss": 1.4128, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.156075477600098, | |
| "learning_rate": 1.933708542713568e-05, | |
| "loss": 1.4136, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.4391913414001465, | |
| "learning_rate": 1.9316984924623118e-05, | |
| "loss": 1.4307, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.862305641174316, | |
| "learning_rate": 1.9296884422110552e-05, | |
| "loss": 1.3605, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.392678737640381, | |
| "learning_rate": 1.9276783919597994e-05, | |
| "loss": 1.4059, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.686226844787598, | |
| "learning_rate": 1.925668341708543e-05, | |
| "loss": 1.3474, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.506126403808594, | |
| "learning_rate": 1.9236582914572866e-05, | |
| "loss": 1.3708, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.255539894104004, | |
| "learning_rate": 1.92164824120603e-05, | |
| "loss": 1.3803, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.463212966918945, | |
| "learning_rate": 1.9196381909547742e-05, | |
| "loss": 1.3371, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.1397294998168945, | |
| "learning_rate": 1.9176281407035177e-05, | |
| "loss": 1.3787, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.188973426818848, | |
| "learning_rate": 1.9156180904522615e-05, | |
| "loss": 1.3699, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.161841869354248, | |
| "learning_rate": 1.913608040201005e-05, | |
| "loss": 1.3819, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 3.420564889907837, | |
| "learning_rate": 1.911597989949749e-05, | |
| "loss": 1.3719, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.769357681274414, | |
| "learning_rate": 1.9095879396984925e-05, | |
| "loss": 1.366, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.374185562133789, | |
| "learning_rate": 1.9075778894472363e-05, | |
| "loss": 1.3377, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.3521575927734375, | |
| "learning_rate": 1.90556783919598e-05, | |
| "loss": 1.3632, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.51761531829834, | |
| "learning_rate": 1.903557788944724e-05, | |
| "loss": 1.3505, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.074390411376953, | |
| "learning_rate": 1.9015477386934674e-05, | |
| "loss": 1.3644, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.369632244110107, | |
| "learning_rate": 1.8995376884422112e-05, | |
| "loss": 1.3807, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.657780170440674, | |
| "learning_rate": 1.897527638190955e-05, | |
| "loss": 1.3125, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 9.048200607299805, | |
| "learning_rate": 1.8955175879396988e-05, | |
| "loss": 1.3216, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.997036933898926, | |
| "learning_rate": 1.8935075376884426e-05, | |
| "loss": 1.3262, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.751107692718506, | |
| "learning_rate": 1.891497487437186e-05, | |
| "loss": 1.3566, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.662681579589844, | |
| "learning_rate": 1.88948743718593e-05, | |
| "loss": 1.3645, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.755290508270264, | |
| "learning_rate": 1.887497487437186e-05, | |
| "loss": 1.2714, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.199550151824951, | |
| "learning_rate": 1.88548743718593e-05, | |
| "loss": 1.3427, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.531371116638184, | |
| "learning_rate": 1.8834773869346733e-05, | |
| "loss": 1.3198, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.267923831939697, | |
| "learning_rate": 1.881467336683417e-05, | |
| "loss": 1.334, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.429295063018799, | |
| "learning_rate": 1.879457286432161e-05, | |
| "loss": 1.2949, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.842006206512451, | |
| "learning_rate": 1.8774472361809047e-05, | |
| "loss": 1.3123, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 4.693381309509277, | |
| "learning_rate": 1.8754371859296482e-05, | |
| "loss": 1.3218, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 3.555487632751465, | |
| "learning_rate": 1.8734271356783923e-05, | |
| "loss": 1.3077, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 7.314678192138672, | |
| "learning_rate": 1.8714170854271358e-05, | |
| "loss": 1.2855, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.160294532775879, | |
| "learning_rate": 1.8694070351758796e-05, | |
| "loss": 1.2901, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 7.399959087371826, | |
| "learning_rate": 1.867396984924623e-05, | |
| "loss": 1.264, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.204007625579834, | |
| "learning_rate": 1.8653869346733672e-05, | |
| "loss": 1.323, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.531479358673096, | |
| "learning_rate": 1.8633768844221106e-05, | |
| "loss": 1.3211, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.645538806915283, | |
| "learning_rate": 1.8613668341708544e-05, | |
| "loss": 1.2941, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.326472282409668, | |
| "learning_rate": 1.8593567839195982e-05, | |
| "loss": 1.3025, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.338307857513428, | |
| "learning_rate": 1.857346733668342e-05, | |
| "loss": 1.2924, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 7.802080154418945, | |
| "learning_rate": 1.8553366834170855e-05, | |
| "loss": 1.3061, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 4.98875093460083, | |
| "learning_rate": 1.8533266331658293e-05, | |
| "loss": 1.321, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.888318061828613, | |
| "learning_rate": 1.851316582914573e-05, | |
| "loss": 1.2746, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 6.636387825012207, | |
| "learning_rate": 1.849306532663317e-05, | |
| "loss": 1.2653, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 6.1142449378967285, | |
| "learning_rate": 1.8473165829145728e-05, | |
| "loss": 1.2347, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.41117525100708, | |
| "learning_rate": 1.845306532663317e-05, | |
| "loss": 1.3062, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.025302886962891, | |
| "learning_rate": 1.8432964824120604e-05, | |
| "loss": 1.3162, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 7.1088972091674805, | |
| "learning_rate": 1.8412864321608042e-05, | |
| "loss": 1.2573, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.86447811126709, | |
| "learning_rate": 1.839276381909548e-05, | |
| "loss": 1.2855, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.323820114135742, | |
| "learning_rate": 1.8372663316582918e-05, | |
| "loss": 1.2272, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 7.335355758666992, | |
| "learning_rate": 1.8352562814070352e-05, | |
| "loss": 1.2718, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.308874130249023, | |
| "learning_rate": 1.833246231155779e-05, | |
| "loss": 1.2727, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 3.919790506362915, | |
| "learning_rate": 1.8312361809045228e-05, | |
| "loss": 1.28, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 7.291688442230225, | |
| "learning_rate": 1.8292261306532663e-05, | |
| "loss": 1.2768, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.098793029785156, | |
| "learning_rate": 1.8272160804020104e-05, | |
| "loss": 1.2441, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.242636203765869, | |
| "learning_rate": 1.825206030150754e-05, | |
| "loss": 1.2534, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.310051918029785, | |
| "learning_rate": 1.8231959798994977e-05, | |
| "loss": 1.2878, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 6.058734893798828, | |
| "learning_rate": 1.821185929648241e-05, | |
| "loss": 1.2964, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 6.912698745727539, | |
| "learning_rate": 1.8191758793969853e-05, | |
| "loss": 1.2511, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.428102016448975, | |
| "learning_rate": 1.8171658291457287e-05, | |
| "loss": 1.2605, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.642975807189941, | |
| "learning_rate": 1.8151557788944725e-05, | |
| "loss": 1.264, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.23274040222168, | |
| "learning_rate": 1.813145728643216e-05, | |
| "loss": 1.2583, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 7.3280792236328125, | |
| "learning_rate": 1.81113567839196e-05, | |
| "loss": 1.2324, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 6.048460483551025, | |
| "learning_rate": 1.8091256281407036e-05, | |
| "loss": 1.2477, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 1.2569069862365723, | |
| "eval_runtime": 21.5797, | |
| "eval_samples_per_second": 46.34, | |
| "eval_steps_per_second": 5.792, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 5.294989109039307, | |
| "learning_rate": 1.80713567839196e-05, | |
| "loss": 1.3038, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 6.7187981605529785, | |
| "learning_rate": 1.8051256281407036e-05, | |
| "loss": 1.2584, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 7.11021089553833, | |
| "learning_rate": 1.8031155778894474e-05, | |
| "loss": 1.2612, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 6.111474990844727, | |
| "learning_rate": 1.801105527638191e-05, | |
| "loss": 1.2638, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 6.04983377456665, | |
| "learning_rate": 1.799095477386935e-05, | |
| "loss": 1.2381, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 5.682928562164307, | |
| "learning_rate": 1.7970854271356785e-05, | |
| "loss": 1.233, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 6.028292179107666, | |
| "learning_rate": 1.7950753768844223e-05, | |
| "loss": 1.2572, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 4.738650798797607, | |
| "learning_rate": 1.793065326633166e-05, | |
| "loss": 1.2125, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 5.227931976318359, | |
| "learning_rate": 1.7910753768844223e-05, | |
| "loss": 1.2862, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 6.476836204528809, | |
| "learning_rate": 1.7890653266331658e-05, | |
| "loss": 1.243, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 4.261963844299316, | |
| "learning_rate": 1.78705527638191e-05, | |
| "loss": 1.2118, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 6.414599418640137, | |
| "learning_rate": 1.7850452261306534e-05, | |
| "loss": 1.222, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 5.642942905426025, | |
| "learning_rate": 1.783035175879397e-05, | |
| "loss": 1.1809, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 4.094428539276123, | |
| "learning_rate": 1.781025125628141e-05, | |
| "loss": 1.2362, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 5.5772881507873535, | |
| "learning_rate": 1.7790150753768847e-05, | |
| "loss": 1.2005, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 4.420604705810547, | |
| "learning_rate": 1.7770050251256282e-05, | |
| "loss": 1.2138, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 5.298806667327881, | |
| "learning_rate": 1.774994974874372e-05, | |
| "loss": 1.1693, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 5.862612247467041, | |
| "learning_rate": 1.7729849246231158e-05, | |
| "loss": 1.1728, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 3.835301637649536, | |
| "learning_rate": 1.7709748743718593e-05, | |
| "loss": 1.2159, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 5.67401123046875, | |
| "learning_rate": 1.768964824120603e-05, | |
| "loss": 1.2393, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 5.424498558044434, | |
| "learning_rate": 1.766954773869347e-05, | |
| "loss": 1.2255, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 5.532503604888916, | |
| "learning_rate": 1.7649447236180907e-05, | |
| "loss": 1.2024, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 5.404232501983643, | |
| "learning_rate": 1.762934673366834e-05, | |
| "loss": 1.2202, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 3.9564428329467773, | |
| "learning_rate": 1.7609246231155782e-05, | |
| "loss": 1.1655, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 3.2090141773223877, | |
| "learning_rate": 1.7589145728643217e-05, | |
| "loss": 1.1563, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 6.341458320617676, | |
| "learning_rate": 1.7569045226130655e-05, | |
| "loss": 1.1982, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 7.190246105194092, | |
| "learning_rate": 1.754894472361809e-05, | |
| "loss": 1.1817, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 6.108299255371094, | |
| "learning_rate": 1.752884422110553e-05, | |
| "loss": 1.2123, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 5.769379615783691, | |
| "learning_rate": 1.7508743718592966e-05, | |
| "loss": 1.1964, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 5.177648067474365, | |
| "learning_rate": 1.7488643216080404e-05, | |
| "loss": 1.2103, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 5.531684875488281, | |
| "learning_rate": 1.7468542713567838e-05, | |
| "loss": 1.1801, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 5.700603008270264, | |
| "learning_rate": 1.744844221105528e-05, | |
| "loss": 1.1943, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 9.25114917755127, | |
| "learning_rate": 1.7428341708542714e-05, | |
| "loss": 1.2286, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 4.238541126251221, | |
| "learning_rate": 1.7408241206030152e-05, | |
| "loss": 1.1869, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 5.6147260665893555, | |
| "learning_rate": 1.738814070351759e-05, | |
| "loss": 1.1854, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 4.879734039306641, | |
| "learning_rate": 1.7368040201005028e-05, | |
| "loss": 1.1941, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 3.612379312515259, | |
| "learning_rate": 1.7347939698492463e-05, | |
| "loss": 1.1649, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 4.583663463592529, | |
| "learning_rate": 1.73278391959799e-05, | |
| "loss": 1.1796, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 4.3080339431762695, | |
| "learning_rate": 1.7307939698492463e-05, | |
| "loss": 1.2092, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 5.9151506423950195, | |
| "learning_rate": 1.72878391959799e-05, | |
| "loss": 1.1809, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 5.167910575866699, | |
| "learning_rate": 1.726773869346734e-05, | |
| "loss": 1.2063, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 7.372837543487549, | |
| "learning_rate": 1.7247638190954777e-05, | |
| "loss": 1.147, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 3.6992413997650146, | |
| "learning_rate": 1.722753768844221e-05, | |
| "loss": 1.2312, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 6.654348850250244, | |
| "learning_rate": 1.720743718592965e-05, | |
| "loss": 1.1956, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 4.683749675750732, | |
| "learning_rate": 1.7187336683417087e-05, | |
| "loss": 1.1598, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 5.769094467163086, | |
| "learning_rate": 1.7167236180904522e-05, | |
| "loss": 1.1387, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 7.586219310760498, | |
| "learning_rate": 1.7147135678391963e-05, | |
| "loss": 1.1994, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 5.573954105377197, | |
| "learning_rate": 1.7127035175879398e-05, | |
| "loss": 1.1887, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 6.4866251945495605, | |
| "learning_rate": 1.7106934673366836e-05, | |
| "loss": 1.1892, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 4.954825401306152, | |
| "learning_rate": 1.708683417085427e-05, | |
| "loss": 1.1742, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 3.952847480773926, | |
| "learning_rate": 1.7066733668341712e-05, | |
| "loss": 1.143, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 5.170006275177002, | |
| "learning_rate": 1.7046633165829146e-05, | |
| "loss": 1.1881, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 4.910400390625, | |
| "learning_rate": 1.7026532663316584e-05, | |
| "loss": 1.131, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 4.728166580200195, | |
| "learning_rate": 1.700643216080402e-05, | |
| "loss": 1.1854, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 6.516223430633545, | |
| "learning_rate": 1.698633165829146e-05, | |
| "loss": 1.2069, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 5.914300918579102, | |
| "learning_rate": 1.6966231155778895e-05, | |
| "loss": 1.1663, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 4.6894378662109375, | |
| "learning_rate": 1.6946130653266333e-05, | |
| "loss": 1.145, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 4.994019031524658, | |
| "learning_rate": 1.692603015075377e-05, | |
| "loss": 1.1156, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 5.994630813598633, | |
| "learning_rate": 1.690592964824121e-05, | |
| "loss": 1.1583, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 6.7444562911987305, | |
| "learning_rate": 1.6885829145728643e-05, | |
| "loss": 1.1821, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 5.461032867431641, | |
| "learning_rate": 1.686572864321608e-05, | |
| "loss": 1.1388, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 5.0775251388549805, | |
| "learning_rate": 1.684562814070352e-05, | |
| "loss": 1.1576, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 4.469027042388916, | |
| "learning_rate": 1.6825527638190957e-05, | |
| "loss": 1.1792, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 6.780773639678955, | |
| "learning_rate": 1.6805427135678395e-05, | |
| "loss": 1.1441, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 6.338268756866455, | |
| "learning_rate": 1.678532663316583e-05, | |
| "loss": 1.1087, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 4.28759241104126, | |
| "learning_rate": 1.6765226130653268e-05, | |
| "loss": 1.1616, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 4.656599998474121, | |
| "learning_rate": 1.6745125628140706e-05, | |
| "loss": 1.1086, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 4.506341457366943, | |
| "learning_rate": 1.6725025125628144e-05, | |
| "loss": 1.1821, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.074087142944336, | |
| "learning_rate": 1.670492462311558e-05, | |
| "loss": 1.1376, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 4.427557468414307, | |
| "learning_rate": 1.6684824120603016e-05, | |
| "loss": 1.1608, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 4.684313774108887, | |
| "learning_rate": 1.666472361809045e-05, | |
| "loss": 1.1374, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 4.981125354766846, | |
| "learning_rate": 1.6644623115577892e-05, | |
| "loss": 1.1157, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 6.36452579498291, | |
| "learning_rate": 1.6624522613065327e-05, | |
| "loss": 1.1547, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 4.986701011657715, | |
| "learning_rate": 1.6604422110552765e-05, | |
| "loss": 1.147, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 6.206230640411377, | |
| "learning_rate": 1.6584321608040203e-05, | |
| "loss": 1.1235, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 5.597214221954346, | |
| "learning_rate": 1.656422110552764e-05, | |
| "loss": 1.1472, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 5.753964424133301, | |
| "learning_rate": 1.6544120603015076e-05, | |
| "loss": 1.0838, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 5.263125896453857, | |
| "learning_rate": 1.6524020100502513e-05, | |
| "loss": 1.1149, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.9451704025268555, | |
| "learning_rate": 1.6504120603015076e-05, | |
| "loss": 1.1162, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 6.694633960723877, | |
| "learning_rate": 1.6484020100502514e-05, | |
| "loss": 1.1268, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 5.449553489685059, | |
| "learning_rate": 1.6463919597989952e-05, | |
| "loss": 1.1307, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 5.502272129058838, | |
| "learning_rate": 1.644381909547739e-05, | |
| "loss": 1.1031, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 6.899608612060547, | |
| "learning_rate": 1.6423718592964824e-05, | |
| "loss": 1.1389, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 4.675032615661621, | |
| "learning_rate": 1.6403618090452262e-05, | |
| "loss": 1.1541, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 7.353012561798096, | |
| "learning_rate": 1.63835175879397e-05, | |
| "loss": 1.1213, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 4.253681659698486, | |
| "learning_rate": 1.636341708542714e-05, | |
| "loss": 1.1672, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 6.5902018547058105, | |
| "learning_rate": 1.6343316582914573e-05, | |
| "loss": 1.1349, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 5.40578556060791, | |
| "learning_rate": 1.632321608040201e-05, | |
| "loss": 1.1283, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 3.9744160175323486, | |
| "learning_rate": 1.630311557788945e-05, | |
| "loss": 1.1463, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 6.115358352661133, | |
| "learning_rate": 1.6283015075376887e-05, | |
| "loss": 1.1443, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.9785940647125244, | |
| "learning_rate": 1.6262914572864325e-05, | |
| "loss": 1.1409, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 5.200758934020996, | |
| "learning_rate": 1.6243015075376887e-05, | |
| "loss": 1.1629, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 5.975739479064941, | |
| "learning_rate": 1.6222914572864322e-05, | |
| "loss": 1.083, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 6.220870018005371, | |
| "learning_rate": 1.620281407035176e-05, | |
| "loss": 1.1305, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 4.187997341156006, | |
| "learning_rate": 1.6182713567839198e-05, | |
| "loss": 1.1028, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 5.540648937225342, | |
| "learning_rate": 1.6162613065326636e-05, | |
| "loss": 1.1176, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 5.99765157699585, | |
| "learning_rate": 1.6142512562814074e-05, | |
| "loss": 1.0932, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 4.647700786590576, | |
| "learning_rate": 1.6122412060301508e-05, | |
| "loss": 1.1294, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 6.05048131942749, | |
| "learning_rate": 1.6102311557788946e-05, | |
| "loss": 1.0828, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 4.912966251373291, | |
| "learning_rate": 1.608221105527638e-05, | |
| "loss": 1.0975, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.132000207901001, | |
| "eval_runtime": 21.5853, | |
| "eval_samples_per_second": 46.328, | |
| "eval_steps_per_second": 5.791, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.5869340896606445, | |
| "learning_rate": 1.6062110552763822e-05, | |
| "loss": 1.1428, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 4.5555739402771, | |
| "learning_rate": 1.6042010050251257e-05, | |
| "loss": 1.0939, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 3.527172803878784, | |
| "learning_rate": 1.6021909547738695e-05, | |
| "loss": 1.1184, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 2.7429285049438477, | |
| "learning_rate": 1.600180904522613e-05, | |
| "loss": 1.1028, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 3.6536190509796143, | |
| "learning_rate": 1.598170854271357e-05, | |
| "loss": 1.0954, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 4.48521089553833, | |
| "learning_rate": 1.5961608040201005e-05, | |
| "loss": 1.1001, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 7.937503814697266, | |
| "learning_rate": 1.5941507537688443e-05, | |
| "loss": 1.0676, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 7.802252769470215, | |
| "learning_rate": 1.592140703517588e-05, | |
| "loss": 1.1007, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 6.141603469848633, | |
| "learning_rate": 1.590130653266332e-05, | |
| "loss": 1.0749, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 5.166286945343018, | |
| "learning_rate": 1.5881206030150754e-05, | |
| "loss": 1.0704, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 5.407045364379883, | |
| "learning_rate": 1.5861105527638192e-05, | |
| "loss": 1.0852, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 5.4536967277526855, | |
| "learning_rate": 1.584100502512563e-05, | |
| "loss": 1.1152, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 5.464064121246338, | |
| "learning_rate": 1.5820904522613068e-05, | |
| "loss": 1.0546, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 3.853875160217285, | |
| "learning_rate": 1.580100502512563e-05, | |
| "loss": 1.0858, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 4.8497443199157715, | |
| "learning_rate": 1.5780904522613068e-05, | |
| "loss": 1.0973, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 4.255434513092041, | |
| "learning_rate": 1.5760804020100503e-05, | |
| "loss": 1.0872, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 4.134657382965088, | |
| "learning_rate": 1.574070351758794e-05, | |
| "loss": 1.1127, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 9.425840377807617, | |
| "learning_rate": 1.572060301507538e-05, | |
| "loss": 1.1147, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 5.42075777053833, | |
| "learning_rate": 1.5700502512562817e-05, | |
| "loss": 1.0719, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 5.076992988586426, | |
| "learning_rate": 1.5680402010050255e-05, | |
| "loss": 1.095, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 5.668195724487305, | |
| "learning_rate": 1.566030150753769e-05, | |
| "loss": 1.0799, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 5.9342474937438965, | |
| "learning_rate": 1.5640201005025127e-05, | |
| "loss": 1.0965, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 5.112601280212402, | |
| "learning_rate": 1.5620100502512565e-05, | |
| "loss": 1.0951, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 10.206339836120605, | |
| "learning_rate": 1.5600000000000003e-05, | |
| "loss": 1.0837, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 3.8015480041503906, | |
| "learning_rate": 1.5579899497487438e-05, | |
| "loss": 1.0871, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 4.524369239807129, | |
| "learning_rate": 1.5559798994974876e-05, | |
| "loss": 1.1263, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 5.1671671867370605, | |
| "learning_rate": 1.553969849246231e-05, | |
| "loss": 1.085, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 4.96006965637207, | |
| "learning_rate": 1.551959798994975e-05, | |
| "loss": 1.0893, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 6.482675075531006, | |
| "learning_rate": 1.5499497487437186e-05, | |
| "loss": 1.0667, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 4.591585636138916, | |
| "learning_rate": 1.5479396984924624e-05, | |
| "loss": 1.0861, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 4.026520729064941, | |
| "learning_rate": 1.5459296482412062e-05, | |
| "loss": 1.0772, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 5.972117900848389, | |
| "learning_rate": 1.54391959798995e-05, | |
| "loss": 1.0818, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 4.737887382507324, | |
| "learning_rate": 1.5419095477386935e-05, | |
| "loss": 1.0752, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 4.748262882232666, | |
| "learning_rate": 1.5398994974874373e-05, | |
| "loss": 1.0803, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 4.94175386428833, | |
| "learning_rate": 1.537889447236181e-05, | |
| "loss": 1.0754, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 4.3259172439575195, | |
| "learning_rate": 1.535879396984925e-05, | |
| "loss": 1.0463, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 5.240546703338623, | |
| "learning_rate": 1.5338693467336687e-05, | |
| "loss": 1.0547, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 6.120886325836182, | |
| "learning_rate": 1.531859296482412e-05, | |
| "loss": 1.0861, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 5.634921550750732, | |
| "learning_rate": 1.529849246231156e-05, | |
| "loss": 1.0722, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 5.39201021194458, | |
| "learning_rate": 1.5278391959798997e-05, | |
| "loss": 1.07, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 6.85221004486084, | |
| "learning_rate": 1.5258291457286433e-05, | |
| "loss": 1.0578, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 4.522882461547852, | |
| "learning_rate": 1.523819095477387e-05, | |
| "loss": 1.0895, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 4.020057201385498, | |
| "learning_rate": 1.5218090452261308e-05, | |
| "loss": 1.0377, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 4.188474655151367, | |
| "learning_rate": 1.5197989949748746e-05, | |
| "loss": 1.0469, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 6.872804164886475, | |
| "learning_rate": 1.5177889447236182e-05, | |
| "loss": 1.0795, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 5.834617614746094, | |
| "learning_rate": 1.515778894472362e-05, | |
| "loss": 1.0827, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 4.008932590484619, | |
| "learning_rate": 1.5137688442211056e-05, | |
| "loss": 1.069, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 5.309475898742676, | |
| "learning_rate": 1.5117587939698494e-05, | |
| "loss": 1.0668, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 6.02021598815918, | |
| "learning_rate": 1.5097487437185932e-05, | |
| "loss": 1.0611, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 4.143280029296875, | |
| "learning_rate": 1.5077587939698495e-05, | |
| "loss": 1.0526, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 4.231622695922852, | |
| "learning_rate": 1.505748743718593e-05, | |
| "loss": 1.0706, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 4.0399322509765625, | |
| "learning_rate": 1.5037386934673369e-05, | |
| "loss": 1.0878, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 4.2283759117126465, | |
| "learning_rate": 1.5017286432160805e-05, | |
| "loss": 1.0903, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 6.159567356109619, | |
| "learning_rate": 1.4997185929648241e-05, | |
| "loss": 1.069, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 5.181605815887451, | |
| "learning_rate": 1.4977085427135681e-05, | |
| "loss": 1.0712, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 4.90966796875, | |
| "learning_rate": 1.4956984924623117e-05, | |
| "loss": 1.0672, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 4.765697479248047, | |
| "learning_rate": 1.4936884422110554e-05, | |
| "loss": 1.0338, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 4.3462018966674805, | |
| "learning_rate": 1.491678391959799e-05, | |
| "loss": 1.0408, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 5.249480247497559, | |
| "learning_rate": 1.489668341708543e-05, | |
| "loss": 1.0576, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 5.543900489807129, | |
| "learning_rate": 1.4876582914572866e-05, | |
| "loss": 1.0651, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 6.526113033294678, | |
| "learning_rate": 1.4856482412060302e-05, | |
| "loss": 1.0596, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 4.725895404815674, | |
| "learning_rate": 1.4836381909547738e-05, | |
| "loss": 1.0969, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 6.068490028381348, | |
| "learning_rate": 1.4816281407035178e-05, | |
| "loss": 1.0284, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 4.363389015197754, | |
| "learning_rate": 1.4796180904522614e-05, | |
| "loss": 1.0589, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 6.8659257888793945, | |
| "learning_rate": 1.477608040201005e-05, | |
| "loss": 1.0803, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 5.061355113983154, | |
| "learning_rate": 1.4755979899497489e-05, | |
| "loss": 1.066, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 4.511940956115723, | |
| "learning_rate": 1.4735879396984927e-05, | |
| "loss": 1.0447, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 4.449003219604492, | |
| "learning_rate": 1.4715778894472363e-05, | |
| "loss": 1.0532, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 5.1782307624816895, | |
| "learning_rate": 1.46956783919598e-05, | |
| "loss": 1.0608, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 5.087260723114014, | |
| "learning_rate": 1.4675577889447237e-05, | |
| "loss": 1.0371, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 4.387496471405029, | |
| "learning_rate": 1.4655477386934675e-05, | |
| "loss": 1.055, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 4.9253010749816895, | |
| "learning_rate": 1.4635376884422113e-05, | |
| "loss": 1.0385, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 4.611992835998535, | |
| "learning_rate": 1.461527638190955e-05, | |
| "loss": 1.0338, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.981304168701172, | |
| "learning_rate": 1.4595175879396986e-05, | |
| "loss": 1.0516, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 5.678966045379639, | |
| "learning_rate": 1.4575075376884422e-05, | |
| "loss": 1.0788, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 5.3079752922058105, | |
| "learning_rate": 1.4554974874371862e-05, | |
| "loss": 1.0853, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 5.990561485290527, | |
| "learning_rate": 1.4534874371859298e-05, | |
| "loss": 1.0187, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 7.396142482757568, | |
| "learning_rate": 1.4514773869346734e-05, | |
| "loss": 1.0694, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 4.319200038909912, | |
| "learning_rate": 1.449467336683417e-05, | |
| "loss": 1.0668, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.7691450119018555, | |
| "learning_rate": 1.447457286432161e-05, | |
| "loss": 1.0652, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 8.814241409301758, | |
| "learning_rate": 1.4454472361809046e-05, | |
| "loss": 1.0423, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 5.264801979064941, | |
| "learning_rate": 1.4434371859296483e-05, | |
| "loss": 1.0918, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 4.573727130889893, | |
| "learning_rate": 1.441427135678392e-05, | |
| "loss": 1.0822, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 3.6568844318389893, | |
| "learning_rate": 1.4394170854271359e-05, | |
| "loss": 1.0492, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 4.999285697937012, | |
| "learning_rate": 1.437427135678392e-05, | |
| "loss": 1.0583, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 4.125443458557129, | |
| "learning_rate": 1.4354170854271359e-05, | |
| "loss": 1.0422, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 6.014279365539551, | |
| "learning_rate": 1.4334070351758795e-05, | |
| "loss": 1.0347, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 8.18229866027832, | |
| "learning_rate": 1.4313969849246232e-05, | |
| "loss": 1.0133, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 3.3756470680236816, | |
| "learning_rate": 1.4294070351758796e-05, | |
| "loss": 1.0684, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 5.568530559539795, | |
| "learning_rate": 1.4273969849246232e-05, | |
| "loss": 1.0666, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 4.440110683441162, | |
| "learning_rate": 1.4253869346733668e-05, | |
| "loss": 1.057, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 6.835775852203369, | |
| "learning_rate": 1.4233768844221108e-05, | |
| "loss": 1.0176, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 5.715722560882568, | |
| "learning_rate": 1.4213668341708544e-05, | |
| "loss": 1.0996, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 6.401480674743652, | |
| "learning_rate": 1.419356783919598e-05, | |
| "loss": 1.0459, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 7.125598430633545, | |
| "learning_rate": 1.4173467336683417e-05, | |
| "loss": 1.0067, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 5.287647724151611, | |
| "learning_rate": 1.4153366834170856e-05, | |
| "loss": 1.0475, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 5.175357818603516, | |
| "learning_rate": 1.4133266331658293e-05, | |
| "loss": 1.0361, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 4.676697731018066, | |
| "learning_rate": 1.4113165829145729e-05, | |
| "loss": 0.9925, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 4.375120162963867, | |
| "learning_rate": 1.4093065326633167e-05, | |
| "loss": 1.0145, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 4.380770683288574, | |
| "learning_rate": 1.4072964824120605e-05, | |
| "loss": 1.0763, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 1.0519436597824097, | |
| "eval_runtime": 21.613, | |
| "eval_samples_per_second": 46.269, | |
| "eval_steps_per_second": 5.784, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 5.796531677246094, | |
| "learning_rate": 1.4052864321608041e-05, | |
| "loss": 1.0563, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 2.713714361190796, | |
| "learning_rate": 1.4032763819095479e-05, | |
| "loss": 1.0549, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 6.333755016326904, | |
| "learning_rate": 1.4012663316582915e-05, | |
| "loss": 1.042, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 3.8109474182128906, | |
| "learning_rate": 1.3992562814070353e-05, | |
| "loss": 1.0773, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 6.425621509552002, | |
| "learning_rate": 1.3972462311557791e-05, | |
| "loss": 1.0066, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 4.9127607345581055, | |
| "learning_rate": 1.3952361809045228e-05, | |
| "loss": 1.0022, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 4.212081432342529, | |
| "learning_rate": 1.3932261306532664e-05, | |
| "loss": 1.0358, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 7.6413187980651855, | |
| "learning_rate": 1.39121608040201e-05, | |
| "loss": 1.0413, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 4.2576494216918945, | |
| "learning_rate": 1.389206030150754e-05, | |
| "loss": 1.0332, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 4.797669887542725, | |
| "learning_rate": 1.3871959798994976e-05, | |
| "loss": 1.0396, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 5.891973972320557, | |
| "learning_rate": 1.3851859296482412e-05, | |
| "loss": 1.0281, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 5.9344964027404785, | |
| "learning_rate": 1.3831758793969849e-05, | |
| "loss": 1.024, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 4.902309417724609, | |
| "learning_rate": 1.3811658291457288e-05, | |
| "loss": 1.027, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 6.387609958648682, | |
| "learning_rate": 1.3791557788944725e-05, | |
| "loss": 1.0207, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 5.870815277099609, | |
| "learning_rate": 1.3771457286432161e-05, | |
| "loss": 1.0128, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 6.101361274719238, | |
| "learning_rate": 1.3751356783919599e-05, | |
| "loss": 1.0412, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 5.250607967376709, | |
| "learning_rate": 1.3731256281407037e-05, | |
| "loss": 1.0146, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 5.449378967285156, | |
| "learning_rate": 1.3711155778894473e-05, | |
| "loss": 1.03, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 4.564045429229736, | |
| "learning_rate": 1.3691055276381911e-05, | |
| "loss": 1.0567, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 5.84417200088501, | |
| "learning_rate": 1.3670954773869347e-05, | |
| "loss": 1.0782, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 4.725462436676025, | |
| "learning_rate": 1.3650854271356785e-05, | |
| "loss": 1.0327, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 6.999115943908691, | |
| "learning_rate": 1.3630753768844223e-05, | |
| "loss": 1.0405, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 4.241363525390625, | |
| "learning_rate": 1.361065326633166e-05, | |
| "loss": 1.02, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 5.884255886077881, | |
| "learning_rate": 1.3590552763819096e-05, | |
| "loss": 1.0634, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 3.674698829650879, | |
| "learning_rate": 1.3570452261306536e-05, | |
| "loss": 1.0389, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 4.227616310119629, | |
| "learning_rate": 1.3550351758793972e-05, | |
| "loss": 0.9992, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 4.682816982269287, | |
| "learning_rate": 1.3530251256281408e-05, | |
| "loss": 1.0111, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 4.632464408874512, | |
| "learning_rate": 1.3510150753768844e-05, | |
| "loss": 1.0223, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 6.061766147613525, | |
| "learning_rate": 1.349005025125628e-05, | |
| "loss": 0.9837, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 5.4998908042907715, | |
| "learning_rate": 1.346994974874372e-05, | |
| "loss": 1.041, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 6.294175624847412, | |
| "learning_rate": 1.3449849246231157e-05, | |
| "loss": 1.0311, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 5.177206039428711, | |
| "learning_rate": 1.3429748743718593e-05, | |
| "loss": 1.0435, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 4.389501571655273, | |
| "learning_rate": 1.3409648241206031e-05, | |
| "loss": 1.0104, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 5.105901718139648, | |
| "learning_rate": 1.3389547738693469e-05, | |
| "loss": 0.9868, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 3.407482147216797, | |
| "learning_rate": 1.3369447236180905e-05, | |
| "loss": 1.0559, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 6.496652126312256, | |
| "learning_rate": 1.3349346733668343e-05, | |
| "loss": 0.9849, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 6.241397857666016, | |
| "learning_rate": 1.332924623115578e-05, | |
| "loss": 0.9995, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 5.998499870300293, | |
| "learning_rate": 1.3309145728643217e-05, | |
| "loss": 1.0355, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 5.380569934844971, | |
| "learning_rate": 1.3289045226130655e-05, | |
| "loss": 1.0082, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 5.168824195861816, | |
| "learning_rate": 1.3268944723618092e-05, | |
| "loss": 1.052, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 5.691008567810059, | |
| "learning_rate": 1.3248844221105528e-05, | |
| "loss": 1.0424, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 5.678094387054443, | |
| "learning_rate": 1.3228743718592968e-05, | |
| "loss": 1.0083, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 6.432235240936279, | |
| "learning_rate": 1.3208643216080404e-05, | |
| "loss": 0.9766, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 6.016462326049805, | |
| "learning_rate": 1.318854271356784e-05, | |
| "loss": 1.0059, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 4.596778392791748, | |
| "learning_rate": 1.3168442211055276e-05, | |
| "loss": 0.9462, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 7.2965850830078125, | |
| "learning_rate": 1.3148341708542716e-05, | |
| "loss": 0.972, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 5.232773780822754, | |
| "learning_rate": 1.3128241206030152e-05, | |
| "loss": 1.0532, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 5.5057783126831055, | |
| "learning_rate": 1.3108140703517589e-05, | |
| "loss": 0.9835, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 3.0561375617980957, | |
| "learning_rate": 1.3088040201005025e-05, | |
| "loss": 1.0293, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 4.761837959289551, | |
| "learning_rate": 1.3068140703517589e-05, | |
| "loss": 1.0232, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 7.006007671356201, | |
| "learning_rate": 1.3048040201005025e-05, | |
| "loss": 0.9945, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 4.829462051391602, | |
| "learning_rate": 1.3027939698492465e-05, | |
| "loss": 1.0589, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 3.8825013637542725, | |
| "learning_rate": 1.3007839195979901e-05, | |
| "loss": 0.9984, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 5.655978202819824, | |
| "learning_rate": 1.2987738693467338e-05, | |
| "loss": 1.0004, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 5.612642765045166, | |
| "learning_rate": 1.2967638190954774e-05, | |
| "loss": 0.9874, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 17.78661346435547, | |
| "learning_rate": 1.2947537688442212e-05, | |
| "loss": 1.0322, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 4.723743915557861, | |
| "learning_rate": 1.292743718592965e-05, | |
| "loss": 0.9984, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 5.048336982727051, | |
| "learning_rate": 1.2907336683417086e-05, | |
| "loss": 1.0588, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 6.086093425750732, | |
| "learning_rate": 1.2887236180904524e-05, | |
| "loss": 1.0075, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 6.542403697967529, | |
| "learning_rate": 1.286713567839196e-05, | |
| "loss": 1.0219, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 5.013860702514648, | |
| "learning_rate": 1.2847035175879398e-05, | |
| "loss": 1.0307, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 5.978675365447998, | |
| "learning_rate": 1.2826934673366835e-05, | |
| "loss": 1.0026, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 6.217547416687012, | |
| "learning_rate": 1.2806834170854273e-05, | |
| "loss": 1.0196, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 4.577905654907227, | |
| "learning_rate": 1.2786733668341709e-05, | |
| "loss": 0.9767, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 4.999172210693359, | |
| "learning_rate": 1.2766633165829147e-05, | |
| "loss": 1.0261, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 3.3435771465301514, | |
| "learning_rate": 1.2746532663316585e-05, | |
| "loss": 0.9751, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 6.218837261199951, | |
| "learning_rate": 1.2726432160804021e-05, | |
| "loss": 0.9887, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 2.914499044418335, | |
| "learning_rate": 1.2706331658291457e-05, | |
| "loss": 1.0172, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 4.287944793701172, | |
| "learning_rate": 1.2686231155778897e-05, | |
| "loss": 1.0336, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 9.045112609863281, | |
| "learning_rate": 1.2666331658291458e-05, | |
| "loss": 0.9966, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 3.9664063453674316, | |
| "learning_rate": 1.2646231155778896e-05, | |
| "loss": 1.0315, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 5.26336145401001, | |
| "learning_rate": 1.2626130653266334e-05, | |
| "loss": 1.031, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 5.820954322814941, | |
| "learning_rate": 1.260603015075377e-05, | |
| "loss": 0.9786, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 3.7999236583709717, | |
| "learning_rate": 1.2585929648241206e-05, | |
| "loss": 1.0008, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 4.96231746673584, | |
| "learning_rate": 1.2565829145728646e-05, | |
| "loss": 0.9823, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 5.442008018493652, | |
| "learning_rate": 1.2545728643216082e-05, | |
| "loss": 0.9993, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 3.0178353786468506, | |
| "learning_rate": 1.2525628140703518e-05, | |
| "loss": 1.009, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 4.0404052734375, | |
| "learning_rate": 1.2505527638190955e-05, | |
| "loss": 1.0047, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 3.924924850463867, | |
| "learning_rate": 1.2485427135678394e-05, | |
| "loss": 0.9681, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 6.560153961181641, | |
| "learning_rate": 1.246532663316583e-05, | |
| "loss": 0.9346, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 4.826027870178223, | |
| "learning_rate": 1.2445226130653267e-05, | |
| "loss": 0.9878, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 3.489680767059326, | |
| "learning_rate": 1.2425125628140703e-05, | |
| "loss": 0.9943, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 4.7767014503479, | |
| "learning_rate": 1.2405025125628141e-05, | |
| "loss": 1.02, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 7.311853408813477, | |
| "learning_rate": 1.238492462311558e-05, | |
| "loss": 0.946, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 4.217949390411377, | |
| "learning_rate": 1.236502512562814e-05, | |
| "loss": 0.974, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 8.919093132019043, | |
| "learning_rate": 1.234492462311558e-05, | |
| "loss": 0.9628, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 4.355369567871094, | |
| "learning_rate": 1.2324824120603016e-05, | |
| "loss": 0.9325, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 5.474518775939941, | |
| "learning_rate": 1.2304723618090452e-05, | |
| "loss": 0.9505, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 6.389540195465088, | |
| "learning_rate": 1.228462311557789e-05, | |
| "loss": 0.9574, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 6.9164719581604, | |
| "learning_rate": 1.2264522613065328e-05, | |
| "loss": 0.9644, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 4.559136390686035, | |
| "learning_rate": 1.2244422110552764e-05, | |
| "loss": 1.0306, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 6.381926536560059, | |
| "learning_rate": 1.2224321608040202e-05, | |
| "loss": 0.9542, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 7.826279163360596, | |
| "learning_rate": 1.2204221105527639e-05, | |
| "loss": 0.9818, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 5.7296929359436035, | |
| "learning_rate": 1.2184120603015077e-05, | |
| "loss": 0.9591, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 6.479053974151611, | |
| "learning_rate": 1.2164020100502515e-05, | |
| "loss": 1.0083, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 5.9377241134643555, | |
| "learning_rate": 1.2144120603015077e-05, | |
| "loss": 0.9969, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 4.59481143951416, | |
| "learning_rate": 1.2124020100502513e-05, | |
| "loss": 1.015, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 4.979703903198242, | |
| "learning_rate": 1.2103919597989951e-05, | |
| "loss": 0.977, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 6.539973735809326, | |
| "learning_rate": 1.2083819095477388e-05, | |
| "loss": 0.9938, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.971490383148193, | |
| "learning_rate": 1.2063718592964825e-05, | |
| "loss": 0.9848, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.9915822744369507, | |
| "eval_runtime": 21.5957, | |
| "eval_samples_per_second": 46.305, | |
| "eval_steps_per_second": 5.788, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.639512538909912, | |
| "learning_rate": 1.2043618090452262e-05, | |
| "loss": 0.9401, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 3.0007824897766113, | |
| "learning_rate": 1.20235175879397e-05, | |
| "loss": 0.9769, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 4.346365451812744, | |
| "learning_rate": 1.2003417085427136e-05, | |
| "loss": 0.9247, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 6.308602809906006, | |
| "learning_rate": 1.1983316582914574e-05, | |
| "loss": 0.9685, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 4.597143173217773, | |
| "learning_rate": 1.1963216080402012e-05, | |
| "loss": 0.907, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 6.000264644622803, | |
| "learning_rate": 1.1943115577889448e-05, | |
| "loss": 0.9311, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 4.718263149261475, | |
| "learning_rate": 1.1923015075376885e-05, | |
| "loss": 0.9707, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 3.7472355365753174, | |
| "learning_rate": 1.1902914572864324e-05, | |
| "loss": 0.9812, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 4.8061017990112305, | |
| "learning_rate": 1.188281407035176e-05, | |
| "loss": 0.9461, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 5.2381391525268555, | |
| "learning_rate": 1.1862713567839197e-05, | |
| "loss": 0.9972, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 6.1567583084106445, | |
| "learning_rate": 1.1842613065326633e-05, | |
| "loss": 0.9611, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 5.496160984039307, | |
| "learning_rate": 1.1822512562814071e-05, | |
| "loss": 0.9612, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 6.659996509552002, | |
| "learning_rate": 1.1802412060301509e-05, | |
| "loss": 0.9593, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 7.010763645172119, | |
| "learning_rate": 1.1782311557788945e-05, | |
| "loss": 0.9079, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 5.539340496063232, | |
| "learning_rate": 1.1762211055276383e-05, | |
| "loss": 0.946, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 4.7269368171691895, | |
| "learning_rate": 1.174211055276382e-05, | |
| "loss": 0.9702, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 6.573697090148926, | |
| "learning_rate": 1.1722010050251257e-05, | |
| "loss": 0.9166, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 5.467616558074951, | |
| "learning_rate": 1.1701909547738694e-05, | |
| "loss": 0.9479, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 7.292219638824463, | |
| "learning_rate": 1.1681809045226132e-05, | |
| "loss": 0.9694, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 5.9063849449157715, | |
| "learning_rate": 1.1661708542713568e-05, | |
| "loss": 0.9467, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 7.106956958770752, | |
| "learning_rate": 1.1641608040201006e-05, | |
| "loss": 0.9344, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.7898268699645996, | |
| "learning_rate": 1.1621507537688444e-05, | |
| "loss": 0.9174, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.543144226074219, | |
| "learning_rate": 1.160140703517588e-05, | |
| "loss": 0.9399, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 4.507541656494141, | |
| "learning_rate": 1.1581306532663317e-05, | |
| "loss": 0.8989, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 7.4493937492370605, | |
| "learning_rate": 1.1561206030150756e-05, | |
| "loss": 0.9663, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 5.758662700653076, | |
| "learning_rate": 1.1541105527638192e-05, | |
| "loss": 0.983, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 4.6601386070251465, | |
| "learning_rate": 1.1521005025125629e-05, | |
| "loss": 0.936, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 6.687641620635986, | |
| "learning_rate": 1.1500904522613065e-05, | |
| "loss": 0.9452, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 6.454759120941162, | |
| "learning_rate": 1.1480804020100505e-05, | |
| "loss": 0.9494, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 6.235274314880371, | |
| "learning_rate": 1.1460703517587941e-05, | |
| "loss": 0.9107, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 6.445216655731201, | |
| "learning_rate": 1.1440603015075377e-05, | |
| "loss": 0.9448, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 4.565326690673828, | |
| "learning_rate": 1.1420502512562814e-05, | |
| "loss": 0.9435, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 4.653913497924805, | |
| "learning_rate": 1.1400402010050253e-05, | |
| "loss": 0.9492, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 4.022702693939209, | |
| "learning_rate": 1.138030150753769e-05, | |
| "loss": 0.9365, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 6.998848915100098, | |
| "learning_rate": 1.1360201005025126e-05, | |
| "loss": 0.9215, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 3.925429344177246, | |
| "learning_rate": 1.1340100502512564e-05, | |
| "loss": 0.9408, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 5.22701358795166, | |
| "learning_rate": 1.132e-05, | |
| "loss": 0.9755, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 5.142667293548584, | |
| "learning_rate": 1.1299899497487438e-05, | |
| "loss": 0.8938, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 5.2655158042907715, | |
| "learning_rate": 1.1279798994974876e-05, | |
| "loss": 0.9751, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 5.084207057952881, | |
| "learning_rate": 1.1259698492462312e-05, | |
| "loss": 0.9141, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 4.578594207763672, | |
| "learning_rate": 1.1239597989949749e-05, | |
| "loss": 0.9403, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 3.3010849952697754, | |
| "learning_rate": 1.1219497487437188e-05, | |
| "loss": 0.9657, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 6.451618194580078, | |
| "learning_rate": 1.1199396984924624e-05, | |
| "loss": 0.9297, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 5.8492655754089355, | |
| "learning_rate": 1.117929648241206e-05, | |
| "loss": 0.9157, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 5.015758037567139, | |
| "learning_rate": 1.1159195979899497e-05, | |
| "loss": 0.9508, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 4.814078330993652, | |
| "learning_rate": 1.1139095477386937e-05, | |
| "loss": 0.9361, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 5.004156112670898, | |
| "learning_rate": 1.1118994974874373e-05, | |
| "loss": 0.958, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 5.016057968139648, | |
| "learning_rate": 1.109889447236181e-05, | |
| "loss": 0.9755, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 5.041826248168945, | |
| "learning_rate": 1.1078793969849246e-05, | |
| "loss": 0.9082, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 5.787368297576904, | |
| "learning_rate": 1.1058693467336685e-05, | |
| "loss": 0.9076, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 5.170538902282715, | |
| "learning_rate": 1.1038592964824122e-05, | |
| "loss": 0.9117, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 7.477475643157959, | |
| "learning_rate": 1.1018492462311558e-05, | |
| "loss": 0.8987, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 4.626328945159912, | |
| "learning_rate": 1.0998391959798996e-05, | |
| "loss": 0.9197, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 5.800539016723633, | |
| "learning_rate": 1.0978291457286434e-05, | |
| "loss": 0.9025, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 4.291562080383301, | |
| "learning_rate": 1.0958391959798994e-05, | |
| "loss": 0.9348, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 5.439847946166992, | |
| "learning_rate": 1.0938291457286434e-05, | |
| "loss": 0.9416, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 5.728611946105957, | |
| "learning_rate": 1.091819095477387e-05, | |
| "loss": 0.9124, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 3.7975008487701416, | |
| "learning_rate": 1.0898090452261307e-05, | |
| "loss": 0.9345, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 7.625438690185547, | |
| "learning_rate": 1.0877989949748745e-05, | |
| "loss": 0.8925, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 4.858023643493652, | |
| "learning_rate": 1.0857889447236183e-05, | |
| "loss": 0.9103, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 6.363548755645752, | |
| "learning_rate": 1.0837788944723619e-05, | |
| "loss": 0.9523, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 4.639822959899902, | |
| "learning_rate": 1.0817688442211057e-05, | |
| "loss": 0.9322, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 4.796472072601318, | |
| "learning_rate": 1.0797587939698493e-05, | |
| "loss": 0.9242, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 3.8870980739593506, | |
| "learning_rate": 1.077748743718593e-05, | |
| "loss": 0.9048, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 6.010646343231201, | |
| "learning_rate": 1.0757386934673369e-05, | |
| "loss": 0.9566, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 3.925715684890747, | |
| "learning_rate": 1.0737286432160805e-05, | |
| "loss": 0.9274, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 5.222326278686523, | |
| "learning_rate": 1.0717185929648242e-05, | |
| "loss": 0.8914, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 5.363781929016113, | |
| "learning_rate": 1.0697085427135678e-05, | |
| "loss": 0.922, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 6.332427024841309, | |
| "learning_rate": 1.0676984924623118e-05, | |
| "loss": 0.9017, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 4.68159818649292, | |
| "learning_rate": 1.0656884422110554e-05, | |
| "loss": 0.9089, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 4.770488739013672, | |
| "learning_rate": 1.063678391959799e-05, | |
| "loss": 0.9738, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 6.209041595458984, | |
| "learning_rate": 1.0616683417085426e-05, | |
| "loss": 0.9301, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 5.330206394195557, | |
| "learning_rate": 1.0596582914572866e-05, | |
| "loss": 0.9515, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 7.701655387878418, | |
| "learning_rate": 1.0576482412060302e-05, | |
| "loss": 0.9072, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 4.921889305114746, | |
| "learning_rate": 1.0556381909547739e-05, | |
| "loss": 0.9326, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 5.353864669799805, | |
| "learning_rate": 1.0536281407035177e-05, | |
| "loss": 0.902, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 4.63252592086792, | |
| "learning_rate": 1.0516180904522615e-05, | |
| "loss": 0.9357, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 5.968425750732422, | |
| "learning_rate": 1.0496281407035175e-05, | |
| "loss": 0.9416, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 5.979503154754639, | |
| "learning_rate": 1.0476180904522615e-05, | |
| "loss": 0.9461, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 5.928488731384277, | |
| "learning_rate": 1.0456080402010051e-05, | |
| "loss": 0.9045, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 12.569512367248535, | |
| "learning_rate": 1.0435979899497488e-05, | |
| "loss": 0.9205, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 4.5606865882873535, | |
| "learning_rate": 1.0415879396984926e-05, | |
| "loss": 0.9005, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 5.511040210723877, | |
| "learning_rate": 1.0395778894472364e-05, | |
| "loss": 0.9105, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 5.660979747772217, | |
| "learning_rate": 1.03756783919598e-05, | |
| "loss": 0.8911, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 3.5720648765563965, | |
| "learning_rate": 1.0355577889447238e-05, | |
| "loss": 0.9468, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 5.770594120025635, | |
| "learning_rate": 1.0335477386934674e-05, | |
| "loss": 0.9296, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 4.0545477867126465, | |
| "learning_rate": 1.0315376884422112e-05, | |
| "loss": 0.9133, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 4.586203575134277, | |
| "learning_rate": 1.0295276381909548e-05, | |
| "loss": 0.906, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 5.315196514129639, | |
| "learning_rate": 1.0275175879396986e-05, | |
| "loss": 0.9065, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 5.344489574432373, | |
| "learning_rate": 1.0255075376884423e-05, | |
| "loss": 0.9363, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 6.762577533721924, | |
| "learning_rate": 1.0234974874371859e-05, | |
| "loss": 0.9366, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 4.087870121002197, | |
| "learning_rate": 1.0214874371859299e-05, | |
| "loss": 0.8812, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 5.586741924285889, | |
| "learning_rate": 1.0194773869346735e-05, | |
| "loss": 0.9341, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 8.180070877075195, | |
| "learning_rate": 1.0174673366834171e-05, | |
| "loss": 0.9381, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 4.389576435089111, | |
| "learning_rate": 1.0154572864321607e-05, | |
| "loss": 0.9288, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 4.339807033538818, | |
| "learning_rate": 1.0134472361809047e-05, | |
| "loss": 0.9282, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 7.801273345947266, | |
| "learning_rate": 1.0114371859296483e-05, | |
| "loss": 0.9657, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 6.016520977020264, | |
| "learning_rate": 1.009427135678392e-05, | |
| "loss": 0.8704, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 5.2764506340026855, | |
| "learning_rate": 1.0074170854271358e-05, | |
| "loss": 0.9226, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 4.923444747924805, | |
| "learning_rate": 1.0054070351758796e-05, | |
| "loss": 0.9084, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 0.9846327900886536, | |
| "eval_runtime": 21.5925, | |
| "eval_samples_per_second": 46.312, | |
| "eval_steps_per_second": 5.789, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 6.061006546020508, | |
| "learning_rate": 1.0033969849246232e-05, | |
| "loss": 0.9218, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 4.1440348625183105, | |
| "learning_rate": 1.0013869346733668e-05, | |
| "loss": 0.9324, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 4.084045886993408, | |
| "learning_rate": 9.993768844221106e-06, | |
| "loss": 0.8859, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 3.723971366882324, | |
| "learning_rate": 9.973668341708544e-06, | |
| "loss": 0.9128, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 3.9887030124664307, | |
| "learning_rate": 9.95356783919598e-06, | |
| "loss": 0.8987, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 5.572610855102539, | |
| "learning_rate": 9.933467336683418e-06, | |
| "loss": 0.9287, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 5.956911087036133, | |
| "learning_rate": 9.913366834170856e-06, | |
| "loss": 0.8808, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 3.948564052581787, | |
| "learning_rate": 9.893266331658293e-06, | |
| "loss": 0.9243, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 5.561892509460449, | |
| "learning_rate": 9.87316582914573e-06, | |
| "loss": 0.9174, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 5.9155755043029785, | |
| "learning_rate": 9.853065326633167e-06, | |
| "loss": 0.8951, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 5.4488348960876465, | |
| "learning_rate": 9.832964824120603e-06, | |
| "loss": 0.8864, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 4.52565860748291, | |
| "learning_rate": 9.812864321608041e-06, | |
| "loss": 0.9103, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 3.995807647705078, | |
| "learning_rate": 9.792763819095477e-06, | |
| "loss": 0.8999, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 9.156529426574707, | |
| "learning_rate": 9.772663316582915e-06, | |
| "loss": 0.9383, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 6.388377666473389, | |
| "learning_rate": 9.752562814070352e-06, | |
| "loss": 0.908, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 3.975545644760132, | |
| "learning_rate": 9.73246231155779e-06, | |
| "loss": 0.9006, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 4.579479694366455, | |
| "learning_rate": 9.712361809045226e-06, | |
| "loss": 0.9443, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 5.22560977935791, | |
| "learning_rate": 9.69246231155779e-06, | |
| "loss": 0.909, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 5.2606587409973145, | |
| "learning_rate": 9.672361809045226e-06, | |
| "loss": 0.9255, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 4.772227764129639, | |
| "learning_rate": 9.652261306532664e-06, | |
| "loss": 0.9161, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 4.636828899383545, | |
| "learning_rate": 9.6321608040201e-06, | |
| "loss": 0.874, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 4.5946784019470215, | |
| "learning_rate": 9.612060301507538e-06, | |
| "loss": 0.902, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 4.0993266105651855, | |
| "learning_rate": 9.591959798994975e-06, | |
| "loss": 0.9369, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 5.468399524688721, | |
| "learning_rate": 9.571859296482413e-06, | |
| "loss": 0.9359, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 10.92428970336914, | |
| "learning_rate": 9.551758793969849e-06, | |
| "loss": 0.8889, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 6.2350029945373535, | |
| "learning_rate": 9.531658291457287e-06, | |
| "loss": 0.9304, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 4.780547618865967, | |
| "learning_rate": 9.511557788944725e-06, | |
| "loss": 0.9396, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 3.1009738445281982, | |
| "learning_rate": 9.491457286432161e-06, | |
| "loss": 0.9077, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 7.036947727203369, | |
| "learning_rate": 9.4713567839196e-06, | |
| "loss": 0.8753, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 4.945110321044922, | |
| "learning_rate": 9.451256281407035e-06, | |
| "loss": 0.9157, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 5.361321926116943, | |
| "learning_rate": 9.431155778894473e-06, | |
| "loss": 0.8929, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 3.351379632949829, | |
| "learning_rate": 9.411055276381911e-06, | |
| "loss": 0.8636, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 5.34309720993042, | |
| "learning_rate": 9.390954773869348e-06, | |
| "loss": 0.8865, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 5.316425800323486, | |
| "learning_rate": 9.370854271356786e-06, | |
| "loss": 0.9178, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 4.478712558746338, | |
| "learning_rate": 9.350753768844222e-06, | |
| "loss": 0.9181, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 5.095877647399902, | |
| "learning_rate": 9.33065326633166e-06, | |
| "loss": 0.902, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 4.4164862632751465, | |
| "learning_rate": 9.310552763819096e-06, | |
| "loss": 0.887, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 6.3961591720581055, | |
| "learning_rate": 9.290452261306533e-06, | |
| "loss": 0.8778, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 7.141729354858398, | |
| "learning_rate": 9.270552763819097e-06, | |
| "loss": 0.9144, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 5.858211040496826, | |
| "learning_rate": 9.250452261306535e-06, | |
| "loss": 0.8889, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 5.192725658416748, | |
| "learning_rate": 9.230351758793971e-06, | |
| "loss": 0.8928, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 6.190788745880127, | |
| "learning_rate": 9.210251256281407e-06, | |
| "loss": 0.8683, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 4.610683441162109, | |
| "learning_rate": 9.190150753768845e-06, | |
| "loss": 0.9473, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 5.043734550476074, | |
| "learning_rate": 9.170050251256281e-06, | |
| "loss": 0.9142, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 5.166931629180908, | |
| "learning_rate": 9.14994974874372e-06, | |
| "loss": 0.8894, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 5.05250358581543, | |
| "learning_rate": 9.129849246231156e-06, | |
| "loss": 0.8799, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 5.468914031982422, | |
| "learning_rate": 9.109748743718594e-06, | |
| "loss": 0.9099, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 4.162414073944092, | |
| "learning_rate": 9.08964824120603e-06, | |
| "loss": 0.8859, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 5.149291515350342, | |
| "learning_rate": 9.069547738693468e-06, | |
| "loss": 0.9096, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 4.889472961425781, | |
| "learning_rate": 9.049447236180904e-06, | |
| "loss": 0.8953, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 4.146818161010742, | |
| "learning_rate": 9.029346733668342e-06, | |
| "loss": 0.8917, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 5.937385559082031, | |
| "learning_rate": 9.00924623115578e-06, | |
| "loss": 0.9295, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 4.749314785003662, | |
| "learning_rate": 8.989145728643216e-06, | |
| "loss": 0.8776, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 6.271254539489746, | |
| "learning_rate": 8.969045226130654e-06, | |
| "loss": 0.8593, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 5.769760608673096, | |
| "learning_rate": 8.948944723618092e-06, | |
| "loss": 0.891, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 4.185112476348877, | |
| "learning_rate": 8.928844221105529e-06, | |
| "loss": 0.8869, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 3.2164394855499268, | |
| "learning_rate": 8.908743718592967e-06, | |
| "loss": 0.8992, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 4.406613349914551, | |
| "learning_rate": 8.888643216080403e-06, | |
| "loss": 0.8971, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 5.101110458374023, | |
| "learning_rate": 8.868542713567841e-06, | |
| "loss": 0.9066, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 4.963405132293701, | |
| "learning_rate": 8.848643216080403e-06, | |
| "loss": 0.881, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 7.5268683433532715, | |
| "learning_rate": 8.82854271356784e-06, | |
| "loss": 0.8692, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 5.325132369995117, | |
| "learning_rate": 8.808442211055278e-06, | |
| "loss": 0.895, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 4.687073707580566, | |
| "learning_rate": 8.788341708542715e-06, | |
| "loss": 0.9007, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 4.215831279754639, | |
| "learning_rate": 8.768241206030152e-06, | |
| "loss": 0.8783, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 6.363833427429199, | |
| "learning_rate": 8.74814070351759e-06, | |
| "loss": 0.9276, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 4.2875518798828125, | |
| "learning_rate": 8.728040201005026e-06, | |
| "loss": 0.8758, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 4.461952209472656, | |
| "learning_rate": 8.707939698492464e-06, | |
| "loss": 0.8789, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 7.590397834777832, | |
| "learning_rate": 8.6878391959799e-06, | |
| "loss": 0.8726, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 5.754077911376953, | |
| "learning_rate": 8.667738693467337e-06, | |
| "loss": 0.9022, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 4.305074214935303, | |
| "learning_rate": 8.647638190954775e-06, | |
| "loss": 0.8933, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 4.451827526092529, | |
| "learning_rate": 8.627738693467337e-06, | |
| "loss": 0.934, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 6.323834419250488, | |
| "learning_rate": 8.607638190954775e-06, | |
| "loss": 0.8858, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 6.937102317810059, | |
| "learning_rate": 8.587537688442211e-06, | |
| "loss": 0.9263, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 7.153318881988525, | |
| "learning_rate": 8.56743718592965e-06, | |
| "loss": 0.8868, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 4.7994842529296875, | |
| "learning_rate": 8.547336683417085e-06, | |
| "loss": 0.877, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 6.6480793952941895, | |
| "learning_rate": 8.527236180904523e-06, | |
| "loss": 0.8774, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 7.170138835906982, | |
| "learning_rate": 8.50713567839196e-06, | |
| "loss": 0.8704, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 4.006447792053223, | |
| "learning_rate": 8.487035175879398e-06, | |
| "loss": 0.8736, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 4.998128414154053, | |
| "learning_rate": 8.466934673366834e-06, | |
| "loss": 0.8653, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 5.756192207336426, | |
| "learning_rate": 8.446834170854272e-06, | |
| "loss": 0.8756, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 5.486929893493652, | |
| "learning_rate": 8.426733668341708e-06, | |
| "loss": 0.9047, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 4.589926242828369, | |
| "learning_rate": 8.406633165829146e-06, | |
| "loss": 0.8766, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 4.535083293914795, | |
| "learning_rate": 8.386532663316584e-06, | |
| "loss": 0.8758, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 3.2254798412323, | |
| "learning_rate": 8.36643216080402e-06, | |
| "loss": 0.8813, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 6.055229187011719, | |
| "learning_rate": 8.346331658291458e-06, | |
| "loss": 0.8779, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 4.221169471740723, | |
| "learning_rate": 8.326231155778895e-06, | |
| "loss": 0.9325, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 5.035799026489258, | |
| "learning_rate": 8.306130653266333e-06, | |
| "loss": 0.8896, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 6.551968574523926, | |
| "learning_rate": 8.28603015075377e-06, | |
| "loss": 0.8644, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 4.297557353973389, | |
| "learning_rate": 8.265929648241207e-06, | |
| "loss": 0.8853, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 6.603255271911621, | |
| "learning_rate": 8.245829145728645e-06, | |
| "loss": 0.9237, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 6.272432804107666, | |
| "learning_rate": 8.225728643216081e-06, | |
| "loss": 0.8708, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 5.919680595397949, | |
| "learning_rate": 8.20562814070352e-06, | |
| "loss": 0.8525, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 4.834166049957275, | |
| "learning_rate": 8.185527638190955e-06, | |
| "loss": 0.8576, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 5.948410987854004, | |
| "learning_rate": 8.165427135678393e-06, | |
| "loss": 0.9017, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 7.001020431518555, | |
| "learning_rate": 8.14532663316583e-06, | |
| "loss": 0.891, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 5.623896598815918, | |
| "learning_rate": 8.125226130653266e-06, | |
| "loss": 0.8255, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 5.0935468673706055, | |
| "learning_rate": 8.105125628140704e-06, | |
| "loss": 0.8709, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 6.403896808624268, | |
| "learning_rate": 8.08502512562814e-06, | |
| "loss": 0.8957, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 5.92683744430542, | |
| "learning_rate": 8.064924623115578e-06, | |
| "loss": 0.9102, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 3.4657108783721924, | |
| "learning_rate": 8.04502512562814e-06, | |
| "loss": 0.916, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.9292559623718262, | |
| "eval_runtime": 21.5879, | |
| "eval_samples_per_second": 46.322, | |
| "eval_steps_per_second": 5.79, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 5.614874839782715, | |
| "learning_rate": 8.024924623115579e-06, | |
| "loss": 0.8151, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 6.177361011505127, | |
| "learning_rate": 8.004824120603015e-06, | |
| "loss": 0.8266, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 5.4862213134765625, | |
| "learning_rate": 7.984723618090453e-06, | |
| "loss": 0.7741, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 6.674380779266357, | |
| "learning_rate": 7.964623115577889e-06, | |
| "loss": 0.8134, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 6.712404251098633, | |
| "learning_rate": 7.944522613065327e-06, | |
| "loss": 0.8332, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 4.442228317260742, | |
| "learning_rate": 7.924422110552763e-06, | |
| "loss": 0.832, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 5.503748416900635, | |
| "learning_rate": 7.904321608040201e-06, | |
| "loss": 0.844, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 4.290737628936768, | |
| "learning_rate": 7.88422110552764e-06, | |
| "loss": 0.8593, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 4.687915802001953, | |
| "learning_rate": 7.864120603015076e-06, | |
| "loss": 0.8506, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 5.838376998901367, | |
| "learning_rate": 7.844020100502514e-06, | |
| "loss": 0.8297, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 7.26198148727417, | |
| "learning_rate": 7.823919597989952e-06, | |
| "loss": 0.8463, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 5.693443298339844, | |
| "learning_rate": 7.803819095477388e-06, | |
| "loss": 0.8405, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 5.379219055175781, | |
| "learning_rate": 7.783718592964826e-06, | |
| "loss": 0.8431, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 5.703670501708984, | |
| "learning_rate": 7.763618090452262e-06, | |
| "loss": 0.8484, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 5.679072380065918, | |
| "learning_rate": 7.7435175879397e-06, | |
| "loss": 0.8272, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 4.1109113693237305, | |
| "learning_rate": 7.723417085427136e-06, | |
| "loss": 0.83, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 5.94366979598999, | |
| "learning_rate": 7.703316582914574e-06, | |
| "loss": 0.8102, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 8.418631553649902, | |
| "learning_rate": 7.68321608040201e-06, | |
| "loss": 0.8396, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 5.8859100341796875, | |
| "learning_rate": 7.663115577889449e-06, | |
| "loss": 0.8142, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 5.267168045043945, | |
| "learning_rate": 7.643015075376885e-06, | |
| "loss": 0.8087, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 5.58022403717041, | |
| "learning_rate": 7.622914572864322e-06, | |
| "loss": 0.8407, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 5.999646186828613, | |
| "learning_rate": 7.602814070351759e-06, | |
| "loss": 0.8214, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 4.449764251708984, | |
| "learning_rate": 7.582713567839196e-06, | |
| "loss": 0.8562, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 6.2914137840271, | |
| "learning_rate": 7.562613065326634e-06, | |
| "loss": 0.8359, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 5.262882709503174, | |
| "learning_rate": 7.5425125628140705e-06, | |
| "loss": 0.8368, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 4.981582164764404, | |
| "learning_rate": 7.5224120603015085e-06, | |
| "loss": 0.8138, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 5.330999374389648, | |
| "learning_rate": 7.502311557788945e-06, | |
| "loss": 0.8292, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 5.176852226257324, | |
| "learning_rate": 7.482211055276383e-06, | |
| "loss": 0.8108, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 8.819506645202637, | |
| "learning_rate": 7.462110552763819e-06, | |
| "loss": 0.854, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 5.1937642097473145, | |
| "learning_rate": 7.442010050251257e-06, | |
| "loss": 0.7943, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 4.399514675140381, | |
| "learning_rate": 7.421909547738694e-06, | |
| "loss": 0.7815, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 5.575798034667969, | |
| "learning_rate": 7.402010050251257e-06, | |
| "loss": 0.8481, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 5.097688674926758, | |
| "learning_rate": 7.381909547738694e-06, | |
| "loss": 0.8412, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 4.748641490936279, | |
| "learning_rate": 7.361809045226132e-06, | |
| "loss": 0.8058, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 7.151881694793701, | |
| "learning_rate": 7.341708542713568e-06, | |
| "loss": 0.7944, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 4.642664909362793, | |
| "learning_rate": 7.321608040201006e-06, | |
| "loss": 0.8185, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 5.354043483734131, | |
| "learning_rate": 7.301507537688442e-06, | |
| "loss": 0.7833, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 5.168720245361328, | |
| "learning_rate": 7.28140703517588e-06, | |
| "loss": 0.7966, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 4.343645095825195, | |
| "learning_rate": 7.261306532663317e-06, | |
| "loss": 0.7851, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 4.882009506225586, | |
| "learning_rate": 7.241206030150754e-06, | |
| "loss": 0.8069, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 4.954422473907471, | |
| "learning_rate": 7.2211055276381915e-06, | |
| "loss": 0.8029, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 3.5329108238220215, | |
| "learning_rate": 7.2010050251256295e-06, | |
| "loss": 0.8262, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 4.995691776275635, | |
| "learning_rate": 7.180904522613066e-06, | |
| "loss": 0.7899, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 4.367786884307861, | |
| "learning_rate": 7.160804020100504e-06, | |
| "loss": 0.8014, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 3.8841774463653564, | |
| "learning_rate": 7.14070351758794e-06, | |
| "loss": 0.8207, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 4.118581295013428, | |
| "learning_rate": 7.120603015075378e-06, | |
| "loss": 0.8399, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 5.320229530334473, | |
| "learning_rate": 7.100502512562814e-06, | |
| "loss": 0.8407, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 4.324894428253174, | |
| "learning_rate": 7.080402010050251e-06, | |
| "loss": 0.7897, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 6.917771816253662, | |
| "learning_rate": 7.060301507537689e-06, | |
| "loss": 0.8019, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 7.098691463470459, | |
| "learning_rate": 7.040201005025126e-06, | |
| "loss": 0.8058, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 5.166707992553711, | |
| "learning_rate": 7.020100502512564e-06, | |
| "loss": 0.7839, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 5.616134166717529, | |
| "learning_rate": 7e-06, | |
| "loss": 0.7821, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 7.216468334197998, | |
| "learning_rate": 6.979899497487438e-06, | |
| "loss": 0.7974, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 7.116774082183838, | |
| "learning_rate": 6.959798994974874e-06, | |
| "loss": 0.8446, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 6.275495529174805, | |
| "learning_rate": 6.939698492462312e-06, | |
| "loss": 0.8185, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 4.431950092315674, | |
| "learning_rate": 6.919597989949749e-06, | |
| "loss": 0.8203, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 6.8355302810668945, | |
| "learning_rate": 6.899497487437186e-06, | |
| "loss": 0.789, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 4.217498779296875, | |
| "learning_rate": 6.8793969849246235e-06, | |
| "loss": 0.7909, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 9.218932151794434, | |
| "learning_rate": 6.859899497487438e-06, | |
| "loss": 0.8387, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 5.607006072998047, | |
| "learning_rate": 6.8397989949748745e-06, | |
| "loss": 0.787, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 5.220907688140869, | |
| "learning_rate": 6.8196984924623124e-06, | |
| "loss": 0.8274, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 4.795065402984619, | |
| "learning_rate": 6.799597989949749e-06, | |
| "loss": 0.7833, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 5.653503894805908, | |
| "learning_rate": 6.779497487437187e-06, | |
| "loss": 0.7919, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 5.359546184539795, | |
| "learning_rate": 6.759396984924623e-06, | |
| "loss": 0.8, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 3.9278500080108643, | |
| "learning_rate": 6.739296482412061e-06, | |
| "loss": 0.816, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 6.889082908630371, | |
| "learning_rate": 6.719195979899498e-06, | |
| "loss": 0.8559, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 6.555418491363525, | |
| "learning_rate": 6.699095477386935e-06, | |
| "loss": 0.8084, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 5.0188798904418945, | |
| "learning_rate": 6.678994974874372e-06, | |
| "loss": 0.8199, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 5.341757297515869, | |
| "learning_rate": 6.6588944723618094e-06, | |
| "loss": 0.8301, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 7.638245105743408, | |
| "learning_rate": 6.6387939698492466e-06, | |
| "loss": 0.8156, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 4.004561424255371, | |
| "learning_rate": 6.6186934673366845e-06, | |
| "loss": 0.779, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 5.197673320770264, | |
| "learning_rate": 6.598592964824121e-06, | |
| "loss": 0.8086, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 5.757644176483154, | |
| "learning_rate": 6.578492462311559e-06, | |
| "loss": 0.8609, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 3.9802143573760986, | |
| "learning_rate": 6.558391959798995e-06, | |
| "loss": 0.814, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 4.6707892417907715, | |
| "learning_rate": 6.538291457286433e-06, | |
| "loss": 0.809, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 6.902073860168457, | |
| "learning_rate": 6.51819095477387e-06, | |
| "loss": 0.7862, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 4.793231010437012, | |
| "learning_rate": 6.498090452261307e-06, | |
| "loss": 0.8139, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 10.16287612915039, | |
| "learning_rate": 6.4779899497487444e-06, | |
| "loss": 0.78, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 5.308049201965332, | |
| "learning_rate": 6.4578894472361816e-06, | |
| "loss": 0.8235, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 5.0899271965026855, | |
| "learning_rate": 6.437788944723619e-06, | |
| "loss": 0.8222, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 5.174381732940674, | |
| "learning_rate": 6.417688442211055e-06, | |
| "loss": 0.7985, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 4.858529090881348, | |
| "learning_rate": 6.397587939698493e-06, | |
| "loss": 0.8224, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 8.091994285583496, | |
| "learning_rate": 6.37748743718593e-06, | |
| "loss": 0.8078, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 5.269526481628418, | |
| "learning_rate": 6.357386934673367e-06, | |
| "loss": 0.8006, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 5.161372184753418, | |
| "learning_rate": 6.337286432160804e-06, | |
| "loss": 0.814, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 4.547713279724121, | |
| "learning_rate": 6.3171859296482415e-06, | |
| "loss": 0.8024, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 5.171160697937012, | |
| "learning_rate": 6.297085427135679e-06, | |
| "loss": 0.7936, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 6.406951904296875, | |
| "learning_rate": 6.2769849246231166e-06, | |
| "loss": 0.7627, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 6.404531955718994, | |
| "learning_rate": 6.256884422110553e-06, | |
| "loss": 0.8081, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 4.409193992614746, | |
| "learning_rate": 6.236783919597991e-06, | |
| "loss": 0.8284, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 6.643680572509766, | |
| "learning_rate": 6.216683417085427e-06, | |
| "loss": 0.7908, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 6.344150543212891, | |
| "learning_rate": 6.196582914572865e-06, | |
| "loss": 0.8028, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 4.83349609375, | |
| "learning_rate": 6.176482412060301e-06, | |
| "loss": 0.7946, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 4.103985786437988, | |
| "learning_rate": 6.156381909547739e-06, | |
| "loss": 0.8089, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 4.681515693664551, | |
| "learning_rate": 6.1362814070351764e-06, | |
| "loss": 0.8104, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 3.5046350955963135, | |
| "learning_rate": 6.1161809045226136e-06, | |
| "loss": 0.8024, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 5.06920051574707, | |
| "learning_rate": 6.096080402010051e-06, | |
| "loss": 0.8043, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 6.419402599334717, | |
| "learning_rate": 6.075979899497489e-06, | |
| "loss": 0.8218, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 4.9620184898376465, | |
| "learning_rate": 6.055879396984925e-06, | |
| "loss": 0.7904, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 6.6012349128723145, | |
| "learning_rate": 6.035778894472363e-06, | |
| "loss": 0.8021, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "eval_loss": 0.913910448551178, | |
| "eval_runtime": 21.5858, | |
| "eval_samples_per_second": 46.327, | |
| "eval_steps_per_second": 5.791, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 8.278429985046387, | |
| "learning_rate": 6.015678391959799e-06, | |
| "loss": 0.8255, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 5.309919834136963, | |
| "learning_rate": 5.995577889447237e-06, | |
| "loss": 0.8048, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 5.3151535987854, | |
| "learning_rate": 5.9754773869346735e-06, | |
| "loss": 0.796, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 6.962722301483154, | |
| "learning_rate": 5.9553768844221114e-06, | |
| "loss": 0.8448, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 6.564899444580078, | |
| "learning_rate": 5.9352763819095486e-06, | |
| "loss": 0.7782, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 4.522327423095703, | |
| "learning_rate": 5.915175879396985e-06, | |
| "loss": 0.8306, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 4.783290863037109, | |
| "learning_rate": 5.895075376884423e-06, | |
| "loss": 0.8448, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 8.016778945922852, | |
| "learning_rate": 5.874974874371859e-06, | |
| "loss": 0.805, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 6.962314605712891, | |
| "learning_rate": 5.854874371859297e-06, | |
| "loss": 0.7802, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 4.056068420410156, | |
| "learning_rate": 5.834773869346733e-06, | |
| "loss": 0.8146, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 4.548468589782715, | |
| "learning_rate": 5.814673366834171e-06, | |
| "loss": 0.7631, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 4.344750881195068, | |
| "learning_rate": 5.7945728643216085e-06, | |
| "loss": 0.8032, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 6.746843338012695, | |
| "learning_rate": 5.774472361809046e-06, | |
| "loss": 0.7622, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 5.048290729522705, | |
| "learning_rate": 5.754371859296483e-06, | |
| "loss": 0.8133, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 5.74857759475708, | |
| "learning_rate": 5.734271356783921e-06, | |
| "loss": 0.7834, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 4.5277934074401855, | |
| "learning_rate": 5.714170854271357e-06, | |
| "loss": 0.789, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 8.23270034790039, | |
| "learning_rate": 5.694070351758795e-06, | |
| "loss": 0.7613, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 3.9528987407684326, | |
| "learning_rate": 5.673969849246231e-06, | |
| "loss": 0.8081, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 5.704257965087891, | |
| "learning_rate": 5.653869346733669e-06, | |
| "loss": 0.8164, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 4.676042079925537, | |
| "learning_rate": 5.6337688442211055e-06, | |
| "loss": 0.8202, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 5.20451021194458, | |
| "learning_rate": 5.6136683417085434e-06, | |
| "loss": 0.7953, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 7.501960277557373, | |
| "learning_rate": 5.5935678391959806e-06, | |
| "loss": 0.8168, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 7.015203475952148, | |
| "learning_rate": 5.573467336683418e-06, | |
| "loss": 0.789, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 4.428484916687012, | |
| "learning_rate": 5.553366834170855e-06, | |
| "loss": 0.8092, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 4.477147102355957, | |
| "learning_rate": 5.533266331658293e-06, | |
| "loss": 0.7843, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 5.1699748039245605, | |
| "learning_rate": 5.513165829145729e-06, | |
| "loss": 0.7996, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 5.133453369140625, | |
| "learning_rate": 5.493065326633167e-06, | |
| "loss": 0.8233, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 4.902942657470703, | |
| "learning_rate": 5.472964824120603e-06, | |
| "loss": 0.7586, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 6.46637487411499, | |
| "learning_rate": 5.4528643216080405e-06, | |
| "loss": 0.7959, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 7.144857406616211, | |
| "learning_rate": 5.432763819095478e-06, | |
| "loss": 0.8197, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 6.084510326385498, | |
| "learning_rate": 5.412663316582915e-06, | |
| "loss": 0.8133, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 5.132942199707031, | |
| "learning_rate": 5.392562814070353e-06, | |
| "loss": 0.7482, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 6.69909143447876, | |
| "learning_rate": 5.372462311557789e-06, | |
| "loss": 0.7498, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 7.99722146987915, | |
| "learning_rate": 5.352361809045227e-06, | |
| "loss": 0.7857, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 7.380476951599121, | |
| "learning_rate": 5.332261306532663e-06, | |
| "loss": 0.8081, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 6.441634178161621, | |
| "learning_rate": 5.312160804020101e-06, | |
| "loss": 0.7737, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 5.027355194091797, | |
| "learning_rate": 5.2920603015075375e-06, | |
| "loss": 0.7991, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 8.128876686096191, | |
| "learning_rate": 5.2719597989949755e-06, | |
| "loss": 0.8271, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 4.09487247467041, | |
| "learning_rate": 5.251859296482413e-06, | |
| "loss": 0.775, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 6.368048667907715, | |
| "learning_rate": 5.231959798994976e-06, | |
| "loss": 0.7872, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 4.72104549407959, | |
| "learning_rate": 5.211859296482412e-06, | |
| "loss": 0.8057, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 5.083056926727295, | |
| "learning_rate": 5.19175879396985e-06, | |
| "loss": 0.7839, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 5.289855003356934, | |
| "learning_rate": 5.171658291457286e-06, | |
| "loss": 0.7829, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 5.842662811279297, | |
| "learning_rate": 5.151557788944724e-06, | |
| "loss": 0.7782, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 6.445068836212158, | |
| "learning_rate": 5.131457286432161e-06, | |
| "loss": 0.8335, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 4.2318220138549805, | |
| "learning_rate": 5.111356783919599e-06, | |
| "loss": 0.7942, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 8.975232124328613, | |
| "learning_rate": 5.091256281407036e-06, | |
| "loss": 0.8284, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 4.482039451599121, | |
| "learning_rate": 5.071155778894473e-06, | |
| "loss": 0.8281, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 4.330044269561768, | |
| "learning_rate": 5.05105527638191e-06, | |
| "loss": 0.7737, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 4.636693000793457, | |
| "learning_rate": 5.030954773869348e-06, | |
| "loss": 0.7882, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 4.175960540771484, | |
| "learning_rate": 5.010854271356784e-06, | |
| "loss": 0.7417, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 4.081864833831787, | |
| "learning_rate": 4.990753768844221e-06, | |
| "loss": 0.7579, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 4.608290672302246, | |
| "learning_rate": 4.9706532663316585e-06, | |
| "loss": 0.799, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 4.851296901702881, | |
| "learning_rate": 4.950552763819096e-06, | |
| "loss": 0.7998, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 4.3285112380981445, | |
| "learning_rate": 4.930452261306533e-06, | |
| "loss": 0.8093, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 4.927236080169678, | |
| "learning_rate": 4.910552763819096e-06, | |
| "loss": 0.7793, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 6.193936824798584, | |
| "learning_rate": 4.890452261306533e-06, | |
| "loss": 0.8072, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 4.687440872192383, | |
| "learning_rate": 4.87035175879397e-06, | |
| "loss": 0.8, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 4.473381519317627, | |
| "learning_rate": 4.850251256281407e-06, | |
| "loss": 0.8027, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 4.676540374755859, | |
| "learning_rate": 4.8301507537688445e-06, | |
| "loss": 0.8029, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 4.967388153076172, | |
| "learning_rate": 4.810050251256282e-06, | |
| "loss": 0.7539, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 4.699183940887451, | |
| "learning_rate": 4.789949748743719e-06, | |
| "loss": 0.7651, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 4.629420757293701, | |
| "learning_rate": 4.769849246231156e-06, | |
| "loss": 0.7803, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 5.920188903808594, | |
| "learning_rate": 4.749748743718594e-06, | |
| "loss": 0.8017, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 6.677817344665527, | |
| "learning_rate": 4.729648241206031e-06, | |
| "loss": 0.8216, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 5.312260627746582, | |
| "learning_rate": 4.709547738693468e-06, | |
| "loss": 0.7827, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 4.119052410125732, | |
| "learning_rate": 4.689447236180905e-06, | |
| "loss": 0.7483, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 4.5976715087890625, | |
| "learning_rate": 4.669346733668342e-06, | |
| "loss": 0.7657, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 5.721061706542969, | |
| "learning_rate": 4.649246231155779e-06, | |
| "loss": 0.7817, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 7.369571208953857, | |
| "learning_rate": 4.629145728643216e-06, | |
| "loss": 0.7402, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 5.615093231201172, | |
| "learning_rate": 4.609045226130654e-06, | |
| "loss": 0.811, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 6.276815414428711, | |
| "learning_rate": 4.588944723618091e-06, | |
| "loss": 0.7909, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 4.287708759307861, | |
| "learning_rate": 4.568844221105528e-06, | |
| "loss": 0.8012, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 4.280378818511963, | |
| "learning_rate": 4.548743718592965e-06, | |
| "loss": 0.8205, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 8.309846878051758, | |
| "learning_rate": 4.528643216080402e-06, | |
| "loss": 0.7785, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 5.504384517669678, | |
| "learning_rate": 4.508542713567839e-06, | |
| "loss": 0.7678, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 4.6738996505737305, | |
| "learning_rate": 4.4884422110552765e-06, | |
| "loss": 0.8207, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 8.038127899169922, | |
| "learning_rate": 4.468341708542714e-06, | |
| "loss": 0.7788, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 6.898759365081787, | |
| "learning_rate": 4.448241206030151e-06, | |
| "loss": 0.7575, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 5.893388271331787, | |
| "learning_rate": 4.428140703517588e-06, | |
| "loss": 0.7842, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 7.37433385848999, | |
| "learning_rate": 4.408040201005026e-06, | |
| "loss": 0.756, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 6.226987838745117, | |
| "learning_rate": 4.387939698492463e-06, | |
| "loss": 0.7818, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 6.20886754989624, | |
| "learning_rate": 4.368040201005025e-06, | |
| "loss": 0.8057, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 3.9309849739074707, | |
| "learning_rate": 4.3479396984924625e-06, | |
| "loss": 0.8052, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 4.972345352172852, | |
| "learning_rate": 4.3278391959799e-06, | |
| "loss": 0.7666, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 8.730260848999023, | |
| "learning_rate": 4.307738693467337e-06, | |
| "loss": 0.7897, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 6.734485626220703, | |
| "learning_rate": 4.287638190954774e-06, | |
| "loss": 0.7595, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 6.456557750701904, | |
| "learning_rate": 4.267537688442212e-06, | |
| "loss": 0.7924, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 4.421884059906006, | |
| "learning_rate": 4.247437185929649e-06, | |
| "loss": 0.7821, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 7.825852394104004, | |
| "learning_rate": 4.227336683417086e-06, | |
| "loss": 0.7834, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 6.445671081542969, | |
| "learning_rate": 4.207236180904523e-06, | |
| "loss": 0.7794, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 3.7435953617095947, | |
| "learning_rate": 4.18713567839196e-06, | |
| "loss": 0.7218, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 10.594905853271484, | |
| "learning_rate": 4.1670351758793975e-06, | |
| "loss": 0.7957, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 7.166194438934326, | |
| "learning_rate": 4.146934673366835e-06, | |
| "loss": 0.7936, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 4.773101329803467, | |
| "learning_rate": 4.126834170854272e-06, | |
| "loss": 0.7721, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 5.979006767272949, | |
| "learning_rate": 4.106733668341709e-06, | |
| "loss": 0.7899, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 6.46978235244751, | |
| "learning_rate": 4.086633165829146e-06, | |
| "loss": 0.7874, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 5.1106977462768555, | |
| "learning_rate": 4.066532663316583e-06, | |
| "loss": 0.7644, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 7.125823974609375, | |
| "learning_rate": 4.0466331658291464e-06, | |
| "loss": 0.792, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 5.539035797119141, | |
| "learning_rate": 4.026532663316583e-06, | |
| "loss": 0.7779, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.8846080303192139, | |
| "eval_runtime": 21.6073, | |
| "eval_samples_per_second": 46.281, | |
| "eval_steps_per_second": 5.785, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 5.7579193115234375, | |
| "learning_rate": 4.00643216080402e-06, | |
| "loss": 0.6947, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 5.583180904388428, | |
| "learning_rate": 3.986331658291458e-06, | |
| "loss": 0.6614, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 5.107233047485352, | |
| "learning_rate": 3.966231155778895e-06, | |
| "loss": 0.6936, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 4.02, | |
| "grad_norm": 5.804276466369629, | |
| "learning_rate": 3.946130653266332e-06, | |
| "loss": 0.6946, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 6.738204479217529, | |
| "learning_rate": 3.926030150753769e-06, | |
| "loss": 0.6681, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 6.331192970275879, | |
| "learning_rate": 3.905929648241206e-06, | |
| "loss": 0.6839, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 5.382104873657227, | |
| "learning_rate": 3.8858291457286434e-06, | |
| "loss": 0.6566, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 6.394933223724365, | |
| "learning_rate": 3.8657286432160806e-06, | |
| "loss": 0.7378, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 5.813870429992676, | |
| "learning_rate": 3.845628140703518e-06, | |
| "loss": 0.7112, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 6.095046520233154, | |
| "learning_rate": 3.825527638190955e-06, | |
| "loss": 0.6885, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 6.212576866149902, | |
| "learning_rate": 3.8054271356783924e-06, | |
| "loss": 0.6658, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 4.426722526550293, | |
| "learning_rate": 3.7853266331658295e-06, | |
| "loss": 0.6915, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 7.474303722381592, | |
| "learning_rate": 3.7652261306532666e-06, | |
| "loss": 0.6486, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 7.347512245178223, | |
| "learning_rate": 3.7451256281407038e-06, | |
| "loss": 0.7078, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 9.426233291625977, | |
| "learning_rate": 3.7250251256281413e-06, | |
| "loss": 0.6951, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 5.577968597412109, | |
| "learning_rate": 3.7049246231155784e-06, | |
| "loss": 0.6905, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 6.477217197418213, | |
| "learning_rate": 3.6848241206030156e-06, | |
| "loss": 0.663, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 6.228948593139648, | |
| "learning_rate": 3.6647236180904527e-06, | |
| "loss": 0.6677, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 5.777594089508057, | |
| "learning_rate": 3.64462311557789e-06, | |
| "loss": 0.6905, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 6.7552080154418945, | |
| "learning_rate": 3.624522613065327e-06, | |
| "loss": 0.7086, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 5.3912553787231445, | |
| "learning_rate": 3.6044221105527645e-06, | |
| "loss": 0.6833, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 7.366456508636475, | |
| "learning_rate": 3.5843216080402016e-06, | |
| "loss": 0.6618, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 4.593729019165039, | |
| "learning_rate": 3.5642211055276383e-06, | |
| "loss": 0.6397, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 6.743685722351074, | |
| "learning_rate": 3.5441206030150755e-06, | |
| "loss": 0.7233, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 6.125808238983154, | |
| "learning_rate": 3.5240201005025126e-06, | |
| "loss": 0.6804, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 7.0340752601623535, | |
| "learning_rate": 3.5039195979899497e-06, | |
| "loss": 0.699, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 7.293619632720947, | |
| "learning_rate": 3.4838190954773873e-06, | |
| "loss": 0.6572, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 6.3135552406311035, | |
| "learning_rate": 3.4637185929648244e-06, | |
| "loss": 0.6364, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 5.138033390045166, | |
| "learning_rate": 3.4436180904522615e-06, | |
| "loss": 0.6815, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 6.240560054779053, | |
| "learning_rate": 3.4235175879396986e-06, | |
| "loss": 0.6919, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 4.19957971572876, | |
| "learning_rate": 3.4034170854271358e-06, | |
| "loss": 0.6845, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 6.340314865112305, | |
| "learning_rate": 3.383316582914573e-06, | |
| "loss": 0.653, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 3.309894323348999, | |
| "learning_rate": 3.3632160804020104e-06, | |
| "loss": 0.6612, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 5.189826011657715, | |
| "learning_rate": 3.3431155778894476e-06, | |
| "loss": 0.6871, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 6.599611759185791, | |
| "learning_rate": 3.3230150753768847e-06, | |
| "loss": 0.6743, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 26.47356414794922, | |
| "learning_rate": 3.302914572864322e-06, | |
| "loss": 0.6312, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 8.280220985412598, | |
| "learning_rate": 3.282814070351759e-06, | |
| "loss": 0.6276, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 7.8088555335998535, | |
| "learning_rate": 3.2627135678391965e-06, | |
| "loss": 0.6514, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 5.11159086227417, | |
| "learning_rate": 3.2426130653266336e-06, | |
| "loss": 0.6262, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 6.656592845916748, | |
| "learning_rate": 3.2225125628140708e-06, | |
| "loss": 0.6889, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 7.140279769897461, | |
| "learning_rate": 3.202412060301508e-06, | |
| "loss": 0.6435, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 6.478577613830566, | |
| "learning_rate": 3.182311557788945e-06, | |
| "loss": 0.6593, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 6.854846477508545, | |
| "learning_rate": 3.1622110552763826e-06, | |
| "loss": 0.7097, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 5.070549488067627, | |
| "learning_rate": 3.1421105527638197e-06, | |
| "loss": 0.6736, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 7.519010543823242, | |
| "learning_rate": 3.122010050251257e-06, | |
| "loss": 0.6518, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 6.662156105041504, | |
| "learning_rate": 3.1019095477386935e-06, | |
| "loss": 0.675, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 7.687413215637207, | |
| "learning_rate": 3.0818090452261307e-06, | |
| "loss": 0.6477, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 5.934724807739258, | |
| "learning_rate": 3.0617085427135678e-06, | |
| "loss": 0.6492, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 9.457836151123047, | |
| "learning_rate": 3.041608040201005e-06, | |
| "loss": 0.633, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 6.666748523712158, | |
| "learning_rate": 3.0215075376884425e-06, | |
| "loss": 0.6693, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 6.439404487609863, | |
| "learning_rate": 3.0014070351758796e-06, | |
| "loss": 0.6643, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 7.257474422454834, | |
| "learning_rate": 2.9813065326633167e-06, | |
| "loss": 0.6623, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 4.707270622253418, | |
| "learning_rate": 2.961206030150754e-06, | |
| "loss": 0.6471, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 5.7160844802856445, | |
| "learning_rate": 2.941105527638191e-06, | |
| "loss": 0.683, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 6.038240432739258, | |
| "learning_rate": 2.9210050251256285e-06, | |
| "loss": 0.6742, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 6.851832866668701, | |
| "learning_rate": 2.9009045226130656e-06, | |
| "loss": 0.6748, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 5.691901683807373, | |
| "learning_rate": 2.8808040201005028e-06, | |
| "loss": 0.6703, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 6.378291130065918, | |
| "learning_rate": 2.86070351758794e-06, | |
| "loss": 0.6487, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 4.439263343811035, | |
| "learning_rate": 2.840603015075377e-06, | |
| "loss": 0.6598, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 6.466790199279785, | |
| "learning_rate": 2.8205025125628146e-06, | |
| "loss": 0.6914, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 6.0331902503967285, | |
| "learning_rate": 2.8004020100502517e-06, | |
| "loss": 0.6929, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 4.750064849853516, | |
| "learning_rate": 2.780301507537689e-06, | |
| "loss": 0.6715, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 8.289958953857422, | |
| "learning_rate": 2.760201005025126e-06, | |
| "loss": 0.6975, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 10.746756553649902, | |
| "learning_rate": 2.740100502512563e-06, | |
| "loss": 0.6454, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 6.792548656463623, | |
| "learning_rate": 2.720201005025126e-06, | |
| "loss": 0.7056, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 5.030031204223633, | |
| "learning_rate": 2.700100502512563e-06, | |
| "loss": 0.6711, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 4.626148223876953, | |
| "learning_rate": 2.680201005025126e-06, | |
| "loss": 0.676, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 8.56241512298584, | |
| "learning_rate": 2.660100502512563e-06, | |
| "loss": 0.6548, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 9.747623443603516, | |
| "learning_rate": 2.64e-06, | |
| "loss": 0.6883, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 8.002108573913574, | |
| "learning_rate": 2.6198994974874377e-06, | |
| "loss": 0.7166, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 6.09249210357666, | |
| "learning_rate": 2.599798994974875e-06, | |
| "loss": 0.6841, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 5.512220859527588, | |
| "learning_rate": 2.579698492462312e-06, | |
| "loss": 0.6816, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 5.139577388763428, | |
| "learning_rate": 2.559597989949749e-06, | |
| "loss": 0.6475, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 11.360005378723145, | |
| "learning_rate": 2.539497487437186e-06, | |
| "loss": 0.7434, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 5.06545877456665, | |
| "learning_rate": 2.5193969849246237e-06, | |
| "loss": 0.6626, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 4.432734966278076, | |
| "learning_rate": 2.4992964824120604e-06, | |
| "loss": 0.6357, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 7.90862512588501, | |
| "learning_rate": 2.4791959798994976e-06, | |
| "loss": 0.6039, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 4.959092617034912, | |
| "learning_rate": 2.459095477386935e-06, | |
| "loss": 0.6699, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 7.495928764343262, | |
| "learning_rate": 2.4389949748743723e-06, | |
| "loss": 0.6648, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 10.80557918548584, | |
| "learning_rate": 2.4188944723618094e-06, | |
| "loss": 0.6532, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 7.1374006271362305, | |
| "learning_rate": 2.3987939698492465e-06, | |
| "loss": 0.6903, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 12.275821685791016, | |
| "learning_rate": 2.3786934673366836e-06, | |
| "loss": 0.6433, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 8.747936248779297, | |
| "learning_rate": 2.3585929648241208e-06, | |
| "loss": 0.62, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 5.3552985191345215, | |
| "learning_rate": 2.338492462311558e-06, | |
| "loss": 0.6525, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 7.049367427825928, | |
| "learning_rate": 2.318391959798995e-06, | |
| "loss": 0.6742, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 8.841930389404297, | |
| "learning_rate": 2.298291457286432e-06, | |
| "loss": 0.6806, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 4.58371114730835, | |
| "learning_rate": 2.2781909547738697e-06, | |
| "loss": 0.6469, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 8.08278751373291, | |
| "learning_rate": 2.258090452261307e-06, | |
| "loss": 0.6918, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 5.989361763000488, | |
| "learning_rate": 2.237989949748744e-06, | |
| "loss": 0.7048, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 8.200750350952148, | |
| "learning_rate": 2.217889447236181e-06, | |
| "loss": 0.6222, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 7.658218860626221, | |
| "learning_rate": 2.197788944723618e-06, | |
| "loss": 0.653, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 6.744418621063232, | |
| "learning_rate": 2.177889447236181e-06, | |
| "loss": 0.6698, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 4.423871994018555, | |
| "learning_rate": 2.157788944723618e-06, | |
| "loss": 0.6665, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 7.368816375732422, | |
| "learning_rate": 2.1376884422110557e-06, | |
| "loss": 0.6766, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 4.649584770202637, | |
| "learning_rate": 2.117587939698493e-06, | |
| "loss": 0.6464, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 7.77773904800415, | |
| "learning_rate": 2.09748743718593e-06, | |
| "loss": 0.6721, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 6.5589280128479, | |
| "learning_rate": 2.0773869346733667e-06, | |
| "loss": 0.6817, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 10.153287887573242, | |
| "learning_rate": 2.0572864321608042e-06, | |
| "loss": 0.645, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 8.705924987792969, | |
| "learning_rate": 2.0371859296482414e-06, | |
| "loss": 0.707, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 5.7329511642456055, | |
| "learning_rate": 2.0170854271356785e-06, | |
| "loss": 0.6834, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 0.9503761529922485, | |
| "eval_runtime": 21.641, | |
| "eval_samples_per_second": 46.209, | |
| "eval_steps_per_second": 5.776, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 6.902284622192383, | |
| "learning_rate": 1.9969849246231156e-06, | |
| "loss": 0.6237, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 5.6710710525512695, | |
| "learning_rate": 1.9768844221105527e-06, | |
| "loss": 0.6638, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 6.364370346069336, | |
| "learning_rate": 1.9567839195979903e-06, | |
| "loss": 0.6537, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 5.928137302398682, | |
| "learning_rate": 1.9366834170854274e-06, | |
| "loss": 0.6266, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 8.740313529968262, | |
| "learning_rate": 1.9165829145728645e-06, | |
| "loss": 0.6198, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 8.339399337768555, | |
| "learning_rate": 1.8964824120603017e-06, | |
| "loss": 0.6482, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 8.13129997253418, | |
| "learning_rate": 1.876381909547739e-06, | |
| "loss": 0.6521, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 10.06900405883789, | |
| "learning_rate": 1.856281407035176e-06, | |
| "loss": 0.6472, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 6.953003406524658, | |
| "learning_rate": 1.836180904522613e-06, | |
| "loss": 0.6185, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 7.572219371795654, | |
| "learning_rate": 1.8160804020100504e-06, | |
| "loss": 0.664, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 8.318469047546387, | |
| "learning_rate": 1.7959798994974875e-06, | |
| "loss": 0.6442, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 6.608754634857178, | |
| "learning_rate": 1.7758793969849246e-06, | |
| "loss": 0.6398, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 7.397676467895508, | |
| "learning_rate": 1.755778894472362e-06, | |
| "loss": 0.6689, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "grad_norm": 10.482325553894043, | |
| "learning_rate": 1.7356783919597991e-06, | |
| "loss": 0.6792, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 5.926417827606201, | |
| "learning_rate": 1.7155778894472364e-06, | |
| "loss": 0.6774, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 8.223274230957031, | |
| "learning_rate": 1.6954773869346736e-06, | |
| "loss": 0.6528, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 7.564822196960449, | |
| "learning_rate": 1.6753768844221107e-06, | |
| "loss": 0.6224, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 6.845765113830566, | |
| "learning_rate": 1.655276381909548e-06, | |
| "loss": 0.6984, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 6.044042587280273, | |
| "learning_rate": 1.6353768844221107e-06, | |
| "loss": 0.6211, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 12.825979232788086, | |
| "learning_rate": 1.615276381909548e-06, | |
| "loss": 0.6851, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 6.73763370513916, | |
| "learning_rate": 1.5951758793969851e-06, | |
| "loss": 0.6161, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 6.827399730682373, | |
| "learning_rate": 1.5750753768844223e-06, | |
| "loss": 0.6525, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 6.6664228439331055, | |
| "learning_rate": 1.5549748743718594e-06, | |
| "loss": 0.6617, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 9.772034645080566, | |
| "learning_rate": 1.5348743718592965e-06, | |
| "loss": 0.6687, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 6.625182151794434, | |
| "learning_rate": 1.5147738693467336e-06, | |
| "loss": 0.6545, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 10.207441329956055, | |
| "learning_rate": 1.494673366834171e-06, | |
| "loss": 0.6332, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 9.929265975952148, | |
| "learning_rate": 1.474572864321608e-06, | |
| "loss": 0.6391, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 6.050763130187988, | |
| "learning_rate": 1.4544723618090452e-06, | |
| "loss": 0.6708, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 5.504277229309082, | |
| "learning_rate": 1.4343718592964826e-06, | |
| "loss": 0.6578, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 7.113737106323242, | |
| "learning_rate": 1.4142713567839197e-06, | |
| "loss": 0.6419, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 7.181005001068115, | |
| "learning_rate": 1.394170854271357e-06, | |
| "loss": 0.6298, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 8.930741310119629, | |
| "learning_rate": 1.3740703517587942e-06, | |
| "loss": 0.6734, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 6.288244724273682, | |
| "learning_rate": 1.3539698492462313e-06, | |
| "loss": 0.6307, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 6.91972017288208, | |
| "learning_rate": 1.3338693467336686e-06, | |
| "loss": 0.676, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 8.017012596130371, | |
| "learning_rate": 1.3137688442211055e-06, | |
| "loss": 0.6157, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 4.738548755645752, | |
| "learning_rate": 1.2936683417085427e-06, | |
| "loss": 0.679, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 6.201863765716553, | |
| "learning_rate": 1.27356783919598e-06, | |
| "loss": 0.6542, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 7.595000267028809, | |
| "learning_rate": 1.2534673366834171e-06, | |
| "loss": 0.6659, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 5.57780647277832, | |
| "learning_rate": 1.2333668341708543e-06, | |
| "loss": 0.6381, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 8.426780700683594, | |
| "learning_rate": 1.2132663316582916e-06, | |
| "loss": 0.6705, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 7.012176990509033, | |
| "learning_rate": 1.1931658291457287e-06, | |
| "loss": 0.6874, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 7.747401237487793, | |
| "learning_rate": 1.173065326633166e-06, | |
| "loss": 0.6317, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 4.817531108856201, | |
| "learning_rate": 1.1529648241206032e-06, | |
| "loss": 0.6083, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 6.916783332824707, | |
| "learning_rate": 1.1328643216080403e-06, | |
| "loss": 0.6619, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 7.570366382598877, | |
| "learning_rate": 1.1127638190954775e-06, | |
| "loss": 0.6471, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 8.70361328125, | |
| "learning_rate": 1.0926633165829146e-06, | |
| "loss": 0.6483, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 9.341569900512695, | |
| "learning_rate": 1.072562814070352e-06, | |
| "loss": 0.6194, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 4.283209800720215, | |
| "learning_rate": 1.052462311557789e-06, | |
| "loss": 0.6111, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 8.134038925170898, | |
| "learning_rate": 1.0323618090452262e-06, | |
| "loss": 0.632, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 8.605172157287598, | |
| "learning_rate": 1.0122613065326633e-06, | |
| "loss": 0.6341, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 8.067020416259766, | |
| "learning_rate": 9.921608040201006e-07, | |
| "loss": 0.6694, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 6.967876434326172, | |
| "learning_rate": 9.720603015075378e-07, | |
| "loss": 0.648, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 8.443940162658691, | |
| "learning_rate": 9.51959798994975e-07, | |
| "loss": 0.6174, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 8.791583061218262, | |
| "learning_rate": 9.318592964824122e-07, | |
| "loss": 0.6463, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 8.055484771728516, | |
| "learning_rate": 9.117587939698493e-07, | |
| "loss": 0.5966, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 5.009509563446045, | |
| "learning_rate": 8.916582914572865e-07, | |
| "loss": 0.6147, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 5.755350589752197, | |
| "learning_rate": 8.715577889447237e-07, | |
| "loss": 0.6101, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 8.774045944213867, | |
| "learning_rate": 8.514572864321608e-07, | |
| "loss": 0.6332, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 6.463279724121094, | |
| "learning_rate": 8.315577889447237e-07, | |
| "loss": 0.6705, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 5.299009323120117, | |
| "learning_rate": 8.114572864321608e-07, | |
| "loss": 0.6605, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 6.5152130126953125, | |
| "learning_rate": 7.91356783919598e-07, | |
| "loss": 0.6456, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 8.499478340148926, | |
| "learning_rate": 7.712562814070353e-07, | |
| "loss": 0.6454, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 8.317819595336914, | |
| "learning_rate": 7.511557788944725e-07, | |
| "loss": 0.5961, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "grad_norm": 7.257504940032959, | |
| "learning_rate": 7.310552763819095e-07, | |
| "loss": 0.614, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 3.862578868865967, | |
| "learning_rate": 7.109547738693468e-07, | |
| "loss": 0.6388, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 8.748353958129883, | |
| "learning_rate": 6.90854271356784e-07, | |
| "loss": 0.6222, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 8.883009910583496, | |
| "learning_rate": 6.707537688442211e-07, | |
| "loss": 0.639, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 7.332880973815918, | |
| "learning_rate": 6.506532663316584e-07, | |
| "loss": 0.6341, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 7.421239852905273, | |
| "learning_rate": 6.305527638190956e-07, | |
| "loss": 0.6378, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 6.633522033691406, | |
| "learning_rate": 6.104522613065327e-07, | |
| "loss": 0.6587, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 6.347668170928955, | |
| "learning_rate": 5.903517587939699e-07, | |
| "loss": 0.6355, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 5.266615390777588, | |
| "learning_rate": 5.702512562814071e-07, | |
| "loss": 0.5976, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 5.0562286376953125, | |
| "learning_rate": 5.501507537688443e-07, | |
| "loss": 0.6426, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 9.852864265441895, | |
| "learning_rate": 5.300502512562814e-07, | |
| "loss": 0.6434, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 5.227302551269531, | |
| "learning_rate": 5.099497487437187e-07, | |
| "loss": 0.674, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 7.586268424987793, | |
| "learning_rate": 4.900502512562814e-07, | |
| "loss": 0.6826, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 7.512186527252197, | |
| "learning_rate": 4.699497487437187e-07, | |
| "loss": 0.6428, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 8.383907318115234, | |
| "learning_rate": 4.498492462311558e-07, | |
| "loss": 0.6215, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 6.214056968688965, | |
| "learning_rate": 4.29748743718593e-07, | |
| "loss": 0.6066, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 8.587347030639648, | |
| "learning_rate": 4.096482412060302e-07, | |
| "loss": 0.6213, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 14.060787200927734, | |
| "learning_rate": 3.8954773869346735e-07, | |
| "loss": 0.6151, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 11.65833568572998, | |
| "learning_rate": 3.694472361809046e-07, | |
| "loss": 0.6226, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 5.729846477508545, | |
| "learning_rate": 3.4934673366834176e-07, | |
| "loss": 0.6265, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 5.596776485443115, | |
| "learning_rate": 3.292462311557789e-07, | |
| "loss": 0.6048, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 5.834877967834473, | |
| "learning_rate": 3.091457286432161e-07, | |
| "loss": 0.6358, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 7.830298900604248, | |
| "learning_rate": 2.890452261306533e-07, | |
| "loss": 0.6381, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 7.147890567779541, | |
| "learning_rate": 2.689447236180905e-07, | |
| "loss": 0.6428, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 5.18765926361084, | |
| "learning_rate": 2.4884422110552766e-07, | |
| "loss": 0.6098, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 7.276676654815674, | |
| "learning_rate": 2.2874371859296484e-07, | |
| "loss": 0.6329, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 7.58540678024292, | |
| "learning_rate": 2.0864321608040202e-07, | |
| "loss": 0.6095, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 5.402534008026123, | |
| "learning_rate": 1.8854271356783923e-07, | |
| "loss": 0.605, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 7.289499282836914, | |
| "learning_rate": 1.684422110552764e-07, | |
| "loss": 0.6694, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 7.618215560913086, | |
| "learning_rate": 1.483417085427136e-07, | |
| "loss": 0.6313, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 7.560898780822754, | |
| "learning_rate": 1.2824120603015077e-07, | |
| "loss": 0.6073, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 5.637300968170166, | |
| "learning_rate": 1.0834170854271359e-07, | |
| "loss": 0.6211, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 8.691441535949707, | |
| "learning_rate": 8.824120603015076e-08, | |
| "loss": 0.6085, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 4.510754585266113, | |
| "learning_rate": 6.814070351758795e-08, | |
| "loss": 0.6193, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 7.4050703048706055, | |
| "learning_rate": 4.804020100502513e-08, | |
| "loss": 0.6642, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 9.641931533813477, | |
| "learning_rate": 2.7939698492462312e-08, | |
| "loss": 0.6304, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 7.846133232116699, | |
| "learning_rate": 7.839195979899499e-09, | |
| "loss": 0.6181, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.9481298923492432, | |
| "eval_runtime": 21.6157, | |
| "eval_samples_per_second": 46.263, | |
| "eval_steps_per_second": 5.783, | |
| "step": 100000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 10000, | |
| "total_flos": 1.1800273747968e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |