{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 408768,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 1.4115859270095825,
      "learning_rate": 4.993884061374668e-05,
      "loss": 7.5674,
      "step": 500
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.173668384552002,
      "learning_rate": 4.987768122749335e-05,
      "loss": 7.1474,
      "step": 1000
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.1809221506118774,
      "learning_rate": 4.981652184124002e-05,
      "loss": 7.0218,
      "step": 1500
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.898521065711975,
      "learning_rate": 4.975536245498669e-05,
      "loss": 6.9342,
      "step": 2000
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.6537320613861084,
      "learning_rate": 4.969420306873337e-05,
      "loss": 6.8595,
      "step": 2500
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.9779026508331299,
      "learning_rate": 4.963304368248004e-05,
      "loss": 6.8074,
      "step": 3000
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.4129042625427246,
      "learning_rate": 4.957188429622671e-05,
      "loss": 6.7628,
      "step": 3500
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.0669405460357666,
      "learning_rate": 4.951072490997339e-05,
      "loss": 6.7217,
      "step": 4000
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.6271133422851562,
      "learning_rate": 4.944956552372006e-05,
      "loss": 6.6808,
      "step": 4500
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.944228410720825,
      "learning_rate": 4.938840613746673e-05,
      "loss": 6.6526,
      "step": 5000
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.9261572360992432,
      "learning_rate": 4.9327246751213406e-05,
      "loss": 6.6157,
      "step": 5500
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.7971737384796143,
      "learning_rate": 4.926608736496008e-05,
      "loss": 6.585,
      "step": 6000
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.061311960220337,
      "learning_rate": 4.920492797870675e-05,
      "loss": 6.5625,
      "step": 6500
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.9171266555786133,
      "learning_rate": 4.914376859245342e-05,
      "loss": 6.5476,
      "step": 7000
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.557237386703491,
      "learning_rate": 4.9082609206200096e-05,
      "loss": 6.5063,
      "step": 7500
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.9273940324783325,
      "learning_rate": 4.902144981994677e-05,
      "loss": 6.4848,
      "step": 8000
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.46166729927063,
      "learning_rate": 4.896029043369344e-05,
      "loss": 6.4549,
      "step": 8500
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.1991207599639893,
      "learning_rate": 4.889913104744012e-05,
      "loss": 6.4151,
      "step": 9000
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.0304417610168457,
      "learning_rate": 4.8837971661186786e-05,
      "loss": 6.3421,
      "step": 9500
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.5440313816070557,
      "learning_rate": 4.877681227493346e-05,
      "loss": 6.2588,
      "step": 10000
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.8865408897399902,
      "learning_rate": 4.8715652888680135e-05,
      "loss": 6.1081,
      "step": 10500
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.689366579055786,
      "learning_rate": 4.865449350242681e-05,
      "loss": 5.965,
      "step": 11000
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.3553738594055176,
      "learning_rate": 4.8593334116173477e-05,
      "loss": 5.8321,
      "step": 11500
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.879040479660034,
      "learning_rate": 4.853217472992016e-05,
      "loss": 5.7097,
      "step": 12000
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.504615306854248,
      "learning_rate": 4.8471015343666825e-05,
      "loss": 5.5818,
      "step": 12500
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.884676218032837,
      "learning_rate": 4.84098559574135e-05,
      "loss": 5.4549,
      "step": 13000
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.560129165649414,
      "learning_rate": 4.8348696571160174e-05,
      "loss": 5.3163,
      "step": 13500
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.542060613632202,
      "learning_rate": 4.828753718490685e-05,
      "loss": 5.1749,
      "step": 14000
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.485215425491333,
      "learning_rate": 4.8226377798653515e-05,
      "loss": 5.0056,
      "step": 14500
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.323887586593628,
      "learning_rate": 4.816521841240019e-05,
      "loss": 4.8195,
      "step": 15000
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.07159161567688,
      "learning_rate": 4.8104059026146864e-05,
      "loss": 4.6421,
      "step": 15500
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.2840936183929443,
      "learning_rate": 4.804289963989354e-05,
      "loss": 4.475,
      "step": 16000
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.858837366104126,
      "learning_rate": 4.798174025364021e-05,
      "loss": 4.3387,
      "step": 16500
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.913006544113159,
      "learning_rate": 4.792058086738688e-05,
      "loss": 4.2142,
      "step": 17000
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.8114237785339355,
      "learning_rate": 4.7859421481133554e-05,
      "loss": 4.116,
      "step": 17500
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.1581108570098877,
      "learning_rate": 4.779826209488023e-05,
      "loss": 4.0327,
      "step": 18000
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.9168930053710938,
      "learning_rate": 4.77371027086269e-05,
      "loss": 3.9404,
      "step": 18500
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.8912034034729004,
      "learning_rate": 4.767594332237357e-05,
      "loss": 3.874,
      "step": 19000
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.909111976623535,
      "learning_rate": 4.7614783936120244e-05,
      "loss": 3.8014,
      "step": 19500
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.0821359157562256,
      "learning_rate": 4.755362454986692e-05,
      "loss": 3.7285,
      "step": 20000
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.603447675704956,
      "learning_rate": 4.749246516361359e-05,
      "loss": 3.6757,
      "step": 20500
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.466923952102661,
      "learning_rate": 4.743130577736026e-05,
      "loss": 3.6054,
      "step": 21000
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.949532985687256,
      "learning_rate": 4.737014639110694e-05,
      "loss": 3.5478,
      "step": 21500
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.7499911785125732,
      "learning_rate": 4.730898700485361e-05,
      "loss": 3.5018,
      "step": 22000
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.8840396404266357,
      "learning_rate": 4.724782761860028e-05,
      "loss": 3.4503,
      "step": 22500
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.7901082038879395,
      "learning_rate": 4.718666823234696e-05,
      "loss": 3.385,
      "step": 23000
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.499488353729248,
      "learning_rate": 4.712550884609363e-05,
      "loss": 3.3385,
      "step": 23500
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.77323055267334,
      "learning_rate": 4.70643494598403e-05,
      "loss": 3.2953,
      "step": 24000
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.0041913986206055,
      "learning_rate": 4.700319007358698e-05,
      "loss": 3.2472,
      "step": 24500
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.72615909576416,
      "learning_rate": 4.694203068733365e-05,
      "loss": 3.2107,
      "step": 25000
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.766868829727173,
      "learning_rate": 4.688087130108032e-05,
      "loss": 3.1768,
      "step": 25500
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.5954720973968506,
      "learning_rate": 4.6819711914826996e-05,
      "loss": 3.1358,
      "step": 26000
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.7101314067840576,
      "learning_rate": 4.675855252857367e-05,
      "loss": 3.0949,
      "step": 26500
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.7481093406677246,
      "learning_rate": 4.669739314232034e-05,
      "loss": 3.0655,
      "step": 27000
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.869677782058716,
      "learning_rate": 4.663623375606702e-05,
      "loss": 3.0188,
      "step": 27500
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.081693649291992,
      "learning_rate": 4.6575074369813686e-05,
      "loss": 2.9893,
      "step": 28000
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.1503679752349854,
      "learning_rate": 4.651391498356036e-05,
      "loss": 2.975,
      "step": 28500
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.9327192306518555,
      "learning_rate": 4.645275559730703e-05,
      "loss": 2.9276,
      "step": 29000
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.580777406692505,
      "learning_rate": 4.639159621105371e-05,
      "loss": 2.9062,
      "step": 29500
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.6058924198150635,
      "learning_rate": 4.633043682480038e-05,
      "loss": 2.8895,
      "step": 30000
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.9445390701293945,
      "learning_rate": 4.626927743854705e-05,
      "loss": 2.8634,
      "step": 30500
    },
    {
      "epoch": 0.23,
      "grad_norm": 3.4329161643981934,
      "learning_rate": 4.6208118052293725e-05,
      "loss": 2.8399,
      "step": 31000
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.910855770111084,
      "learning_rate": 4.61469586660404e-05,
      "loss": 2.8167,
      "step": 31500
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.9852588176727295,
      "learning_rate": 4.608579927978707e-05,
      "loss": 2.8001,
      "step": 32000
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.6188220977783203,
      "learning_rate": 4.602463989353374e-05,
      "loss": 2.7929,
      "step": 32500
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.753516912460327,
      "learning_rate": 4.5963480507280416e-05,
      "loss": 2.7677,
      "step": 33000
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.5995850563049316,
      "learning_rate": 4.590232112102709e-05,
      "loss": 2.7399,
      "step": 33500
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.697634696960449,
      "learning_rate": 4.5841161734773764e-05,
      "loss": 2.7241,
      "step": 34000
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.0841758251190186,
      "learning_rate": 4.578000234852043e-05,
      "loss": 2.7143,
      "step": 34500
    },
    {
      "epoch": 0.26,
      "grad_norm": 3.0621519088745117,
      "learning_rate": 4.5718842962267106e-05,
      "loss": 2.6897,
      "step": 35000
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.912416934967041,
      "learning_rate": 4.565768357601378e-05,
      "loss": 2.6762,
      "step": 35500
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.95345401763916,
      "learning_rate": 4.5596524189760454e-05,
      "loss": 2.6708,
      "step": 36000
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.7842564582824707,
      "learning_rate": 4.553536480350712e-05,
      "loss": 2.6507,
      "step": 36500
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.9051456451416016,
      "learning_rate": 4.54742054172538e-05,
      "loss": 2.6361,
      "step": 37000
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.8248302936553955,
      "learning_rate": 4.541304603100047e-05,
      "loss": 2.619,
      "step": 37500
    },
    {
      "epoch": 0.28,
      "grad_norm": 3.076840400695801,
      "learning_rate": 4.5351886644747145e-05,
      "loss": 2.6011,
      "step": 38000
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.7716526985168457,
      "learning_rate": 4.529072725849382e-05,
      "loss": 2.5928,
      "step": 38500
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.8269171714782715,
      "learning_rate": 4.522956787224049e-05,
      "loss": 2.5813,
      "step": 39000
    },
    {
      "epoch": 0.29,
      "grad_norm": 3.041726589202881,
      "learning_rate": 4.516840848598716e-05,
      "loss": 2.5583,
      "step": 39500
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.676236391067505,
      "learning_rate": 4.510724909973384e-05,
      "loss": 2.5538,
      "step": 40000
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.8164145946502686,
      "learning_rate": 4.504608971348051e-05,
      "loss": 2.55,
      "step": 40500
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.637202024459839,
      "learning_rate": 4.498493032722718e-05,
      "loss": 2.5273,
      "step": 41000
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.9004831314086914,
      "learning_rate": 4.492377094097385e-05,
      "loss": 2.5142,
      "step": 41500
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.7393102645874023,
      "learning_rate": 4.486261155472053e-05,
      "loss": 2.5093,
      "step": 42000
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.814011335372925,
      "learning_rate": 4.48014521684672e-05,
      "loss": 2.4936,
      "step": 42500
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.625415802001953,
      "learning_rate": 4.4740292782213874e-05,
      "loss": 2.4787,
      "step": 43000
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.955406427383423,
      "learning_rate": 4.467913339596055e-05,
      "loss": 2.471,
      "step": 43500
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.6002650260925293,
      "learning_rate": 4.461797400970722e-05,
      "loss": 2.4626,
      "step": 44000
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.75281023979187,
      "learning_rate": 4.455681462345389e-05,
      "loss": 2.459,
      "step": 44500
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.5558836460113525,
      "learning_rate": 4.449565523720057e-05,
      "loss": 2.4441,
      "step": 45000
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.634889602661133,
      "learning_rate": 4.443449585094724e-05,
      "loss": 2.4392,
      "step": 45500
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.015256643295288,
      "learning_rate": 4.437333646469391e-05,
      "loss": 2.4218,
      "step": 46000
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.656592607498169,
      "learning_rate": 4.431217707844059e-05,
      "loss": 2.4148,
      "step": 46500
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.9560022354125977,
      "learning_rate": 4.425101769218726e-05,
      "loss": 2.4074,
      "step": 47000
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.8009862899780273,
      "learning_rate": 4.418985830593393e-05,
      "loss": 2.4028,
      "step": 47500
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.7291080951690674,
      "learning_rate": 4.41286989196806e-05,
      "loss": 2.3874,
      "step": 48000
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.959327220916748,
      "learning_rate": 4.406753953342728e-05,
      "loss": 2.3803,
      "step": 48500
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.4076898097991943,
      "learning_rate": 4.400638014717395e-05,
      "loss": 2.3641,
      "step": 49000
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.703214168548584,
      "learning_rate": 4.3945220760920625e-05,
      "loss": 2.3707,
      "step": 49500
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.7057530879974365,
      "learning_rate": 4.388406137466729e-05,
      "loss": 2.3551,
      "step": 50000
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.656576156616211,
      "learning_rate": 4.382290198841397e-05,
      "loss": 2.3501,
      "step": 50500
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.7500181198120117,
      "learning_rate": 4.376174260216064e-05,
      "loss": 2.342,
      "step": 51000
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.560018301010132,
      "learning_rate": 4.3700583215907316e-05,
      "loss": 2.3378,
      "step": 51500
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.5425586700439453,
      "learning_rate": 4.363942382965398e-05,
      "loss": 2.3263,
      "step": 52000
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.7227046489715576,
      "learning_rate": 4.357826444340066e-05,
      "loss": 2.3216,
      "step": 52500
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.8094546794891357,
      "learning_rate": 4.351710505714733e-05,
      "loss": 2.3168,
      "step": 53000
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.559802293777466,
      "learning_rate": 4.3455945670894006e-05,
      "loss": 2.3129,
      "step": 53500
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.9485251903533936,
      "learning_rate": 4.3394786284640673e-05,
      "loss": 2.3055,
      "step": 54000
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.554258346557617,
      "learning_rate": 4.3333626898387355e-05,
      "loss": 2.2897,
      "step": 54500
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.9118120670318604,
      "learning_rate": 4.327246751213402e-05,
      "loss": 2.2804,
      "step": 55000
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.6395227909088135,
      "learning_rate": 4.3211308125880696e-05,
      "loss": 2.285,
      "step": 55500
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.8277575969696045,
      "learning_rate": 4.315014873962737e-05,
      "loss": 2.2725,
      "step": 56000
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.6950581073760986,
      "learning_rate": 4.3088989353374045e-05,
      "loss": 2.2726,
      "step": 56500
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.5823190212249756,
      "learning_rate": 4.302782996712071e-05,
      "loss": 2.2676,
      "step": 57000
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.9325387477874756,
      "learning_rate": 4.296667058086739e-05,
      "loss": 2.25,
      "step": 57500
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.915308952331543,
      "learning_rate": 4.290551119461406e-05,
      "loss": 2.237,
      "step": 58000
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.782322883605957,
      "learning_rate": 4.2844351808360735e-05,
      "loss": 2.2435,
      "step": 58500
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.8868165016174316,
      "learning_rate": 4.278319242210741e-05,
      "loss": 2.2282,
      "step": 59000
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.906238079071045,
      "learning_rate": 4.2722033035854084e-05,
      "loss": 2.2278,
      "step": 59500
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.8209023475646973,
      "learning_rate": 4.266087364960075e-05,
      "loss": 2.2212,
      "step": 60000
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.944169044494629,
      "learning_rate": 4.259971426334743e-05,
      "loss": 2.2107,
      "step": 60500
    },
    {
      "epoch": 0.45,
      "grad_norm": 3.010463237762451,
      "learning_rate": 4.25385548770941e-05,
      "loss": 2.2112,
      "step": 61000
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.5480105876922607,
      "learning_rate": 4.2477395490840774e-05,
      "loss": 2.2232,
      "step": 61500
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.619253635406494,
      "learning_rate": 4.241623610458745e-05,
      "loss": 2.1997,
      "step": 62000
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.716602325439453,
      "learning_rate": 4.235507671833412e-05,
      "loss": 2.1967,
      "step": 62500
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.7304656505584717,
      "learning_rate": 4.229391733208079e-05,
      "loss": 2.1865,
      "step": 63000
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.3193280696868896,
      "learning_rate": 4.2232757945827464e-05,
      "loss": 2.1783,
      "step": 63500
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.659099817276001,
      "learning_rate": 4.217159855957414e-05,
      "loss": 2.1669,
      "step": 64000
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.8396430015563965,
      "learning_rate": 4.211043917332081e-05,
      "loss": 2.1794,
      "step": 64500
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.5535542964935303,
      "learning_rate": 4.204927978706748e-05,
      "loss": 2.1688,
      "step": 65000
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.776742696762085,
      "learning_rate": 4.1988120400814154e-05,
      "loss": 2.1661,
      "step": 65500
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.4852330684661865,
      "learning_rate": 4.192696101456083e-05,
      "loss": 2.1591,
      "step": 66000
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.8885903358459473,
      "learning_rate": 4.1865801628307496e-05,
      "loss": 2.1581,
      "step": 66500
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.6640567779541016,
      "learning_rate": 4.180464224205418e-05,
      "loss": 2.1549,
      "step": 67000
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.27127742767334,
      "learning_rate": 4.1743482855800845e-05,
      "loss": 2.1492,
      "step": 67500
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.591395139694214,
      "learning_rate": 4.168232346954752e-05,
      "loss": 2.1382,
      "step": 68000
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.8147051334381104,
      "learning_rate": 4.162116408329419e-05,
      "loss": 2.1407,
      "step": 68500
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.650275707244873,
      "learning_rate": 4.156000469704087e-05,
      "loss": 2.1345,
      "step": 69000
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.7816410064697266,
      "learning_rate": 4.1498845310787535e-05,
      "loss": 2.1371,
      "step": 69500
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.6034348011016846,
      "learning_rate": 4.1437685924534216e-05,
      "loss": 2.1264,
      "step": 70000
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.6060938835144043,
      "learning_rate": 4.1376526538280883e-05,
      "loss": 2.1186,
      "step": 70500
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.754519462585449,
      "learning_rate": 4.131536715202756e-05,
      "loss": 2.1167,
      "step": 71000
    },
    {
      "epoch": 0.52,
      "grad_norm": 2.665511131286621,
      "learning_rate": 4.125420776577423e-05,
      "loss": 2.1025,
      "step": 71500
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.8608968257904053,
      "learning_rate": 4.1193048379520906e-05,
      "loss": 2.1087,
      "step": 72000
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.521726369857788,
      "learning_rate": 4.1131888993267574e-05,
      "loss": 2.1035,
      "step": 72500
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.713449001312256,
      "learning_rate": 4.1070729607014255e-05,
      "loss": 2.0853,
      "step": 73000
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.6165554523468018,
      "learning_rate": 4.100957022076092e-05,
      "loss": 2.1032,
      "step": 73500
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.5650808811187744,
      "learning_rate": 4.0948410834507596e-05,
      "loss": 2.0823,
      "step": 74000
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.7067785263061523,
      "learning_rate": 4.0887251448254264e-05,
      "loss": 2.0843,
      "step": 74500
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.6049702167510986,
      "learning_rate": 4.0826092062000945e-05,
      "loss": 2.0851,
      "step": 75000
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.487355947494507,
      "learning_rate": 4.076493267574761e-05,
      "loss": 2.0745,
      "step": 75500
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.4251976013183594,
      "learning_rate": 4.070377328949429e-05,
      "loss": 2.0728,
      "step": 76000
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.6600770950317383,
      "learning_rate": 4.064261390324096e-05,
      "loss": 2.0725,
      "step": 76500
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.488598346710205,
      "learning_rate": 4.0581454516987635e-05,
      "loss": 2.0638,
      "step": 77000
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.8305716514587402,
      "learning_rate": 4.05202951307343e-05,
      "loss": 2.0673,
      "step": 77500
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.607948064804077,
      "learning_rate": 4.0459135744480984e-05,
      "loss": 2.0698,
      "step": 78000
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.558473825454712,
      "learning_rate": 4.039797635822765e-05,
      "loss": 2.0531,
      "step": 78500
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.675361394882202,
      "learning_rate": 4.0336816971974326e-05,
      "loss": 2.0568,
      "step": 79000
    },
    {
      "epoch": 0.58,
      "grad_norm": 2.430924654006958,
      "learning_rate": 4.0275657585721e-05,
      "loss": 2.0472,
      "step": 79500
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.637683153152466,
      "learning_rate": 4.021449819946767e-05,
      "loss": 2.0422,
      "step": 80000
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.8251748085021973,
      "learning_rate": 4.015333881321434e-05,
      "loss": 2.0438,
      "step": 80500
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.9130163192749023,
      "learning_rate": 4.0092179426961016e-05,
      "loss": 2.0334,
      "step": 81000
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.6857762336730957,
      "learning_rate": 4.003102004070769e-05,
      "loss": 2.0298,
      "step": 81500
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.7029411792755127,
      "learning_rate": 3.996986065445436e-05,
      "loss": 2.0266,
      "step": 82000
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.43390154838562,
      "learning_rate": 3.990870126820104e-05,
      "loss": 2.0323,
      "step": 82500
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.8077657222747803,
      "learning_rate": 3.9847541881947706e-05,
      "loss": 2.0204,
      "step": 83000
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.6263062953948975,
      "learning_rate": 3.978638249569438e-05,
      "loss": 2.0238,
      "step": 83500
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.582228660583496,
      "learning_rate": 3.9725223109441055e-05,
      "loss": 2.0157,
      "step": 84000
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.989870548248291,
      "learning_rate": 3.966406372318773e-05,
      "loss": 2.0123,
      "step": 84500
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.50876522064209,
      "learning_rate": 3.9602904336934396e-05,
      "loss": 2.0049,
      "step": 85000
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.754103183746338,
      "learning_rate": 3.954174495068108e-05,
      "loss": 2.0108,
      "step": 85500
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.742558240890503,
      "learning_rate": 3.9480585564427745e-05,
      "loss": 2.0013,
      "step": 86000
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.7211074829101562,
      "learning_rate": 3.941942617817442e-05,
      "loss": 2.0056,
      "step": 86500
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.5889461040496826,
      "learning_rate": 3.9358266791921087e-05,
      "loss": 1.9933,
      "step": 87000
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.581122398376465,
      "learning_rate": 3.929710740566777e-05,
      "loss": 1.9896,
      "step": 87500
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.7021663188934326,
      "learning_rate": 3.9235948019414435e-05,
      "loss": 1.9878,
      "step": 88000
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.6844136714935303,
      "learning_rate": 3.917478863316111e-05,
      "loss": 1.9838,
      "step": 88500
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.6349422931671143,
      "learning_rate": 3.9113629246907784e-05,
      "loss": 1.9796,
      "step": 89000
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.6799721717834473,
      "learning_rate": 3.905246986065446e-05,
      "loss": 1.9793,
      "step": 89500
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.502464771270752,
      "learning_rate": 3.8991310474401125e-05,
      "loss": 1.9789,
      "step": 90000
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.897421360015869,
      "learning_rate": 3.8930151088147806e-05,
      "loss": 1.9766,
      "step": 90500
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.6226820945739746,
      "learning_rate": 3.8868991701894474e-05,
      "loss": 1.9785,
      "step": 91000
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.7630228996276855,
      "learning_rate": 3.880783231564115e-05,
      "loss": 1.9805,
      "step": 91500
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.7849583625793457,
      "learning_rate": 3.874667292938782e-05,
      "loss": 1.9672,
      "step": 92000
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.643397569656372,
      "learning_rate": 3.86855135431345e-05,
      "loss": 1.9618,
      "step": 92500
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.6938283443450928,
      "learning_rate": 3.8624354156881164e-05,
      "loss": 1.9674,
      "step": 93000
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.7914974689483643,
      "learning_rate": 3.856319477062784e-05,
      "loss": 1.9616,
      "step": 93500
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.6223716735839844,
      "learning_rate": 3.850203538437451e-05,
      "loss": 1.9566,
      "step": 94000
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.6575443744659424,
      "learning_rate": 3.844087599812119e-05,
      "loss": 1.9611,
      "step": 94500
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.684488534927368,
      "learning_rate": 3.837971661186786e-05,
      "loss": 1.9467,
      "step": 95000
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.668365001678467,
      "learning_rate": 3.831855722561453e-05,
      "loss": 1.9548,
      "step": 95500
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.967519521713257,
      "learning_rate": 3.82573978393612e-05,
      "loss": 1.9535,
      "step": 96000
    },
    {
      "epoch": 0.71,
      "grad_norm": 2.5876193046569824,
      "learning_rate": 3.819623845310788e-05,
      "loss": 1.9446,
      "step": 96500
    },
    {
      "epoch": 0.71,
      "grad_norm": 2.7293176651000977,
      "learning_rate": 3.813507906685455e-05,
      "loss": 1.9481,
      "step": 97000
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.6928365230560303,
      "learning_rate": 3.807391968060122e-05,
      "loss": 1.9296,
      "step": 97500
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.568150043487549,
      "learning_rate": 3.801276029434789e-05,
      "loss": 1.9422,
      "step": 98000
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.7275748252868652,
      "learning_rate": 3.795160090809457e-05,
      "loss": 1.9314,
      "step": 98500
    },
    {
      "epoch": 0.73,
      "grad_norm": 2.613135576248169,
      "learning_rate": 3.789044152184124e-05,
      "loss": 1.931,
      "step": 99000
    },
    {
      "epoch": 0.73,
      "grad_norm": 2.408534049987793,
      "learning_rate": 3.782928213558791e-05,
      "loss": 1.9314,
      "step": 99500
    },
    {
      "epoch": 0.73,
      "grad_norm": 2.896430015563965,
      "learning_rate": 3.776812274933459e-05,
      "loss": 1.9221,
      "step": 100000
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.6729750633239746,
      "learning_rate": 3.770696336308126e-05,
      "loss": 1.9247,
      "step": 100500
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.5819644927978516,
      "learning_rate": 3.764580397682793e-05,
      "loss": 1.9243,
      "step": 101000
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.561739206314087,
      "learning_rate": 3.7584644590574606e-05,
      "loss": 1.9291,
      "step": 101500
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.663254976272583,
      "learning_rate": 3.752348520432128e-05,
      "loss": 1.9138,
      "step": 102000
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.662156581878662,
      "learning_rate": 3.746232581806795e-05,
      "loss": 1.9128,
      "step": 102500
    },
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.6324357986450195, | |
| "learning_rate": 3.740116643181463e-05, | |
| "loss": 1.9215, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.619344711303711, | |
| "learning_rate": 3.7340007045561297e-05, | |
| "loss": 1.9043, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.681112051010132, | |
| "learning_rate": 3.727884765930797e-05, | |
| "loss": 1.9077, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.846804141998291, | |
| "learning_rate": 3.7217688273054645e-05, | |
| "loss": 1.9034, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 3.0270516872406006, | |
| "learning_rate": 3.715652888680132e-05, | |
| "loss": 1.901, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.5290517807006836, | |
| "learning_rate": 3.709536950054799e-05, | |
| "loss": 1.9031, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 2.661867380142212, | |
| "learning_rate": 3.703421011429467e-05, | |
| "loss": 1.902, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 2.816241979598999, | |
| "learning_rate": 3.6973050728041335e-05, | |
| "loss": 1.8885, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.8065085411071777, | |
| "learning_rate": 3.691189134178801e-05, | |
| "loss": 1.8931, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.4863102436065674, | |
| "learning_rate": 3.6850731955534684e-05, | |
| "loss": 1.8907, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 2.4044525623321533, | |
| "learning_rate": 3.678957256928136e-05, | |
| "loss": 1.8911, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.6208319664001465, | |
| "learning_rate": 3.6728413183028026e-05, | |
| "loss": 1.8889, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.4432547092437744, | |
| "learning_rate": 3.66672537967747e-05, | |
| "loss": 1.8794, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.9175052642822266, | |
| "learning_rate": 3.6606094410521374e-05, | |
| "loss": 1.8769, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 2.7171223163604736, | |
| "learning_rate": 3.654493502426805e-05, | |
| "loss": 1.8834, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 2.5070419311523438, | |
| "learning_rate": 3.6483775638014716e-05, | |
| "loss": 1.8706, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 2.59771990776062, | |
| "learning_rate": 3.642261625176139e-05, | |
| "loss": 1.8687, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 3.022465944290161, | |
| "learning_rate": 3.6361456865508064e-05, | |
| "loss": 1.8721, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 2.8927907943725586, | |
| "learning_rate": 3.630029747925474e-05, | |
| "loss": 1.8761, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 2.612518787384033, | |
| "learning_rate": 3.623913809300141e-05, | |
| "loss": 1.8744, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 2.7625935077667236, | |
| "learning_rate": 3.617797870674808e-05, | |
| "loss": 1.8636, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 2.535382032394409, | |
| "learning_rate": 3.6116819320494755e-05, | |
| "loss": 1.8683, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 2.575298547744751, | |
| "learning_rate": 3.605565993424143e-05, | |
| "loss": 1.8657, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 2.6413776874542236, | |
| "learning_rate": 3.59945005479881e-05, | |
| "loss": 1.8571, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 2.675283908843994, | |
| "learning_rate": 3.593334116173477e-05, | |
| "loss": 1.8575, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.7104618549346924, | |
| "learning_rate": 3.587218177548145e-05, | |
| "loss": 1.8555, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.7391483783721924, | |
| "learning_rate": 3.581102238922812e-05, | |
| "loss": 1.8555, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 2.561523675918579, | |
| "learning_rate": 3.5749863002974793e-05, | |
| "loss": 1.8475, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 2.590790033340454, | |
| "learning_rate": 3.568870361672147e-05, | |
| "loss": 1.8425, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 2.889119863510132, | |
| "learning_rate": 3.562754423046814e-05, | |
| "loss": 1.8491, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 2.77632212638855, | |
| "learning_rate": 3.556638484421481e-05, | |
| "loss": 1.8469, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 2.720357894897461, | |
| "learning_rate": 3.550522545796149e-05, | |
| "loss": 1.8462, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 2.8213210105895996, | |
| "learning_rate": 3.544406607170816e-05, | |
| "loss": 1.8399, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.834599733352661, | |
| "learning_rate": 3.538290668545483e-05, | |
| "loss": 1.8429, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.578364133834839, | |
| "learning_rate": 3.5321747299201506e-05, | |
| "loss": 1.8386, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.6725339889526367, | |
| "learning_rate": 3.526058791294818e-05, | |
| "loss": 1.85, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 2.5288286209106445, | |
| "learning_rate": 3.519942852669485e-05, | |
| "loss": 1.831, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 2.4805219173431396, | |
| "learning_rate": 3.513826914044152e-05, | |
| "loss": 1.836, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.6729605197906494, | |
| "learning_rate": 3.50771097541882e-05, | |
| "loss": 1.8297, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.5666863918304443, | |
| "learning_rate": 3.501595036793487e-05, | |
| "loss": 1.8231, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.5059523582458496, | |
| "learning_rate": 3.495479098168154e-05, | |
| "loss": 1.8296, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.758755922317505, | |
| "learning_rate": 3.489363159542822e-05, | |
| "loss": 1.8218, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.5374386310577393, | |
| "learning_rate": 3.483247220917489e-05, | |
| "loss": 1.8234, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.5575644969940186, | |
| "learning_rate": 3.477131282292156e-05, | |
| "loss": 1.8158, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.567166328430176, | |
| "learning_rate": 3.4710153436668236e-05, | |
| "loss": 1.8179, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.4243805408477783, | |
| "learning_rate": 3.464899405041491e-05, | |
| "loss": 1.82, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.665632486343384, | |
| "learning_rate": 3.458783466416158e-05, | |
| "loss": 1.8151, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 2.90759539604187, | |
| "learning_rate": 3.452667527790825e-05, | |
| "loss": 1.8109, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 2.6775169372558594, | |
| "learning_rate": 3.4465515891654926e-05, | |
| "loss": 1.8199, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.430788516998291, | |
| "learning_rate": 3.44043565054016e-05, | |
| "loss": 1.8128, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.9997665882110596, | |
| "learning_rate": 3.4343197119148274e-05, | |
| "loss": 1.8106, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.672255039215088, | |
| "learning_rate": 3.428203773289494e-05, | |
| "loss": 1.8141, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.4398484230041504, | |
| "learning_rate": 3.4220878346641616e-05, | |
| "loss": 1.8036, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.897477149963379, | |
| "learning_rate": 3.415971896038829e-05, | |
| "loss": 1.8006, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.7050111293792725, | |
| "learning_rate": 3.4098559574134965e-05, | |
| "loss": 1.803, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.503981828689575, | |
| "learning_rate": 3.403740018788163e-05, | |
| "loss": 1.7992, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.9682281017303467, | |
| "learning_rate": 3.397624080162831e-05, | |
| "loss": 1.796, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 3.1613168716430664, | |
| "learning_rate": 3.391508141537498e-05, | |
| "loss": 1.8083, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 2.6427714824676514, | |
| "learning_rate": 3.3853922029121655e-05, | |
| "loss": 1.7968, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 2.6238105297088623, | |
| "learning_rate": 3.379276264286832e-05, | |
| "loss": 1.7887, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 2.566740036010742, | |
| "learning_rate": 3.3731603256615e-05, | |
| "loss": 1.7979, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 2.6818795204162598, | |
| "learning_rate": 3.367044387036167e-05, | |
| "loss": 1.7856, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 2.7290897369384766, | |
| "learning_rate": 3.3609284484108345e-05, | |
| "loss": 1.785, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 2.8657619953155518, | |
| "learning_rate": 3.354812509785502e-05, | |
| "loss": 1.7925, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 2.4724438190460205, | |
| "learning_rate": 3.3486965711601694e-05, | |
| "loss": 1.7854, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 2.658123016357422, | |
| "learning_rate": 3.342580632534836e-05, | |
| "loss": 1.7904, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.828024387359619, | |
| "learning_rate": 3.336464693909504e-05, | |
| "loss": 1.7768, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.7401976585388184, | |
| "learning_rate": 3.330348755284171e-05, | |
| "loss": 1.7808, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 2.651334524154663, | |
| "learning_rate": 3.3242328166588384e-05, | |
| "loss": 1.7769, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 2.9663844108581543, | |
| "learning_rate": 3.318116878033506e-05, | |
| "loss": 1.7685, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 2.6409363746643066, | |
| "learning_rate": 3.312000939408173e-05, | |
| "loss": 1.7819, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 2.4980876445770264, | |
| "learning_rate": 3.30588500078284e-05, | |
| "loss": 1.7781, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 2.583472967147827, | |
| "learning_rate": 3.299769062157508e-05, | |
| "loss": 1.7642, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 2.7035281658172607, | |
| "learning_rate": 3.293653123532175e-05, | |
| "loss": 1.7707, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 2.647327184677124, | |
| "learning_rate": 3.287537184906842e-05, | |
| "loss": 1.77, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 2.6016039848327637, | |
| "learning_rate": 3.28142124628151e-05, | |
| "loss": 1.7689, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 2.871412515640259, | |
| "learning_rate": 3.275305307656177e-05, | |
| "loss": 1.7614, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.5391454696655273, | |
| "learning_rate": 3.269189369030844e-05, | |
| "loss": 1.7666, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.7399189472198486, | |
| "learning_rate": 3.263073430405511e-05, | |
| "loss": 1.7698, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.550523281097412, | |
| "learning_rate": 3.256957491780179e-05, | |
| "loss": 1.7661, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.7033884525299072, | |
| "learning_rate": 3.250841553154846e-05, | |
| "loss": 1.7576, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.712890386581421, | |
| "learning_rate": 3.244725614529513e-05, | |
| "loss": 1.759, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 2.7690348625183105, | |
| "learning_rate": 3.23860967590418e-05, | |
| "loss": 1.757, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 2.5629446506500244, | |
| "learning_rate": 3.232493737278848e-05, | |
| "loss": 1.7593, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 2.5981359481811523, | |
| "learning_rate": 3.226377798653515e-05, | |
| "loss": 1.7506, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 2.465391159057617, | |
| "learning_rate": 3.2202618600281826e-05, | |
| "loss": 1.7558, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 3.0468790531158447, | |
| "learning_rate": 3.2141459214028493e-05, | |
| "loss": 1.7559, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 2.5434939861297607, | |
| "learning_rate": 3.208029982777517e-05, | |
| "loss": 1.7507, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 2.479449987411499, | |
| "learning_rate": 3.201914044152184e-05, | |
| "loss": 1.7457, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 2.621965169906616, | |
| "learning_rate": 3.1957981055268516e-05, | |
| "loss": 1.7513, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 2.6055707931518555, | |
| "learning_rate": 3.1896821669015184e-05, | |
| "loss": 1.7427, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 2.2937374114990234, | |
| "learning_rate": 3.1835662282761865e-05, | |
| "loss": 1.7524, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 2.7363667488098145, | |
| "learning_rate": 3.177450289650853e-05, | |
| "loss": 1.7492, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 2.660330057144165, | |
| "learning_rate": 3.1713343510255207e-05, | |
| "loss": 1.7405, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 2.589137077331543, | |
| "learning_rate": 3.165218412400188e-05, | |
| "loss": 1.7389, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 2.7419252395629883, | |
| "learning_rate": 3.1591024737748555e-05, | |
| "loss": 1.7478, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 2.6772820949554443, | |
| "learning_rate": 3.152986535149522e-05, | |
| "loss": 1.7365, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 2.765460968017578, | |
| "learning_rate": 3.1468705965241904e-05, | |
| "loss": 1.7399, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.5273730754852295, | |
| "learning_rate": 3.140754657898857e-05, | |
| "loss": 1.7336, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.6962389945983887, | |
| "learning_rate": 3.1346387192735245e-05, | |
| "loss": 1.7347, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.6664233207702637, | |
| "learning_rate": 3.128522780648192e-05, | |
| "loss": 1.7212, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 2.923704147338867, | |
| "learning_rate": 3.1224068420228594e-05, | |
| "loss": 1.7327, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 2.7295753955841064, | |
| "learning_rate": 3.116290903397526e-05, | |
| "loss": 1.7304, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 2.6523427963256836, | |
| "learning_rate": 3.110174964772194e-05, | |
| "loss": 1.7295, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 2.544809341430664, | |
| "learning_rate": 3.104059026146861e-05, | |
| "loss": 1.7303, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 2.608091354370117, | |
| "learning_rate": 3.0979430875215284e-05, | |
| "loss": 1.7288, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 2.6781680583953857, | |
| "learning_rate": 3.091827148896195e-05, | |
| "loss": 1.7333, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 2.5772953033447266, | |
| "learning_rate": 3.085711210270863e-05, | |
| "loss": 1.7313, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 2.6381750106811523, | |
| "learning_rate": 3.07959527164553e-05, | |
| "loss": 1.723, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 2.609584093093872, | |
| "learning_rate": 3.0734793330201974e-05, | |
| "loss": 1.7317, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 2.556508779525757, | |
| "learning_rate": 3.067363394394865e-05, | |
| "loss": 1.7168, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 2.4912405014038086, | |
| "learning_rate": 3.061247455769532e-05, | |
| "loss": 1.7221, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 3.072758436203003, | |
| "learning_rate": 3.055131517144199e-05, | |
| "loss": 1.7193, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 2.5012080669403076, | |
| "learning_rate": 3.0490155785188668e-05, | |
| "loss": 1.7187, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 2.6786646842956543, | |
| "learning_rate": 3.042899639893534e-05, | |
| "loss": 1.7099, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 2.6225085258483887, | |
| "learning_rate": 3.036783701268201e-05, | |
| "loss": 1.7135, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 2.6484742164611816, | |
| "learning_rate": 3.0306677626428687e-05, | |
| "loss": 1.7086, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 2.948645830154419, | |
| "learning_rate": 3.0245518240175358e-05, | |
| "loss": 1.7098, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 2.766124725341797, | |
| "learning_rate": 3.018435885392203e-05, | |
| "loss": 1.7072, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 2.8391172885894775, | |
| "learning_rate": 3.0123199467668707e-05, | |
| "loss": 1.7116, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.5152761936187744, | |
| "learning_rate": 3.0062040081415378e-05, | |
| "loss": 1.7073, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.719190835952759, | |
| "learning_rate": 3.000088069516205e-05, | |
| "loss": 1.7017, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.5250284671783447, | |
| "learning_rate": 2.9939721308908726e-05, | |
| "loss": 1.7055, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 3.0202980041503906, | |
| "learning_rate": 2.9878561922655397e-05, | |
| "loss": 1.7041, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 2.7549800872802734, | |
| "learning_rate": 2.9817402536402068e-05, | |
| "loss": 1.6984, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 2.891324281692505, | |
| "learning_rate": 2.9756243150148742e-05, | |
| "loss": 1.7033, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 3.0068359375, | |
| "learning_rate": 2.9695083763895416e-05, | |
| "loss": 1.6969, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 2.5715763568878174, | |
| "learning_rate": 2.9633924377642087e-05, | |
| "loss": 1.698, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.5422136783599854, | |
| "learning_rate": 2.9572764991388758e-05, | |
| "loss": 1.6987, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.5007290840148926, | |
| "learning_rate": 2.9511605605135432e-05, | |
| "loss": 1.6993, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.5966525077819824, | |
| "learning_rate": 2.9450446218882107e-05, | |
| "loss": 1.6923, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 2.5626885890960693, | |
| "learning_rate": 2.9389286832628778e-05, | |
| "loss": 1.6964, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 2.76600980758667, | |
| "learning_rate": 2.9328127446375452e-05, | |
| "loss": 1.6875, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 2.922257661819458, | |
| "learning_rate": 2.9266968060122123e-05, | |
| "loss": 1.6985, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.641627311706543, | |
| "learning_rate": 2.9205808673868794e-05, | |
| "loss": 1.6888, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.6927127838134766, | |
| "learning_rate": 2.914464928761547e-05, | |
| "loss": 1.6924, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.4802746772766113, | |
| "learning_rate": 2.9083489901362142e-05, | |
| "loss": 1.6946, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 2.5318949222564697, | |
| "learning_rate": 2.9022330515108813e-05, | |
| "loss": 1.6894, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 2.7798898220062256, | |
| "learning_rate": 2.896117112885549e-05, | |
| "loss": 1.6861, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.7204222679138184, | |
| "learning_rate": 2.890001174260216e-05, | |
| "loss": 1.6839, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.8077077865600586, | |
| "learning_rate": 2.8838852356348832e-05, | |
| "loss": 1.684, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.565995454788208, | |
| "learning_rate": 2.877769297009551e-05, | |
| "loss": 1.682, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.7671403884887695, | |
| "learning_rate": 2.871653358384218e-05, | |
| "loss": 1.6782, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.6801698207855225, | |
| "learning_rate": 2.8655374197588852e-05, | |
| "loss": 1.6853, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.810450553894043, | |
| "learning_rate": 2.859421481133553e-05, | |
| "loss": 1.6841, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 2.797452211380005, | |
| "learning_rate": 2.85330554250822e-05, | |
| "loss": 1.6753, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 2.5832931995391846, | |
| "learning_rate": 2.847189603882887e-05, | |
| "loss": 1.6905, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 2.8013391494750977, | |
| "learning_rate": 2.841073665257555e-05, | |
| "loss": 1.673, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 2.5176284313201904, | |
| "learning_rate": 2.834957726632222e-05, | |
| "loss": 1.6742, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 2.7248387336730957, | |
| "learning_rate": 2.828841788006889e-05, | |
| "loss": 1.6663, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 3.006441831588745, | |
| "learning_rate": 2.822725849381556e-05, | |
| "loss": 1.6762, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.754427671432495, | |
| "learning_rate": 2.816609910756224e-05, | |
| "loss": 1.6711, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.6871962547302246, | |
| "learning_rate": 2.810493972130891e-05, | |
| "loss": 1.6749, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 2.7660982608795166, | |
| "learning_rate": 2.804378033505558e-05, | |
| "loss": 1.6694, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 2.5820930004119873, | |
| "learning_rate": 2.798262094880226e-05, | |
| "loss": 1.68, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 2.50264048576355, | |
| "learning_rate": 2.792146156254893e-05, | |
| "loss": 1.6745, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 2.759570837020874, | |
| "learning_rate": 2.78603021762956e-05, | |
| "loss": 1.6602, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 2.6648566722869873, | |
| "learning_rate": 2.7799142790042278e-05, | |
| "loss": 1.67, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 2.7140583992004395, | |
| "learning_rate": 2.773798340378895e-05, | |
| "loss": 1.6645, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 2.4393863677978516, | |
| "learning_rate": 2.767682401753562e-05, | |
| "loss": 1.665, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 2.66521954536438, | |
| "learning_rate": 2.7615664631282294e-05, | |
| "loss": 1.6694, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.9926490783691406, | |
| "learning_rate": 2.7554505245028965e-05, | |
| "loss": 1.669, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.611051321029663, | |
| "learning_rate": 2.749334585877564e-05, | |
| "loss": 1.6606, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.6490185260772705, | |
| "learning_rate": 2.7432186472522313e-05, | |
| "loss": 1.6639, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 2.7830920219421387, | |
| "learning_rate": 2.7371027086268984e-05, | |
| "loss": 1.6584, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 2.776111602783203, | |
| "learning_rate": 2.7309867700015655e-05, | |
| "loss": 1.6564, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.5335960388183594, | |
| "learning_rate": 2.7248708313762333e-05, | |
| "loss": 1.6553, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.585458755493164, | |
| "learning_rate": 2.7187548927509004e-05, | |
| "loss": 1.653, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.851865768432617, | |
| "learning_rate": 2.7126389541255674e-05, | |
| "loss": 1.6654, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 2.649545907974243, | |
| "learning_rate": 2.7065230155002352e-05, | |
| "loss": 1.657, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 2.552381753921509, | |
| "learning_rate": 2.7004070768749023e-05, | |
| "loss": 1.6569, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 2.6055853366851807, | |
| "learning_rate": 2.6942911382495694e-05, | |
| "loss": 1.6522, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 2.848911762237549, | |
| "learning_rate": 2.6881751996242365e-05, | |
| "loss": 1.6435, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 2.8290162086486816, | |
| "learning_rate": 2.6820592609989042e-05, | |
| "loss": 1.6457, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 2.682929277420044, | |
| "learning_rate": 2.6759433223735713e-05, | |
| "loss": 1.6521, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.7035279273986816, | |
| "learning_rate": 2.6698273837482384e-05, | |
| "loss": 1.652, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.6156182289123535, | |
| "learning_rate": 2.6637114451229062e-05, | |
| "loss": 1.6421, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.7647957801818848, | |
| "learning_rate": 2.6575955064975733e-05, | |
| "loss": 1.6496, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.5763864517211914, | |
| "learning_rate": 2.6514795678722403e-05, | |
| "loss": 1.6454, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.6585116386413574, | |
| "learning_rate": 2.645363629246908e-05, | |
| "loss": 1.6443, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 2.7471868991851807, | |
| "learning_rate": 2.6392476906215752e-05, | |
| "loss": 1.6494, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 2.7787129878997803, | |
| "learning_rate": 2.6331317519962423e-05, | |
| "loss": 1.6441, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 2.3297078609466553, | |
| "learning_rate": 2.62701581337091e-05, | |
| "loss": 1.6462, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.8310294151306152, | |
| "learning_rate": 2.620899874745577e-05, | |
| "loss": 1.6473, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.6443045139312744, | |
| "learning_rate": 2.6147839361202442e-05, | |
| "loss": 1.6468, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.5064589977264404, | |
| "learning_rate": 2.608667997494912e-05, | |
| "loss": 1.6385, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.6140296459198, | |
| "learning_rate": 2.602552058869579e-05, | |
| "loss": 1.6357, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.461705207824707, | |
| "learning_rate": 2.596436120244246e-05, | |
| "loss": 1.6401, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 2.782813787460327, | |
| "learning_rate": 2.590320181618914e-05, | |
| "loss": 1.6426, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 2.5911970138549805, | |
| "learning_rate": 2.584204242993581e-05, | |
| "loss": 1.6409, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 2.593752384185791, | |
| "learning_rate": 2.578088304368248e-05, | |
| "loss": 1.6345, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 2.9096670150756836, | |
| "learning_rate": 2.5719723657429155e-05, | |
| "loss": 1.6351, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 2.9551987648010254, | |
| "learning_rate": 2.5658564271175826e-05, | |
| "loss": 1.6322, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 2.6173858642578125, | |
| "learning_rate": 2.55974048849225e-05, | |
| "loss": 1.6307, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 2.695869207382202, | |
| "learning_rate": 2.5536245498669175e-05, | |
| "loss": 1.6264, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 2.711869955062866, | |
| "learning_rate": 2.5475086112415846e-05, | |
| "loss": 1.6283, | |
| "step": 200500 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.570518732070923, | |
| "learning_rate": 2.5413926726162516e-05, | |
| "loss": 1.6323, | |
| "step": 201000 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.7032439708709717, | |
| "learning_rate": 2.535276733990919e-05, | |
| "loss": 1.6339, | |
| "step": 201500 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.7625739574432373, | |
| "learning_rate": 2.5291607953655865e-05, | |
| "loss": 1.6279, | |
| "step": 202000 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 2.829380989074707, | |
| "learning_rate": 2.5230448567402536e-05, | |
| "loss": 1.6263, | |
| "step": 202500 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 2.499410629272461, | |
| "learning_rate": 2.5169289181149207e-05, | |
| "loss": 1.6201, | |
| "step": 203000 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 2.6228952407836914, | |
| "learning_rate": 2.5108129794895884e-05, | |
| "loss": 1.6244, | |
| "step": 203500 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 2.609665870666504, | |
| "learning_rate": 2.5046970408642555e-05, | |
| "loss": 1.6319, | |
| "step": 204000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 2.8935351371765137, | |
| "learning_rate": 2.498581102238923e-05, | |
| "loss": 1.6216, | |
| "step": 204500 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 2.7964882850646973, | |
| "learning_rate": 2.49246516361359e-05, | |
| "loss": 1.621, | |
| "step": 205000 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 2.465930938720703, | |
| "learning_rate": 2.4863492249882575e-05, | |
| "loss": 1.6332, | |
| "step": 205500 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 2.9245595932006836, | |
| "learning_rate": 2.480233286362925e-05, | |
| "loss": 1.6239, | |
| "step": 206000 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 2.548551321029663, | |
| "learning_rate": 2.474117347737592e-05, | |
| "loss": 1.6148, | |
| "step": 206500 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 2.6611809730529785, | |
| "learning_rate": 2.4680014091122594e-05, | |
| "loss": 1.6216, | |
| "step": 207000 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 2.6596455574035645, | |
| "learning_rate": 2.4618854704869268e-05, | |
| "loss": 1.6057, | |
| "step": 207500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.645918607711792, | |
| "learning_rate": 2.455769531861594e-05, | |
| "loss": 1.613, | |
| "step": 208000 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.6304965019226074, | |
| "learning_rate": 2.4496535932362613e-05, | |
| "loss": 1.6148, | |
| "step": 208500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.9523110389709473, | |
| "learning_rate": 2.4435376546109284e-05, | |
| "loss": 1.615, | |
| "step": 209000 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.6215062141418457, | |
| "learning_rate": 2.437421715985596e-05, | |
| "loss": 1.623, | |
| "step": 209500 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.7585043907165527, | |
| "learning_rate": 2.4313057773602633e-05, | |
| "loss": 1.6228, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.626432418823242, | |
| "learning_rate": 2.4251898387349304e-05, | |
| "loss": 1.6152, | |
| "step": 210500 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 2.481905221939087, | |
| "learning_rate": 2.4190739001095978e-05, | |
| "loss": 1.6041, | |
| "step": 211000 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 2.5762555599212646, | |
| "learning_rate": 2.4129579614842652e-05, | |
| "loss": 1.6117, | |
| "step": 211500 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.6616873741149902, | |
| "learning_rate": 2.4068420228589323e-05, | |
| "loss": 1.6123, | |
| "step": 212000 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.6225013732910156, | |
| "learning_rate": 2.4007260842335997e-05, | |
| "loss": 1.6096, | |
| "step": 212500 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.6868574619293213, | |
| "learning_rate": 2.394610145608267e-05, | |
| "loss": 1.6103, | |
| "step": 213000 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.8061540126800537, | |
| "learning_rate": 2.3884942069829342e-05, | |
| "loss": 1.6051, | |
| "step": 213500 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.733086585998535, | |
| "learning_rate": 2.3823782683576017e-05, | |
| "loss": 1.6069, | |
| "step": 214000 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.596497058868408, | |
| "learning_rate": 2.3762623297322688e-05, | |
| "loss": 1.602, | |
| "step": 214500 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 2.496598243713379, | |
| "learning_rate": 2.3701463911069362e-05, | |
| "loss": 1.6107, | |
| "step": 215000 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 2.4470176696777344, | |
| "learning_rate": 2.3640304524816033e-05, | |
| "loss": 1.6021, | |
| "step": 215500 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.589895486831665, | |
| "learning_rate": 2.3579145138562707e-05, | |
| "loss": 1.6029, | |
| "step": 216000 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.7477266788482666, | |
| "learning_rate": 2.3517985752309378e-05, | |
| "loss": 1.6011, | |
| "step": 216500 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.7007384300231934, | |
| "learning_rate": 2.3456826366056052e-05, | |
| "loss": 1.6023, | |
| "step": 217000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.6846890449523926, | |
| "learning_rate": 2.3395666979802723e-05, | |
| "loss": 1.592, | |
| "step": 217500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.69858455657959, | |
| "learning_rate": 2.3334507593549397e-05, | |
| "loss": 1.6047, | |
| "step": 218000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.6157824993133545, | |
| "learning_rate": 2.327334820729607e-05, | |
| "loss": 1.5992, | |
| "step": 218500 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.616908073425293, | |
| "learning_rate": 2.3212188821042742e-05, | |
| "loss": 1.596, | |
| "step": 219000 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.7912027835845947, | |
| "learning_rate": 2.3151029434789417e-05, | |
| "loss": 1.604, | |
| "step": 219500 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.6151885986328125, | |
| "learning_rate": 2.308987004853609e-05, | |
| "loss": 1.5953, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 2.8206794261932373, | |
| "learning_rate": 2.3028710662282762e-05, | |
| "loss": 1.602, | |
| "step": 220500 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 2.6507091522216797, | |
| "learning_rate": 2.2967551276029436e-05, | |
| "loss": 1.5903, | |
| "step": 221000 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 2.752617359161377, | |
| "learning_rate": 2.2906391889776107e-05, | |
| "loss": 1.5971, | |
| "step": 221500 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 2.8615899085998535, | |
| "learning_rate": 2.284523250352278e-05, | |
| "loss": 1.596, | |
| "step": 222000 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 3.0563414096832275, | |
| "learning_rate": 2.2784073117269455e-05, | |
| "loss": 1.5993, | |
| "step": 222500 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 2.715120553970337, | |
| "learning_rate": 2.2722913731016126e-05, | |
| "loss": 1.5965, | |
| "step": 223000 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 2.8256382942199707, | |
| "learning_rate": 2.26617543447628e-05, | |
| "loss": 1.5883, | |
| "step": 223500 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 2.8050873279571533, | |
| "learning_rate": 2.2600594958509475e-05, | |
| "loss": 1.5914, | |
| "step": 224000 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 2.773902416229248, | |
| "learning_rate": 2.2539435572256146e-05, | |
| "loss": 1.5884, | |
| "step": 224500 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 2.7655787467956543, | |
| "learning_rate": 2.247827618600282e-05, | |
| "loss": 1.5965, | |
| "step": 225000 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 2.7787845134735107, | |
| "learning_rate": 2.2417116799749494e-05, | |
| "loss": 1.5872, | |
| "step": 225500 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 2.73518705368042, | |
| "learning_rate": 2.2355957413496165e-05, | |
| "loss": 1.588, | |
| "step": 226000 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 2.743821382522583, | |
| "learning_rate": 2.229479802724284e-05, | |
| "loss": 1.5871, | |
| "step": 226500 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 2.444350242614746, | |
| "learning_rate": 2.223363864098951e-05, | |
| "loss": 1.5802, | |
| "step": 227000 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 2.597966194152832, | |
| "learning_rate": 2.2172479254736185e-05, | |
| "loss": 1.5872, | |
| "step": 227500 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 2.7924256324768066, | |
| "learning_rate": 2.211131986848286e-05, | |
| "loss": 1.5877, | |
| "step": 228000 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 2.5780932903289795, | |
| "learning_rate": 2.205016048222953e-05, | |
| "loss": 1.583, | |
| "step": 228500 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 2.9303081035614014, | |
| "learning_rate": 2.1989001095976204e-05, | |
| "loss": 1.5901, | |
| "step": 229000 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 2.601661443710327, | |
| "learning_rate": 2.1927841709722878e-05, | |
| "loss": 1.5837, | |
| "step": 229500 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 2.6851816177368164, | |
| "learning_rate": 2.186668232346955e-05, | |
| "loss": 1.5736, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 2.592660903930664, | |
| "learning_rate": 2.1805522937216223e-05, | |
| "loss": 1.5797, | |
| "step": 230500 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.876065492630005, | |
| "learning_rate": 2.1744363550962894e-05, | |
| "loss": 1.5851, | |
| "step": 231000 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.507368564605713, | |
| "learning_rate": 2.168320416470957e-05, | |
| "loss": 1.5868, | |
| "step": 231500 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.5661709308624268, | |
| "learning_rate": 2.162204477845624e-05, | |
| "loss": 1.5826, | |
| "step": 232000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.698857545852661, | |
| "learning_rate": 2.1560885392202914e-05, | |
| "loss": 1.5739, | |
| "step": 232500 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.5845346450805664, | |
| "learning_rate": 2.1499726005949584e-05, | |
| "loss": 1.5769, | |
| "step": 233000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.7823565006256104, | |
| "learning_rate": 2.143856661969626e-05, | |
| "loss": 1.5781, | |
| "step": 233500 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.675457239151001, | |
| "learning_rate": 2.137740723344293e-05, | |
| "loss": 1.5773, | |
| "step": 234000 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.789083957672119, | |
| "learning_rate": 2.1316247847189604e-05, | |
| "loss": 1.5662, | |
| "step": 234500 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.5719103813171387, | |
| "learning_rate": 2.1255088460936278e-05, | |
| "loss": 1.58, | |
| "step": 235000 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.7980144023895264, | |
| "learning_rate": 2.119392907468295e-05, | |
| "loss": 1.5769, | |
| "step": 235500 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.6691505908966064, | |
| "learning_rate": 2.1132769688429623e-05, | |
| "loss": 1.5703, | |
| "step": 236000 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 2.839600086212158, | |
| "learning_rate": 2.1071610302176297e-05, | |
| "loss": 1.5714, | |
| "step": 236500 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 2.8428940773010254, | |
| "learning_rate": 2.101045091592297e-05, | |
| "loss": 1.5723, | |
| "step": 237000 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 2.5756494998931885, | |
| "learning_rate": 2.0949291529669643e-05, | |
| "loss": 1.5672, | |
| "step": 237500 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.4937775135040283, | |
| "learning_rate": 2.0888132143416314e-05, | |
| "loss": 1.5715, | |
| "step": 238000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.8386645317077637, | |
| "learning_rate": 2.0826972757162988e-05, | |
| "loss": 1.566, | |
| "step": 238500 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.9764533042907715, | |
| "learning_rate": 2.0765813370909662e-05, | |
| "loss": 1.5678, | |
| "step": 239000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.5615928173065186, | |
| "learning_rate": 2.0704653984656333e-05, | |
| "loss": 1.5711, | |
| "step": 239500 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.431802988052368, | |
| "learning_rate": 2.0643494598403007e-05, | |
| "loss": 1.5632, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.691328287124634, | |
| "learning_rate": 2.058233521214968e-05, | |
| "loss": 1.5761, | |
| "step": 240500 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.833160161972046, | |
| "learning_rate": 2.0521175825896352e-05, | |
| "loss": 1.568, | |
| "step": 241000 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.9443514347076416, | |
| "learning_rate": 2.0460016439643027e-05, | |
| "loss": 1.5644, | |
| "step": 241500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.6418864727020264, | |
| "learning_rate": 2.03988570533897e-05, | |
| "loss": 1.5644, | |
| "step": 242000 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.559652090072632, | |
| "learning_rate": 2.033769766713637e-05, | |
| "loss": 1.5588, | |
| "step": 242500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.376955509185791, | |
| "learning_rate": 2.0276538280883046e-05, | |
| "loss": 1.5659, | |
| "step": 243000 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 3.0132250785827637, | |
| "learning_rate": 2.0215378894629717e-05, | |
| "loss": 1.56, | |
| "step": 243500 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.493617534637451, | |
| "learning_rate": 2.015421950837639e-05, | |
| "loss": 1.5602, | |
| "step": 244000 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.6484365463256836, | |
| "learning_rate": 2.0093060122123065e-05, | |
| "loss": 1.5646, | |
| "step": 244500 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 2.5682971477508545, | |
| "learning_rate": 2.0031900735869736e-05, | |
| "loss": 1.5622, | |
| "step": 245000 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 2.783363103866577, | |
| "learning_rate": 1.997074134961641e-05, | |
| "loss": 1.5568, | |
| "step": 245500 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.5576345920562744, | |
| "learning_rate": 1.9909581963363085e-05, | |
| "loss": 1.558, | |
| "step": 246000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.3469157218933105, | |
| "learning_rate": 1.9848422577109756e-05, | |
| "loss": 1.562, | |
| "step": 246500 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.7063257694244385, | |
| "learning_rate": 1.978726319085643e-05, | |
| "loss": 1.5582, | |
| "step": 247000 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 3.00256085395813, | |
| "learning_rate": 1.97261038046031e-05, | |
| "loss": 1.5574, | |
| "step": 247500 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 2.35555100440979, | |
| "learning_rate": 1.966494441834977e-05, | |
| "loss": 1.5567, | |
| "step": 248000 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 2.5847179889678955, | |
| "learning_rate": 1.9603785032096446e-05, | |
| "loss": 1.5647, | |
| "step": 248500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 2.629279613494873, | |
| "learning_rate": 1.9542625645843117e-05, | |
| "loss": 1.552, | |
| "step": 249000 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 2.6433770656585693, | |
| "learning_rate": 1.948146625958979e-05, | |
| "loss": 1.5547, | |
| "step": 249500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 2.6378979682922363, | |
| "learning_rate": 1.9420306873336465e-05, | |
| "loss": 1.5549, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 2.7272751331329346, | |
| "learning_rate": 1.9359147487083136e-05, | |
| "loss": 1.5496, | |
| "step": 250500 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 2.661400556564331, | |
| "learning_rate": 1.929798810082981e-05, | |
| "loss": 1.5597, | |
| "step": 251000 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.76647686958313, | |
| "learning_rate": 1.9236828714576485e-05, | |
| "loss": 1.5559, | |
| "step": 251500 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.4355571269989014, | |
| "learning_rate": 1.9175669328323156e-05, | |
| "loss": 1.5512, | |
| "step": 252000 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.503006935119629, | |
| "learning_rate": 1.911450994206983e-05, | |
| "loss": 1.5459, | |
| "step": 252500 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 2.4940273761749268, | |
| "learning_rate": 1.9053350555816504e-05, | |
| "loss": 1.5563, | |
| "step": 253000 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 3.0512688159942627, | |
| "learning_rate": 1.8992191169563175e-05, | |
| "loss": 1.5542, | |
| "step": 253500 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 2.811276912689209, | |
| "learning_rate": 1.893103178330985e-05, | |
| "loss": 1.5447, | |
| "step": 254000 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 2.565730571746826, | |
| "learning_rate": 1.8869872397056523e-05, | |
| "loss": 1.5446, | |
| "step": 254500 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 2.6504178047180176, | |
| "learning_rate": 1.8808713010803194e-05, | |
| "loss": 1.5506, | |
| "step": 255000 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 3.0442628860473633, | |
| "learning_rate": 1.874755362454987e-05, | |
| "loss": 1.55, | |
| "step": 255500 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.6336920261383057, | |
| "learning_rate": 1.868639423829654e-05, | |
| "loss": 1.5424, | |
| "step": 256000 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.7758066654205322, | |
| "learning_rate": 1.8625234852043214e-05, | |
| "loss": 1.5479, | |
| "step": 256500 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 2.818814992904663, | |
| "learning_rate": 1.8564075465789888e-05, | |
| "loss": 1.5486, | |
| "step": 257000 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 2.6956701278686523, | |
| "learning_rate": 1.850291607953656e-05, | |
| "loss": 1.5497, | |
| "step": 257500 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 2.7896413803100586, | |
| "learning_rate": 1.8441756693283233e-05, | |
| "loss": 1.5437, | |
| "step": 258000 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.917079448699951, | |
| "learning_rate": 1.8380597307029907e-05, | |
| "loss": 1.5432, | |
| "step": 258500 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.761766195297241, | |
| "learning_rate": 1.8319437920776578e-05, | |
| "loss": 1.5413, | |
| "step": 259000 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.7666103839874268, | |
| "learning_rate": 1.8258278534523253e-05, | |
| "loss": 1.5396, | |
| "step": 259500 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 2.691253423690796, | |
| "learning_rate": 1.8197119148269927e-05, | |
| "loss": 1.5372, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 2.911930799484253, | |
| "learning_rate": 1.8135959762016598e-05, | |
| "loss": 1.5485, | |
| "step": 260500 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.5208046436309814, | |
| "learning_rate": 1.8074800375763272e-05, | |
| "loss": 1.5438, | |
| "step": 261000 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.41379976272583, | |
| "learning_rate": 1.8013640989509943e-05, | |
| "loss": 1.5384, | |
| "step": 261500 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.636869430541992, | |
| "learning_rate": 1.7952481603256617e-05, | |
| "loss": 1.5477, | |
| "step": 262000 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.6929407119750977, | |
| "learning_rate": 1.7891322217003288e-05, | |
| "loss": 1.5384, | |
| "step": 262500 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.849163055419922, | |
| "learning_rate": 1.7830162830749962e-05, | |
| "loss": 1.5394, | |
| "step": 263000 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.5682120323181152, | |
| "learning_rate": 1.7769003444496633e-05, | |
| "loss": 1.5353, | |
| "step": 263500 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 2.5825769901275635, | |
| "learning_rate": 1.7707844058243307e-05, | |
| "loss": 1.535, | |
| "step": 264000 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 2.426283597946167, | |
| "learning_rate": 1.7646684671989978e-05, | |
| "loss": 1.5373, | |
| "step": 264500 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 2.706394910812378, | |
| "learning_rate": 1.7585525285736652e-05, | |
| "loss": 1.5358, | |
| "step": 265000 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.6370396614074707, | |
| "learning_rate": 1.7524365899483327e-05, | |
| "loss": 1.5391, | |
| "step": 265500 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.553217649459839, | |
| "learning_rate": 1.7463206513229998e-05, | |
| "loss": 1.5299, | |
| "step": 266000 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.884148120880127, | |
| "learning_rate": 1.7402047126976672e-05, | |
| "loss": 1.5432, | |
| "step": 266500 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.7331855297088623, | |
| "learning_rate": 1.7340887740723343e-05, | |
| "loss": 1.5329, | |
| "step": 267000 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.841865062713623, | |
| "learning_rate": 1.7279728354470017e-05, | |
| "loss": 1.5319, | |
| "step": 267500 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.463677406311035, | |
| "learning_rate": 1.721856896821669e-05, | |
| "loss": 1.5274, | |
| "step": 268000 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.7880847454071045, | |
| "learning_rate": 1.7157409581963362e-05, | |
| "loss": 1.5244, | |
| "step": 268500 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.5323753356933594, | |
| "learning_rate": 1.7096250195710036e-05, | |
| "loss": 1.5343, | |
| "step": 269000 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.93086838722229, | |
| "learning_rate": 1.703509080945671e-05, | |
| "loss": 1.5253, | |
| "step": 269500 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.8919107913970947, | |
| "learning_rate": 1.697393142320338e-05, | |
| "loss": 1.5334, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 2.8613593578338623, | |
| "learning_rate": 1.6912772036950056e-05, | |
| "loss": 1.5342, | |
| "step": 270500 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 2.5317909717559814, | |
| "learning_rate": 1.685161265069673e-05, | |
| "loss": 1.5278, | |
| "step": 271000 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 2.832613706588745, | |
| "learning_rate": 1.67904532644434e-05, | |
| "loss": 1.5318, | |
| "step": 271500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.5811901092529297, | |
| "learning_rate": 1.6729293878190075e-05, | |
| "loss": 1.5338, | |
| "step": 272000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.640382766723633, | |
| "learning_rate": 1.6668134491936746e-05, | |
| "loss": 1.5278, | |
| "step": 272500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.777024745941162, | |
| "learning_rate": 1.660697510568342e-05, | |
| "loss": 1.5192, | |
| "step": 273000 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.546867609024048, | |
| "learning_rate": 1.6545815719430095e-05, | |
| "loss": 1.5203, | |
| "step": 273500 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.459458589553833, | |
| "learning_rate": 1.6484656333176765e-05, | |
| "loss": 1.5232, | |
| "step": 274000 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.6832683086395264, | |
| "learning_rate": 1.642349694692344e-05, | |
| "loss": 1.5133, | |
| "step": 274500 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 2.6847174167633057, | |
| "learning_rate": 1.6362337560670114e-05, | |
| "loss": 1.5202, | |
| "step": 275000 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 2.819836139678955, | |
| "learning_rate": 1.6301178174416785e-05, | |
| "loss": 1.5225, | |
| "step": 275500 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 2.4789822101593018, | |
| "learning_rate": 1.624001878816346e-05, | |
| "loss": 1.5185, | |
| "step": 276000 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 2.8469271659851074, | |
| "learning_rate": 1.6178859401910133e-05, | |
| "loss": 1.5281, | |
| "step": 276500 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 2.4741554260253906, | |
| "learning_rate": 1.6117700015656804e-05, | |
| "loss": 1.5187, | |
| "step": 277000 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.7348639965057373, | |
| "learning_rate": 1.605654062940348e-05, | |
| "loss": 1.519, | |
| "step": 277500 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.590632677078247, | |
| "learning_rate": 1.599538124315015e-05, | |
| "loss": 1.5242, | |
| "step": 278000 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.926156997680664, | |
| "learning_rate": 1.5934221856896824e-05, | |
| "loss": 1.5182, | |
| "step": 278500 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 2.3463704586029053, | |
| "learning_rate": 1.5873062470643494e-05, | |
| "loss": 1.5186, | |
| "step": 279000 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 2.8778836727142334, | |
| "learning_rate": 1.581190308439017e-05, | |
| "loss": 1.5133, | |
| "step": 279500 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 2.7937684059143066, | |
| "learning_rate": 1.575074369813684e-05, | |
| "loss": 1.5204, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.6967952251434326, | |
| "learning_rate": 1.5689584311883514e-05, | |
| "loss": 1.5238, | |
| "step": 280500 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.7939419746398926, | |
| "learning_rate": 1.5628424925630185e-05, | |
| "loss": 1.5162, | |
| "step": 281000 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 2.4184165000915527, | |
| "learning_rate": 1.556726553937686e-05, | |
| "loss": 1.5083, | |
| "step": 281500 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 2.5736517906188965, | |
| "learning_rate": 1.5506106153123533e-05, | |
| "loss": 1.5225, | |
| "step": 282000 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 2.775562286376953, | |
| "learning_rate": 1.5444946766870204e-05, | |
| "loss": 1.5107, | |
| "step": 282500 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.65218186378479, | |
| "learning_rate": 1.538378738061688e-05, | |
| "loss": 1.5101, | |
| "step": 283000 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.9510700702667236, | |
| "learning_rate": 1.532262799436355e-05, | |
| "loss": 1.5108, | |
| "step": 283500 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.663459300994873, | |
| "learning_rate": 1.5261468608110224e-05, | |
| "loss": 1.5039, | |
| "step": 284000 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 2.621185541152954, | |
| "learning_rate": 1.5200309221856898e-05, | |
| "loss": 1.5044, | |
| "step": 284500 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 2.7597007751464844, | |
| "learning_rate": 1.5139149835603569e-05, | |
| "loss": 1.5123, | |
| "step": 285000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.9049315452575684, | |
| "learning_rate": 1.5077990449350243e-05, | |
| "loss": 1.5129, | |
| "step": 285500 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.7064170837402344, | |
| "learning_rate": 1.5016831063096917e-05, | |
| "loss": 1.5075, | |
| "step": 286000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.8447062969207764, | |
| "learning_rate": 1.4955671676843588e-05, | |
| "loss": 1.5107, | |
| "step": 286500 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.63680100440979, | |
| "learning_rate": 1.4894512290590262e-05, | |
| "loss": 1.5095, | |
| "step": 287000 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.9696691036224365, | |
| "learning_rate": 1.4833352904336937e-05, | |
| "loss": 1.5069, | |
| "step": 287500 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 2.7010321617126465, | |
| "learning_rate": 1.4772193518083607e-05, | |
| "loss": 1.5094, | |
| "step": 288000 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.5756781101226807, | |
| "learning_rate": 1.4711034131830282e-05, | |
| "loss": 1.5054, | |
| "step": 288500 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 3.0450093746185303, | |
| "learning_rate": 1.4649874745576956e-05, | |
| "loss": 1.5115, | |
| "step": 289000 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.551755905151367, | |
| "learning_rate": 1.4588715359323627e-05, | |
| "loss": 1.5129, | |
| "step": 289500 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 2.865170478820801, | |
| "learning_rate": 1.4527555973070301e-05, | |
| "loss": 1.4972, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 2.648294687271118, | |
| "learning_rate": 1.4466396586816972e-05, | |
| "loss": 1.5093, | |
| "step": 290500 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.600937604904175, | |
| "learning_rate": 1.4405237200563646e-05, | |
| "loss": 1.5043, | |
| "step": 291000 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.9919681549072266, | |
| "learning_rate": 1.4344077814310319e-05, | |
| "loss": 1.4997, | |
| "step": 291500 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.8291046619415283, | |
| "learning_rate": 1.4282918428056991e-05, | |
| "loss": 1.5196, | |
| "step": 292000 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.66756272315979, | |
| "learning_rate": 1.4221759041803664e-05, | |
| "loss": 1.5007, | |
| "step": 292500 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.809164524078369, | |
| "learning_rate": 1.4160599655550338e-05, | |
| "loss": 1.5033, | |
| "step": 293000 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.6483566761016846, | |
| "learning_rate": 1.4099440269297009e-05, | |
| "loss": 1.5065, | |
| "step": 293500 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 2.4449145793914795, | |
| "learning_rate": 1.4038280883043683e-05, | |
| "loss": 1.5032, | |
| "step": 294000 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 2.6919500827789307, | |
| "learning_rate": 1.3977121496790358e-05, | |
| "loss": 1.5053, | |
| "step": 294500 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 2.8122289180755615, | |
| "learning_rate": 1.3915962110537028e-05, | |
| "loss": 1.5079, | |
| "step": 295000 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 2.7903494834899902, | |
| "learning_rate": 1.3854802724283703e-05, | |
| "loss": 1.4965, | |
| "step": 295500 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 2.525930404663086, | |
| "learning_rate": 1.3793643338030374e-05, | |
| "loss": 1.5043, | |
| "step": 296000 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 2.493638277053833, | |
| "learning_rate": 1.3732483951777048e-05, | |
| "loss": 1.4974, | |
| "step": 296500 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 2.4521799087524414, | |
| "learning_rate": 1.3671324565523722e-05, | |
| "loss": 1.4941, | |
| "step": 297000 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 2.8091464042663574, | |
| "learning_rate": 1.3610165179270393e-05, | |
| "loss": 1.5018, | |
| "step": 297500 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 2.5954153537750244, | |
| "learning_rate": 1.3549005793017067e-05, | |
| "loss": 1.4999, | |
| "step": 298000 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 2.7937843799591064, | |
| "learning_rate": 1.348784640676374e-05, | |
| "loss": 1.4971, | |
| "step": 298500 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 2.731354236602783, | |
| "learning_rate": 1.3426687020510412e-05, | |
| "loss": 1.5019, | |
| "step": 299000 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 2.893202066421509, | |
| "learning_rate": 1.3365527634257085e-05, | |
| "loss": 1.5084, | |
| "step": 299500 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 2.5517237186431885, | |
| "learning_rate": 1.330436824800376e-05, | |
| "loss": 1.4979, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 2.5626368522644043, | |
| "learning_rate": 1.324320886175043e-05, | |
| "loss": 1.5023, | |
| "step": 300500 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 2.9477968215942383, | |
| "learning_rate": 1.3182049475497104e-05, | |
| "loss": 1.4948, | |
| "step": 301000 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 2.5781774520874023, | |
| "learning_rate": 1.3120890089243775e-05, | |
| "loss": 1.4968, | |
| "step": 301500 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 2.8032429218292236, | |
| "learning_rate": 1.305973070299045e-05, | |
| "loss": 1.5017, | |
| "step": 302000 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 2.6801342964172363, | |
| "learning_rate": 1.2998571316737124e-05, | |
| "loss": 1.4932, | |
| "step": 302500 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 2.6974122524261475, | |
| "learning_rate": 1.2937411930483795e-05, | |
| "loss": 1.4965, | |
| "step": 303000 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 2.4389328956604004, | |
| "learning_rate": 1.2876252544230469e-05, | |
| "loss": 1.4933, | |
| "step": 303500 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 2.66622257232666, | |
| "learning_rate": 1.2815093157977143e-05, | |
| "loss": 1.4941, | |
| "step": 304000 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 2.6904194355010986, | |
| "learning_rate": 1.2753933771723814e-05, | |
| "loss": 1.4946, | |
| "step": 304500 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.673464298248291, | |
| "learning_rate": 1.2692774385470488e-05, | |
| "loss": 1.5015, | |
| "step": 305000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.6104812622070312, | |
| "learning_rate": 1.2631614999217163e-05, | |
| "loss": 1.4886, | |
| "step": 305500 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.8773016929626465, | |
| "learning_rate": 1.2570455612963833e-05, | |
| "loss": 1.4916, | |
| "step": 306000 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.437274217605591, | |
| "learning_rate": 1.2509296226710508e-05, | |
| "loss": 1.4909, | |
| "step": 306500 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 3.1659114360809326, | |
| "learning_rate": 1.244813684045718e-05, | |
| "loss": 1.488, | |
| "step": 307000 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 2.607539653778076, | |
| "learning_rate": 1.2386977454203851e-05, | |
| "loss": 1.4916, | |
| "step": 307500 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 2.589136838912964, | |
| "learning_rate": 1.2325818067950524e-05, | |
| "loss": 1.4893, | |
| "step": 308000 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 3.010464668273926, | |
| "learning_rate": 1.2264658681697198e-05, | |
| "loss": 1.4867, | |
| "step": 308500 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.713313579559326, | |
| "learning_rate": 1.220349929544387e-05, | |
| "loss": 1.4788, | |
| "step": 309000 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.753493070602417, | |
| "learning_rate": 1.2142339909190543e-05, | |
| "loss": 1.4839, | |
| "step": 309500 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 2.8799803256988525, | |
| "learning_rate": 1.2081180522937217e-05, | |
| "loss": 1.4914, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 2.8280301094055176, | |
| "learning_rate": 1.202002113668389e-05, | |
| "loss": 1.4809, | |
| "step": 310500 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 2.9053263664245605, | |
| "learning_rate": 1.1958861750430562e-05, | |
| "loss": 1.4788, | |
| "step": 311000 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.879546880722046, | |
| "learning_rate": 1.1897702364177235e-05, | |
| "loss": 1.4825, | |
| "step": 311500 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.473529577255249, | |
| "learning_rate": 1.183654297792391e-05, | |
| "loss": 1.4883, | |
| "step": 312000 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.743178367614746, | |
| "learning_rate": 1.1775383591670582e-05, | |
| "loss": 1.4804, | |
| "step": 312500 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 2.6918370723724365, | |
| "learning_rate": 1.1714224205417254e-05, | |
| "loss": 1.4891, | |
| "step": 313000 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 2.9803996086120605, | |
| "learning_rate": 1.1653064819163927e-05, | |
| "loss": 1.486, | |
| "step": 313500 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 2.544872999191284, | |
| "learning_rate": 1.1591905432910601e-05, | |
| "loss": 1.4879, | |
| "step": 314000 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 2.8242433071136475, | |
| "learning_rate": 1.1530746046657274e-05, | |
| "loss": 1.4848, | |
| "step": 314500 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 2.7912473678588867, | |
| "learning_rate": 1.1469586660403946e-05, | |
| "loss": 1.4847, | |
| "step": 315000 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 3.1455202102661133, | |
| "learning_rate": 1.1408427274150619e-05, | |
| "loss": 1.4899, | |
| "step": 315500 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 2.8553197383880615, | |
| "learning_rate": 1.1347267887897291e-05, | |
| "loss": 1.4799, | |
| "step": 316000 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 2.7605557441711426, | |
| "learning_rate": 1.1286108501643964e-05, | |
| "loss": 1.4808, | |
| "step": 316500 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 2.7065718173980713, | |
| "learning_rate": 1.1224949115390637e-05, | |
| "loss": 1.4846, | |
| "step": 317000 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 2.719977378845215, | |
| "learning_rate": 1.1163789729137311e-05, | |
| "loss": 1.4831, | |
| "step": 317500 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 2.569617509841919, | |
| "learning_rate": 1.1102630342883983e-05, | |
| "loss": 1.4798, | |
| "step": 318000 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 2.4670286178588867, | |
| "learning_rate": 1.1041470956630656e-05, | |
| "loss": 1.4765, | |
| "step": 318500 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 2.797725200653076, | |
| "learning_rate": 1.098031157037733e-05, | |
| "loss": 1.4817, | |
| "step": 319000 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 2.8332033157348633, | |
| "learning_rate": 1.0919152184124003e-05, | |
| "loss": 1.4835, | |
| "step": 319500 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 2.494609832763672, | |
| "learning_rate": 1.0857992797870675e-05, | |
| "loss": 1.4746, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 2.708406925201416, | |
| "learning_rate": 1.0796833411617348e-05, | |
| "loss": 1.4764, | |
| "step": 320500 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 2.59369158744812, | |
| "learning_rate": 1.0735674025364022e-05, | |
| "loss": 1.4808, | |
| "step": 321000 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 2.803255558013916, | |
| "learning_rate": 1.0674514639110695e-05, | |
| "loss": 1.48, | |
| "step": 321500 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 2.5560402870178223, | |
| "learning_rate": 1.0613355252857367e-05, | |
| "loss": 1.4843, | |
| "step": 322000 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 2.911194324493408, | |
| "learning_rate": 1.055219586660404e-05, | |
| "loss": 1.4801, | |
| "step": 322500 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 2.8196239471435547, | |
| "learning_rate": 1.0491036480350713e-05, | |
| "loss": 1.4762, | |
| "step": 323000 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 2.709317445755005, | |
| "learning_rate": 1.0429877094097385e-05, | |
| "loss": 1.476, | |
| "step": 323500 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 2.627985715866089, | |
| "learning_rate": 1.0368717707844058e-05, | |
| "loss": 1.4781, | |
| "step": 324000 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 2.8382914066314697, | |
| "learning_rate": 1.0307558321590732e-05, | |
| "loss": 1.4728, | |
| "step": 324500 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 2.8126072883605957, | |
| "learning_rate": 1.0246398935337404e-05, | |
| "loss": 1.4778, | |
| "step": 325000 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 2.6362712383270264, | |
| "learning_rate": 1.0185239549084077e-05, | |
| "loss": 1.476, | |
| "step": 325500 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 2.761763572692871, | |
| "learning_rate": 1.012408016283075e-05, | |
| "loss": 1.4704, | |
| "step": 326000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 2.6072146892547607, | |
| "learning_rate": 1.0062920776577424e-05, | |
| "loss": 1.4703, | |
| "step": 326500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 3.07877254486084, | |
| "learning_rate": 1.0001761390324096e-05, | |
| "loss": 1.4863, | |
| "step": 327000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 2.986053705215454, | |
| "learning_rate": 9.940602004070769e-06, | |
| "loss": 1.4837, | |
| "step": 327500 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.7128584384918213, | |
| "learning_rate": 9.879442617817442e-06, | |
| "loss": 1.4648, | |
| "step": 328000 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.6193928718566895, | |
| "learning_rate": 9.818283231564116e-06, | |
| "loss": 1.4628, | |
| "step": 328500 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.7647864818573, | |
| "learning_rate": 9.757123845310788e-06, | |
| "loss": 1.4799, | |
| "step": 329000 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 2.70143985748291, | |
| "learning_rate": 9.695964459057461e-06, | |
| "loss": 1.4629, | |
| "step": 329500 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 2.705559730529785, | |
| "learning_rate": 9.634805072804135e-06, | |
| "loss": 1.4662, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 2.678466796875, | |
| "learning_rate": 9.573645686550808e-06, | |
| "loss": 1.4693, | |
| "step": 330500 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 2.5051000118255615, | |
| "learning_rate": 9.51248630029748e-06, | |
| "loss": 1.4705, | |
| "step": 331000 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 2.8841006755828857, | |
| "learning_rate": 9.451326914044153e-06, | |
| "loss": 1.4651, | |
| "step": 331500 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 2.7045044898986816, | |
| "learning_rate": 9.390167527790825e-06, | |
| "loss": 1.4665, | |
| "step": 332000 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 3.101134777069092, | |
| "learning_rate": 9.329008141537498e-06, | |
| "loss": 1.4746, | |
| "step": 332500 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 2.5567667484283447, | |
| "learning_rate": 9.26784875528417e-06, | |
| "loss": 1.4667, | |
| "step": 333000 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 2.5476863384246826, | |
| "learning_rate": 9.206689369030843e-06, | |
| "loss": 1.4593, | |
| "step": 333500 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 2.6363370418548584, | |
| "learning_rate": 9.145529982777517e-06, | |
| "loss": 1.4632, | |
| "step": 334000 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 2.9167027473449707, | |
| "learning_rate": 9.08437059652419e-06, | |
| "loss": 1.4595, | |
| "step": 334500 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 2.77966046333313, | |
| "learning_rate": 9.023211210270863e-06, | |
| "loss": 1.4604, | |
| "step": 335000 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 3.0701239109039307, | |
| "learning_rate": 8.962051824017537e-06, | |
| "loss": 1.4613, | |
| "step": 335500 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 2.6307058334350586, | |
| "learning_rate": 8.90089243776421e-06, | |
| "loss": 1.4627, | |
| "step": 336000 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 2.46291184425354, | |
| "learning_rate": 8.839733051510882e-06, | |
| "loss": 1.4667, | |
| "step": 336500 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 2.7968499660491943, | |
| "learning_rate": 8.778573665257555e-06, | |
| "loss": 1.4644, | |
| "step": 337000 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.7745018005371094, | |
| "learning_rate": 8.717414279004229e-06, | |
| "loss": 1.458, | |
| "step": 337500 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.951845645904541, | |
| "learning_rate": 8.656254892750901e-06, | |
| "loss": 1.4723, | |
| "step": 338000 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.6524295806884766, | |
| "learning_rate": 8.595095506497574e-06, | |
| "loss": 1.4607, | |
| "step": 338500 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 2.800586223602295, | |
| "learning_rate": 8.533936120244246e-06, | |
| "loss": 1.4606, | |
| "step": 339000 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 2.947486639022827, | |
| "learning_rate": 8.472776733990919e-06, | |
| "loss": 1.4608, | |
| "step": 339500 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.936547040939331, | |
| "learning_rate": 8.411617347737592e-06, | |
| "loss": 1.4582, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.820474147796631, | |
| "learning_rate": 8.350457961484264e-06, | |
| "loss": 1.4648, | |
| "step": 340500 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.754638910293579, | |
| "learning_rate": 8.289298575230938e-06, | |
| "loss": 1.4613, | |
| "step": 341000 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 2.46992564201355, | |
| "learning_rate": 8.228139188977611e-06, | |
| "loss": 1.4685, | |
| "step": 341500 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 2.8246257305145264, | |
| "learning_rate": 8.166979802724284e-06, | |
| "loss": 1.4632, | |
| "step": 342000 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 2.966745138168335, | |
| "learning_rate": 8.105820416470956e-06, | |
| "loss": 1.4615, | |
| "step": 342500 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.9904332160949707, | |
| "learning_rate": 8.04466103021763e-06, | |
| "loss": 1.4646, | |
| "step": 343000 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.7209649085998535, | |
| "learning_rate": 7.983501643964303e-06, | |
| "loss": 1.4602, | |
| "step": 343500 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.7970163822174072, | |
| "learning_rate": 7.922342257710976e-06, | |
| "loss": 1.4557, | |
| "step": 344000 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 2.646637201309204, | |
| "learning_rate": 7.86118287145765e-06, | |
| "loss": 1.4554, | |
| "step": 344500 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 2.8239455223083496, | |
| "learning_rate": 7.800023485204322e-06, | |
| "loss": 1.4527, | |
| "step": 345000 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.9307682514190674, | |
| "learning_rate": 7.738864098950995e-06, | |
| "loss": 1.461, | |
| "step": 345500 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.840571165084839, | |
| "learning_rate": 7.677704712697668e-06, | |
| "loss": 1.4592, | |
| "step": 346000 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.7356936931610107, | |
| "learning_rate": 7.616545326444341e-06, | |
| "loss": 1.46, | |
| "step": 346500 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 2.902578353881836, | |
| "learning_rate": 7.5553859401910135e-06, | |
| "loss": 1.4627, | |
| "step": 347000 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 3.005869150161743, | |
| "learning_rate": 7.494226553937686e-06, | |
| "loss": 1.4592, | |
| "step": 347500 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 2.7847769260406494, | |
| "learning_rate": 7.433067167684359e-06, | |
| "loss": 1.4652, | |
| "step": 348000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 2.821125030517578, | |
| "learning_rate": 7.371907781431033e-06, | |
| "loss": 1.4585, | |
| "step": 348500 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 3.5766139030456543, | |
| "learning_rate": 7.3107483951777054e-06, | |
| "loss": 1.4511, | |
| "step": 349000 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 2.9950060844421387, | |
| "learning_rate": 7.249589008924377e-06, | |
| "loss": 1.45, | |
| "step": 349500 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 2.922325611114502, | |
| "learning_rate": 7.1884296226710514e-06, | |
| "loss": 1.4549, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 3.0797903537750244, | |
| "learning_rate": 7.127270236417724e-06, | |
| "loss": 1.4643, | |
| "step": 350500 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 2.6984283924102783, | |
| "learning_rate": 7.0661108501643966e-06, | |
| "loss": 1.4528, | |
| "step": 351000 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 2.804563283920288, | |
| "learning_rate": 7.004951463911069e-06, | |
| "loss": 1.4565, | |
| "step": 351500 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 2.6416571140289307, | |
| "learning_rate": 6.943792077657743e-06, | |
| "loss": 1.4523, | |
| "step": 352000 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.5998694896698, | |
| "learning_rate": 6.882632691404416e-06, | |
| "loss": 1.4546, | |
| "step": 352500 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.714494228363037, | |
| "learning_rate": 6.8214733051510885e-06, | |
| "loss": 1.4581, | |
| "step": 353000 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.8569843769073486, | |
| "learning_rate": 6.760313918897761e-06, | |
| "loss": 1.4562, | |
| "step": 353500 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 2.5230369567871094, | |
| "learning_rate": 6.6991545326444345e-06, | |
| "loss": 1.4469, | |
| "step": 354000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 2.654069423675537, | |
| "learning_rate": 6.637995146391107e-06, | |
| "loss": 1.449, | |
| "step": 354500 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 3.2150166034698486, | |
| "learning_rate": 6.57683576013778e-06, | |
| "loss": 1.4506, | |
| "step": 355000 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 2.6119723320007324, | |
| "learning_rate": 6.515676373884454e-06, | |
| "loss": 1.4507, | |
| "step": 355500 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 2.6928510665893555, | |
| "learning_rate": 6.4545169876311265e-06, | |
| "loss": 1.4565, | |
| "step": 356000 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.555009603500366, | |
| "learning_rate": 6.393357601377799e-06, | |
| "loss": 1.4514, | |
| "step": 356500 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.785787343978882, | |
| "learning_rate": 6.332198215124472e-06, | |
| "loss": 1.4525, | |
| "step": 357000 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.8000409603118896, | |
| "learning_rate": 6.271038828871145e-06, | |
| "loss": 1.4498, | |
| "step": 357500 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 3.049229860305786, | |
| "learning_rate": 6.209879442617818e-06, | |
| "loss": 1.45, | |
| "step": 358000 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 2.8990321159362793, | |
| "learning_rate": 6.14872005636449e-06, | |
| "loss": 1.454, | |
| "step": 358500 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 2.6227800846099854, | |
| "learning_rate": 6.0875606701111636e-06, | |
| "loss": 1.4535, | |
| "step": 359000 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 2.856273651123047, | |
| "learning_rate": 6.026401283857837e-06, | |
| "loss": 1.453, | |
| "step": 359500 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 2.6688528060913086, | |
| "learning_rate": 5.9652418976045095e-06, | |
| "loss": 1.4396, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.765559196472168, | |
| "learning_rate": 5.904082511351183e-06, | |
| "loss": 1.447, | |
| "step": 360500 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.546724319458008, | |
| "learning_rate": 5.8429231250978555e-06, | |
| "loss": 1.4445, | |
| "step": 361000 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.7436957359313965, | |
| "learning_rate": 5.781763738844528e-06, | |
| "loss": 1.4512, | |
| "step": 361500 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 2.612231731414795, | |
| "learning_rate": 5.720604352591201e-06, | |
| "loss": 1.4496, | |
| "step": 362000 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 2.537179946899414, | |
| "learning_rate": 5.659444966337874e-06, | |
| "loss": 1.4417, | |
| "step": 362500 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 2.82316255569458, | |
| "learning_rate": 5.598285580084547e-06, | |
| "loss": 1.447, | |
| "step": 363000 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 2.607912302017212, | |
| "learning_rate": 5.53712619383122e-06, | |
| "loss": 1.4465, | |
| "step": 363500 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 2.6389966011047363, | |
| "learning_rate": 5.4759668075778935e-06, | |
| "loss": 1.4511, | |
| "step": 364000 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 2.7712039947509766, | |
| "learning_rate": 5.414807421324566e-06, | |
| "loss": 1.4429, | |
| "step": 364500 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 2.6119940280914307, | |
| "learning_rate": 5.353648035071239e-06, | |
| "loss": 1.4473, | |
| "step": 365000 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 2.855820894241333, | |
| "learning_rate": 5.292488648817911e-06, | |
| "loss": 1.4409, | |
| "step": 365500 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 3.3360650539398193, | |
| "learning_rate": 5.2313292625645846e-06, | |
| "loss": 1.4448, | |
| "step": 366000 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 2.7165558338165283, | |
| "learning_rate": 5.170169876311257e-06, | |
| "loss": 1.4459, | |
| "step": 366500 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 2.638815402984619, | |
| "learning_rate": 5.1090104900579306e-06, | |
| "loss": 1.4384, | |
| "step": 367000 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 2.605776071548462, | |
| "learning_rate": 5.047851103804603e-06, | |
| "loss": 1.4515, | |
| "step": 367500 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 2.7470364570617676, | |
| "learning_rate": 4.9866917175512765e-06, | |
| "loss": 1.4433, | |
| "step": 368000 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 2.7127552032470703, | |
| "learning_rate": 4.925532331297949e-06, | |
| "loss": 1.4475, | |
| "step": 368500 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 2.991445541381836, | |
| "learning_rate": 4.8643729450446225e-06, | |
| "loss": 1.4392, | |
| "step": 369000 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 2.5964415073394775, | |
| "learning_rate": 4.803213558791295e-06, | |
| "loss": 1.4435, | |
| "step": 369500 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 2.73907732963562, | |
| "learning_rate": 4.742054172537968e-06, | |
| "loss": 1.4487, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 2.7901346683502197, | |
| "learning_rate": 4.680894786284641e-06, | |
| "loss": 1.439, | |
| "step": 370500 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 2.4311490058898926, | |
| "learning_rate": 4.619735400031314e-06, | |
| "loss": 1.4449, | |
| "step": 371000 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 2.748305320739746, | |
| "learning_rate": 4.558576013777987e-06, | |
| "loss": 1.4558, | |
| "step": 371500 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 2.8858590126037598, | |
| "learning_rate": 4.49741662752466e-06, | |
| "loss": 1.4422, | |
| "step": 372000 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 2.893162727355957, | |
| "learning_rate": 4.436257241271333e-06, | |
| "loss": 1.4465, | |
| "step": 372500 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 2.519300937652588, | |
| "learning_rate": 4.375097855018006e-06, | |
| "loss": 1.4354, | |
| "step": 373000 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 2.798462152481079, | |
| "learning_rate": 4.313938468764678e-06, | |
| "loss": 1.4456, | |
| "step": 373500 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 2.894874095916748, | |
| "learning_rate": 4.2527790825113516e-06, | |
| "loss": 1.4358, | |
| "step": 374000 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.979597806930542, | |
| "learning_rate": 4.191619696258024e-06, | |
| "loss": 1.444, | |
| "step": 374500 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.878843069076538, | |
| "learning_rate": 4.1304603100046975e-06, | |
| "loss": 1.4423, | |
| "step": 375000 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 2.841049909591675, | |
| "learning_rate": 4.06930092375137e-06, | |
| "loss": 1.4403, | |
| "step": 375500 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 2.413423538208008, | |
| "learning_rate": 4.0081415374980435e-06, | |
| "loss": 1.4491, | |
| "step": 376000 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 2.7661304473876953, | |
| "learning_rate": 3.946982151244716e-06, | |
| "loss": 1.4399, | |
| "step": 376500 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 2.5709524154663086, | |
| "learning_rate": 3.885822764991389e-06, | |
| "loss": 1.4391, | |
| "step": 377000 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 2.8157260417938232, | |
| "learning_rate": 3.824663378738061e-06, | |
| "loss": 1.4368, | |
| "step": 377500 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 2.7713100910186768, | |
| "learning_rate": 3.7635039924847346e-06, | |
| "loss": 1.4314, | |
| "step": 378000 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 2.9643430709838867, | |
| "learning_rate": 3.702344606231407e-06, | |
| "loss": 1.4329, | |
| "step": 378500 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 2.798428535461426, | |
| "learning_rate": 3.6411852199780806e-06, | |
| "loss": 1.4358, | |
| "step": 379000 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.7989561557769775, | |
| "learning_rate": 3.580025833724754e-06, | |
| "loss": 1.4383, | |
| "step": 379500 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.718421459197998, | |
| "learning_rate": 3.5188664474714266e-06, | |
| "loss": 1.4335, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.8248419761657715, | |
| "learning_rate": 3.4577070612180996e-06, | |
| "loss": 1.439, | |
| "step": 380500 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 2.4477925300598145, | |
| "learning_rate": 3.396547674964772e-06, | |
| "loss": 1.4356, | |
| "step": 381000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 2.6281678676605225, | |
| "learning_rate": 3.3353882887114456e-06, | |
| "loss": 1.4441, | |
| "step": 381500 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 2.6818716526031494, | |
| "learning_rate": 3.274228902458118e-06, | |
| "loss": 1.434, | |
| "step": 382000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 3.0857114791870117, | |
| "learning_rate": 3.213069516204791e-06, | |
| "loss": 1.4401, | |
| "step": 382500 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.920851230621338, | |
| "learning_rate": 3.1519101299514637e-06, | |
| "loss": 1.4394, | |
| "step": 383000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 3.0690174102783203, | |
| "learning_rate": 3.090750743698137e-06, | |
| "loss": 1.4371, | |
| "step": 383500 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 2.846827745437622, | |
| "learning_rate": 3.02959135744481e-06, | |
| "loss": 1.4301, | |
| "step": 384000 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 2.927429676055908, | |
| "learning_rate": 2.9684319711914827e-06, | |
| "loss": 1.426, | |
| "step": 384500 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 3.124462842941284, | |
| "learning_rate": 2.9072725849381557e-06, | |
| "loss": 1.4312, | |
| "step": 385000 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 2.4795315265655518, | |
| "learning_rate": 2.8461131986848286e-06, | |
| "loss": 1.4406, | |
| "step": 385500 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 2.942664384841919, | |
| "learning_rate": 2.7849538124315016e-06, | |
| "loss": 1.4338, | |
| "step": 386000 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.6400539875030518, | |
| "learning_rate": 2.7237944261781746e-06, | |
| "loss": 1.4335, | |
| "step": 386500 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.5888681411743164, | |
| "learning_rate": 2.6626350399248476e-06, | |
| "loss": 1.4293, | |
| "step": 387000 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.6692755222320557, | |
| "learning_rate": 2.6014756536715206e-06, | |
| "loss": 1.4294, | |
| "step": 387500 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 2.7141776084899902, | |
| "learning_rate": 2.540316267418193e-06, | |
| "loss": 1.4418, | |
| "step": 388000 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 2.638432264328003, | |
| "learning_rate": 2.479156881164866e-06, | |
| "loss": 1.43, | |
| "step": 388500 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 2.7874903678894043, | |
| "learning_rate": 2.417997494911539e-06, | |
| "loss": 1.4337, | |
| "step": 389000 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 2.8501386642456055, | |
| "learning_rate": 2.356838108658212e-06, | |
| "loss": 1.4363, | |
| "step": 389500 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 3.1942977905273438, | |
| "learning_rate": 2.295678722404885e-06, | |
| "loss": 1.4362, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 2.591784715652466, | |
| "learning_rate": 2.2345193361515577e-06, | |
| "loss": 1.44, | |
| "step": 390500 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 3.0462796688079834, | |
| "learning_rate": 2.1733599498982307e-06, | |
| "loss": 1.4318, | |
| "step": 391000 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 2.728050708770752, | |
| "learning_rate": 2.112200563644904e-06, | |
| "loss": 1.4404, | |
| "step": 391500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 2.7059884071350098, | |
| "learning_rate": 2.0510411773915767e-06, | |
| "loss": 1.4343, | |
| "step": 392000 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 2.7897191047668457, | |
| "learning_rate": 1.9898817911382497e-06, | |
| "loss": 1.4264, | |
| "step": 392500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 2.517503261566162, | |
| "learning_rate": 1.9287224048849227e-06, | |
| "loss": 1.4393, | |
| "step": 393000 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 2.8523480892181396, | |
| "learning_rate": 1.8675630186315954e-06, | |
| "loss": 1.4253, | |
| "step": 393500 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 2.5820095539093018, | |
| "learning_rate": 1.8064036323782684e-06, | |
| "loss": 1.4311, | |
| "step": 394000 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 2.8148810863494873, | |
| "learning_rate": 1.7452442461249414e-06, | |
| "loss": 1.4347, | |
| "step": 394500 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 2.8168435096740723, | |
| "learning_rate": 1.6840848598716142e-06, | |
| "loss": 1.4316, | |
| "step": 395000 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 2.9340474605560303, | |
| "learning_rate": 1.6229254736182872e-06, | |
| "loss": 1.4312, | |
| "step": 395500 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 2.8138039112091064, | |
| "learning_rate": 1.5617660873649602e-06, | |
| "loss": 1.4246, | |
| "step": 396000 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 2.826143980026245, | |
| "learning_rate": 1.500606701111633e-06, | |
| "loss": 1.4416, | |
| "step": 396500 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 2.8407280445098877, | |
| "learning_rate": 1.439447314858306e-06, | |
| "loss": 1.4341, | |
| "step": 397000 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.8632097244262695, | |
| "learning_rate": 1.378287928604979e-06, | |
| "loss": 1.4253, | |
| "step": 397500 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.6355011463165283, | |
| "learning_rate": 1.317128542351652e-06, | |
| "loss": 1.4387, | |
| "step": 398000 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.9903597831726074, | |
| "learning_rate": 1.2559691560983247e-06, | |
| "loss": 1.4269, | |
| "step": 398500 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 2.6634271144866943, | |
| "learning_rate": 1.1948097698449977e-06, | |
| "loss": 1.4278, | |
| "step": 399000 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 2.844621419906616, | |
| "learning_rate": 1.1336503835916707e-06, | |
| "loss": 1.4376, | |
| "step": 399500 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 2.8783321380615234, | |
| "learning_rate": 1.0724909973383437e-06, | |
| "loss": 1.4309, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 2.565383195877075, | |
| "learning_rate": 1.0113316110850164e-06, | |
| "loss": 1.4182, | |
| "step": 400500 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 2.6493799686431885, | |
| "learning_rate": 9.501722248316894e-07, | |
| "loss": 1.4354, | |
| "step": 401000 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 2.7736098766326904, | |
| "learning_rate": 8.890128385783623e-07, | |
| "loss": 1.4269, | |
| "step": 401500 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 2.5747058391571045, | |
| "learning_rate": 8.278534523250352e-07, | |
| "loss": 1.4244, | |
| "step": 402000 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 2.9367337226867676, | |
| "learning_rate": 7.666940660717082e-07, | |
| "loss": 1.4339, | |
| "step": 402500 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 3.1144633293151855, | |
| "learning_rate": 7.055346798183812e-07, | |
| "loss": 1.4342, | |
| "step": 403000 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 2.8952739238739014, | |
| "learning_rate": 6.443752935650541e-07, | |
| "loss": 1.4216, | |
| "step": 403500 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 2.9333155155181885, | |
| "learning_rate": 5.83215907311727e-07, | |
| "loss": 1.4254, | |
| "step": 404000 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 2.900174140930176, | |
| "learning_rate": 5.220565210583999e-07, | |
| "loss": 1.433, | |
| "step": 404500 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 2.6606194972991943, | |
| "learning_rate": 4.608971348050728e-07, | |
| "loss": 1.427, | |
| "step": 405000 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 2.6916987895965576, | |
| "learning_rate": 3.997377485517458e-07, | |
| "loss": 1.4344, | |
| "step": 405500 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 2.7830684185028076, | |
| "learning_rate": 3.385783622984187e-07, | |
| "loss": 1.432, | |
| "step": 406000 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 2.9338104724884033, | |
| "learning_rate": 2.7741897604509164e-07, | |
| "loss": 1.4311, | |
| "step": 406500 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 2.861415147781372, | |
| "learning_rate": 2.1625958979176455e-07, | |
| "loss": 1.4245, | |
| "step": 407000 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 3.0713891983032227, | |
| "learning_rate": 1.5510020353843746e-07, | |
| "loss": 1.4246, | |
| "step": 407500 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 2.7229363918304443, | |
| "learning_rate": 9.39408172851104e-08, | |
| "loss": 1.4289, | |
| "step": 408000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.5204861164093018, | |
| "learning_rate": 3.278143103178331e-08, | |
| "loss": 1.4267, | |
| "step": 408500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 408768, | |
| "total_flos": 3.442569138534612e+18, | |
| "train_loss": 1.939745183989198, | |
| "train_runtime": 329612.5954, | |
| "train_samples_per_second": 39.685, | |
| "train_steps_per_second": 1.24 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 408768, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "total_flos": 3.442569138534612e+18, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |