| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 296, | |
| "global_step": 592, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0016891891891891893, | |
| "grad_norm": 14.480994151598582, | |
| "learning_rate": 0.0, | |
| "loss": 0.8543, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0016891891891891893, | |
| "eval_loss": 0.9275368452072144, | |
| "eval_runtime": 972.3995, | |
| "eval_samples_per_second": 5.605, | |
| "eval_steps_per_second": 0.351, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0033783783783783786, | |
| "grad_norm": 12.018325552585269, | |
| "learning_rate": 1e-07, | |
| "loss": 0.8892, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.005067567567567568, | |
| "grad_norm": 12.365747505763508, | |
| "learning_rate": 2e-07, | |
| "loss": 0.8855, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.006756756756756757, | |
| "grad_norm": 12.800829498404358, | |
| "learning_rate": 3e-07, | |
| "loss": 0.9267, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.008445945945945946, | |
| "grad_norm": 11.737599190531979, | |
| "learning_rate": 4e-07, | |
| "loss": 0.9281, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.010135135135135136, | |
| "grad_norm": 8.637373821708078, | |
| "learning_rate": 5e-07, | |
| "loss": 0.8875, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.011824324324324325, | |
| "grad_norm": 7.636499275338768, | |
| "learning_rate": 6e-07, | |
| "loss": 0.872, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.013513513513513514, | |
| "grad_norm": 8.385216478501201, | |
| "learning_rate": 7e-07, | |
| "loss": 0.9226, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.015202702702702704, | |
| "grad_norm": 6.885449636338203, | |
| "learning_rate": 8e-07, | |
| "loss": 0.8754, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.016891891891891893, | |
| "grad_norm": 5.298697529194119, | |
| "learning_rate": 9e-07, | |
| "loss": 0.8494, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.018581081081081082, | |
| "grad_norm": 3.472286230584162, | |
| "learning_rate": 1e-06, | |
| "loss": 0.9455, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.02027027027027027, | |
| "grad_norm": 3.0364570491026885, | |
| "learning_rate": 1.1e-06, | |
| "loss": 0.9047, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.02195945945945946, | |
| "grad_norm": 2.6279136763742565, | |
| "learning_rate": 1.2e-06, | |
| "loss": 0.8626, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.02364864864864865, | |
| "grad_norm": 3.9893451802262754, | |
| "learning_rate": 1.3e-06, | |
| "loss": 0.8449, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.02533783783783784, | |
| "grad_norm": 2.4501659804647864, | |
| "learning_rate": 1.4e-06, | |
| "loss": 0.8304, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02702702702702703, | |
| "grad_norm": 3.1609047731381277, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.8189, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.028716216216216218, | |
| "grad_norm": 2.708859155242058, | |
| "learning_rate": 1.6e-06, | |
| "loss": 0.8888, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.030405405405405407, | |
| "grad_norm": 2.5829432245407586, | |
| "learning_rate": 1.6999999999999998e-06, | |
| "loss": 0.8441, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.03209459459459459, | |
| "grad_norm": 2.2191942002061102, | |
| "learning_rate": 1.8e-06, | |
| "loss": 0.8493, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.033783783783783786, | |
| "grad_norm": 2.1116693433922187, | |
| "learning_rate": 1.8999999999999998e-06, | |
| "loss": 0.8055, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03547297297297297, | |
| "grad_norm": 1.8584288181443849, | |
| "learning_rate": 2e-06, | |
| "loss": 0.7823, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.037162162162162164, | |
| "grad_norm": 2.0456344199742884, | |
| "learning_rate": 2.1e-06, | |
| "loss": 0.8027, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.03885135135135135, | |
| "grad_norm": 2.9705048998517145, | |
| "learning_rate": 2.2e-06, | |
| "loss": 0.8329, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.04054054054054054, | |
| "grad_norm": 1.7403034865771894, | |
| "learning_rate": 2.2999999999999996e-06, | |
| "loss": 0.7923, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.04222972972972973, | |
| "grad_norm": 1.694602617591211, | |
| "learning_rate": 2.4e-06, | |
| "loss": 0.7855, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04391891891891892, | |
| "grad_norm": 1.9634724614647339, | |
| "learning_rate": 2.4999999999999998e-06, | |
| "loss": 0.8557, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.04560810810810811, | |
| "grad_norm": 1.7393403646085595, | |
| "learning_rate": 2.6e-06, | |
| "loss": 0.7526, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.0472972972972973, | |
| "grad_norm": 1.6361547258883284, | |
| "learning_rate": 2.7e-06, | |
| "loss": 0.7496, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.048986486486486486, | |
| "grad_norm": 1.6311908238581783, | |
| "learning_rate": 2.8e-06, | |
| "loss": 0.7876, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.05067567567567568, | |
| "grad_norm": 1.6978843460505462, | |
| "learning_rate": 2.8999999999999998e-06, | |
| "loss": 0.7926, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.052364864864864864, | |
| "grad_norm": 1.700306641243955, | |
| "learning_rate": 3e-06, | |
| "loss": 0.8115, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.05405405405405406, | |
| "grad_norm": 1.6289463279253593, | |
| "learning_rate": 3.1e-06, | |
| "loss": 0.7209, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.05574324324324324, | |
| "grad_norm": 1.5306464291392823, | |
| "learning_rate": 3.2e-06, | |
| "loss": 0.7284, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.057432432432432436, | |
| "grad_norm": 1.6255878142473301, | |
| "learning_rate": 3.2999999999999997e-06, | |
| "loss": 0.8205, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.05912162162162162, | |
| "grad_norm": 1.5806183262012883, | |
| "learning_rate": 3.3999999999999996e-06, | |
| "loss": 0.7471, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.060810810810810814, | |
| "grad_norm": 1.4925108083317444, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.7333, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 1.47721649504872, | |
| "learning_rate": 3.6e-06, | |
| "loss": 0.785, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.06418918918918919, | |
| "grad_norm": 1.44983881509458, | |
| "learning_rate": 3.7e-06, | |
| "loss": 0.7666, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.06587837837837837, | |
| "grad_norm": 1.5847671132911032, | |
| "learning_rate": 3.7999999999999996e-06, | |
| "loss": 0.7811, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.06756756756756757, | |
| "grad_norm": 1.498427406292611, | |
| "learning_rate": 3.9e-06, | |
| "loss": 0.7249, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06925675675675676, | |
| "grad_norm": 1.5689523363961653, | |
| "learning_rate": 4e-06, | |
| "loss": 0.7381, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.07094594594594594, | |
| "grad_norm": 1.7469630894889612, | |
| "learning_rate": 3.999992458679062e-06, | |
| "loss": 0.734, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.07263513513513513, | |
| "grad_norm": 1.4229571463310593, | |
| "learning_rate": 3.999969834773121e-06, | |
| "loss": 0.7057, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.07432432432432433, | |
| "grad_norm": 1.4908625411485357, | |
| "learning_rate": 3.99993212845279e-06, | |
| "loss": 0.7758, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.07601351351351351, | |
| "grad_norm": 1.650679164773279, | |
| "learning_rate": 3.9998793400024255e-06, | |
| "loss": 0.7301, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0777027027027027, | |
| "grad_norm": 1.7430443156485362, | |
| "learning_rate": 3.99981146982012e-06, | |
| "loss": 0.8024, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.07939189189189189, | |
| "grad_norm": 1.4650446297524533, | |
| "learning_rate": 3.999728518417708e-06, | |
| "loss": 0.7601, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.08108108108108109, | |
| "grad_norm": 1.4330737453276827, | |
| "learning_rate": 3.99963048642075e-06, | |
| "loss": 0.6946, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.08277027027027027, | |
| "grad_norm": 1.6240982322183675, | |
| "learning_rate": 3.999517374568536e-06, | |
| "loss": 0.7218, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.08445945945945946, | |
| "grad_norm": 1.5139893643970004, | |
| "learning_rate": 3.9993891837140806e-06, | |
| "loss": 0.7464, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08614864864864864, | |
| "grad_norm": 1.3839941970510703, | |
| "learning_rate": 3.999245914824112e-06, | |
| "loss": 0.783, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.08783783783783784, | |
| "grad_norm": 1.4041597855533474, | |
| "learning_rate": 3.999087568979067e-06, | |
| "loss": 0.7357, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.08952702702702703, | |
| "grad_norm": 1.4228074713312242, | |
| "learning_rate": 3.9989141473730804e-06, | |
| "loss": 0.7528, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.09121621621621621, | |
| "grad_norm": 1.45544094128086, | |
| "learning_rate": 3.998725651313984e-06, | |
| "loss": 0.7121, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.0929054054054054, | |
| "grad_norm": 2.4199878467039975, | |
| "learning_rate": 3.998522082223282e-06, | |
| "loss": 0.7945, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0945945945945946, | |
| "grad_norm": 1.5961731492055695, | |
| "learning_rate": 3.9983034416361594e-06, | |
| "loss": 0.7397, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.09628378378378379, | |
| "grad_norm": 1.4512814325426553, | |
| "learning_rate": 3.998069731201452e-06, | |
| "loss": 0.7034, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.09797297297297297, | |
| "grad_norm": 1.5843175046581277, | |
| "learning_rate": 3.997820952681645e-06, | |
| "loss": 0.7441, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.09966216216216216, | |
| "grad_norm": 1.3953565784623518, | |
| "learning_rate": 3.9975571079528596e-06, | |
| "loss": 0.7193, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.10135135135135136, | |
| "grad_norm": 1.3805266599685402, | |
| "learning_rate": 3.997278199004831e-06, | |
| "loss": 0.7262, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.10304054054054054, | |
| "grad_norm": 1.4612943703890746, | |
| "learning_rate": 3.996984227940902e-06, | |
| "loss": 0.7983, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.10472972972972973, | |
| "grad_norm": 1.810133635495203, | |
| "learning_rate": 3.9966751969780025e-06, | |
| "loss": 0.7769, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.10641891891891891, | |
| "grad_norm": 1.4598985705910221, | |
| "learning_rate": 3.996351108446635e-06, | |
| "loss": 0.7429, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.10810810810810811, | |
| "grad_norm": 1.443083533415171, | |
| "learning_rate": 3.9960119647908545e-06, | |
| "loss": 0.732, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.1097972972972973, | |
| "grad_norm": 1.3883608013574098, | |
| "learning_rate": 3.995657768568251e-06, | |
| "loss": 0.716, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.11148648648648649, | |
| "grad_norm": 1.498198395419281, | |
| "learning_rate": 3.995288522449935e-06, | |
| "loss": 0.6985, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.11317567567567567, | |
| "grad_norm": 1.4554676659901469, | |
| "learning_rate": 3.994904229220507e-06, | |
| "loss": 0.7372, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.11486486486486487, | |
| "grad_norm": 1.5008357924858833, | |
| "learning_rate": 3.994504891778047e-06, | |
| "loss": 0.7126, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.11655405405405406, | |
| "grad_norm": 1.419811370376343, | |
| "learning_rate": 3.994090513134086e-06, | |
| "loss": 0.7243, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.11824324324324324, | |
| "grad_norm": 1.442331674686139, | |
| "learning_rate": 3.9936610964135874e-06, | |
| "loss": 0.7305, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.11993243243243243, | |
| "grad_norm": 1.5182212943849125, | |
| "learning_rate": 3.99321664485492e-06, | |
| "loss": 0.6941, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.12162162162162163, | |
| "grad_norm": 1.3401060100127358, | |
| "learning_rate": 3.992757161809835e-06, | |
| "loss": 0.7111, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.12331081081081081, | |
| "grad_norm": 1.4117702617033703, | |
| "learning_rate": 3.992282650743443e-06, | |
| "loss": 0.7177, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 1.6738584983948046, | |
| "learning_rate": 3.991793115234182e-06, | |
| "loss": 0.7896, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.1266891891891892, | |
| "grad_norm": 1.8574841037381855, | |
| "learning_rate": 3.991288558973798e-06, | |
| "loss": 0.7902, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.12837837837837837, | |
| "grad_norm": 1.497828677704926, | |
| "learning_rate": 3.990768985767312e-06, | |
| "loss": 0.7076, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.13006756756756757, | |
| "grad_norm": 1.52038921491966, | |
| "learning_rate": 3.9902343995329916e-06, | |
| "loss": 0.8006, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.13175675675675674, | |
| "grad_norm": 1.4458874014195138, | |
| "learning_rate": 3.989684804302323e-06, | |
| "loss": 0.7043, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.13344594594594594, | |
| "grad_norm": 2.0596220547582176, | |
| "learning_rate": 3.98912020421998e-06, | |
| "loss": 0.7644, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.13513513513513514, | |
| "grad_norm": 2.8723719852090985, | |
| "learning_rate": 3.988540603543794e-06, | |
| "loss": 0.766, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.13682432432432431, | |
| "grad_norm": 1.3880889948705695, | |
| "learning_rate": 3.98794600664472e-06, | |
| "loss": 0.7704, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.13851351351351351, | |
| "grad_norm": 1.3581068937102048, | |
| "learning_rate": 3.987336418006802e-06, | |
| "loss": 0.713, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.14020270270270271, | |
| "grad_norm": 1.435765423914678, | |
| "learning_rate": 3.986711842227146e-06, | |
| "loss": 0.7187, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.14189189189189189, | |
| "grad_norm": 1.4288300487189134, | |
| "learning_rate": 3.9860722840158765e-06, | |
| "loss": 0.6821, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.14358108108108109, | |
| "grad_norm": 1.441392279654618, | |
| "learning_rate": 3.985417748196107e-06, | |
| "loss": 0.7272, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.14527027027027026, | |
| "grad_norm": 2.182568121286923, | |
| "learning_rate": 3.984748239703905e-06, | |
| "loss": 0.7365, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.14695945945945946, | |
| "grad_norm": 1.4323037713767275, | |
| "learning_rate": 3.984063763588246e-06, | |
| "loss": 0.7054, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.14864864864864866, | |
| "grad_norm": 1.4352524812290801, | |
| "learning_rate": 3.983364325010986e-06, | |
| "loss": 0.7827, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.15033783783783783, | |
| "grad_norm": 1.3585268883879378, | |
| "learning_rate": 3.9826499292468135e-06, | |
| "loss": 0.7121, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.15202702702702703, | |
| "grad_norm": 1.468711558201286, | |
| "learning_rate": 3.981920581683218e-06, | |
| "loss": 0.7408, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.15371621621621623, | |
| "grad_norm": 1.5155258611932023, | |
| "learning_rate": 3.981176287820444e-06, | |
| "loss": 0.6812, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.1554054054054054, | |
| "grad_norm": 1.4211773705458912, | |
| "learning_rate": 3.9804170532714495e-06, | |
| "loss": 0.7134, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.1570945945945946, | |
| "grad_norm": 1.5526244069056863, | |
| "learning_rate": 3.979642883761865e-06, | |
| "loss": 0.773, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.15878378378378377, | |
| "grad_norm": 1.4535911061477287, | |
| "learning_rate": 3.978853785129953e-06, | |
| "loss": 0.6815, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.16047297297297297, | |
| "grad_norm": 1.377349590888399, | |
| "learning_rate": 3.978049763326558e-06, | |
| "loss": 0.6711, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.16216216216216217, | |
| "grad_norm": 1.4269071997724252, | |
| "learning_rate": 3.977230824415068e-06, | |
| "loss": 0.725, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.16385135135135134, | |
| "grad_norm": 1.8258026487599963, | |
| "learning_rate": 3.9763969745713635e-06, | |
| "loss": 0.742, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.16554054054054054, | |
| "grad_norm": 1.5637199081600301, | |
| "learning_rate": 3.975548220083773e-06, | |
| "loss": 0.7176, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.16722972972972974, | |
| "grad_norm": 1.4418683327068436, | |
| "learning_rate": 3.974684567353027e-06, | |
| "loss": 0.7704, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.16891891891891891, | |
| "grad_norm": 1.500650299775909, | |
| "learning_rate": 3.973806022892209e-06, | |
| "loss": 0.7777, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.17060810810810811, | |
| "grad_norm": 1.4229499052679826, | |
| "learning_rate": 3.972912593326703e-06, | |
| "loss": 0.6773, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.17229729729729729, | |
| "grad_norm": 1.4069619452197821, | |
| "learning_rate": 3.9720042853941494e-06, | |
| "loss": 0.7166, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.17398648648648649, | |
| "grad_norm": 1.4641029705511208, | |
| "learning_rate": 3.971081105944389e-06, | |
| "loss": 0.6622, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.17567567567567569, | |
| "grad_norm": 1.3822866897164043, | |
| "learning_rate": 3.970143061939414e-06, | |
| "loss": 0.6678, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.17736486486486486, | |
| "grad_norm": 1.3486512924905725, | |
| "learning_rate": 3.969190160453317e-06, | |
| "loss": 0.7085, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.17905405405405406, | |
| "grad_norm": 1.4080642954468132, | |
| "learning_rate": 3.968222408672232e-06, | |
| "loss": 0.6805, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.18074324324324326, | |
| "grad_norm": 1.3245403639949178, | |
| "learning_rate": 3.9672398138942874e-06, | |
| "loss": 0.6723, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.18243243243243243, | |
| "grad_norm": 1.522527598129808, | |
| "learning_rate": 3.966242383529544e-06, | |
| "loss": 0.7348, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.18412162162162163, | |
| "grad_norm": 1.4593718177402826, | |
| "learning_rate": 3.965230125099946e-06, | |
| "loss": 0.6859, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.1858108108108108, | |
| "grad_norm": 1.6113096358693728, | |
| "learning_rate": 3.964203046239258e-06, | |
| "loss": 0.7133, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 2.1901731817473227, | |
| "learning_rate": 3.963161154693013e-06, | |
| "loss": 0.6989, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.1891891891891892, | |
| "grad_norm": 1.4041902139290952, | |
| "learning_rate": 3.962104458318446e-06, | |
| "loss": 0.683, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.19087837837837837, | |
| "grad_norm": 1.3355403920639282, | |
| "learning_rate": 3.961032965084447e-06, | |
| "loss": 0.7055, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.19256756756756757, | |
| "grad_norm": 1.4378012677241268, | |
| "learning_rate": 3.959946683071489e-06, | |
| "loss": 0.7721, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.19425675675675674, | |
| "grad_norm": 1.5234306742219028, | |
| "learning_rate": 3.958845620471573e-06, | |
| "loss": 0.7021, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.19594594594594594, | |
| "grad_norm": 1.329738948124563, | |
| "learning_rate": 3.957729785588166e-06, | |
| "loss": 0.6521, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.19763513513513514, | |
| "grad_norm": 1.5992344878568325, | |
| "learning_rate": 3.956599186836137e-06, | |
| "loss": 0.7275, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.19932432432432431, | |
| "grad_norm": 1.4020110254102955, | |
| "learning_rate": 3.955453832741693e-06, | |
| "loss": 0.6748, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.20101351351351351, | |
| "grad_norm": 1.3573035489579244, | |
| "learning_rate": 3.954293731942319e-06, | |
| "loss": 0.7025, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.20270270270270271, | |
| "grad_norm": 1.335897627914034, | |
| "learning_rate": 3.953118893186705e-06, | |
| "loss": 0.6984, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.20439189189189189, | |
| "grad_norm": 1.4918423194384791, | |
| "learning_rate": 3.951929325334689e-06, | |
| "loss": 0.6686, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.20608108108108109, | |
| "grad_norm": 1.6979742909503677, | |
| "learning_rate": 3.950725037357182e-06, | |
| "loss": 0.7658, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.20777027027027026, | |
| "grad_norm": 1.6274287291465286, | |
| "learning_rate": 3.949506038336108e-06, | |
| "loss": 0.7144, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.20945945945945946, | |
| "grad_norm": 1.3302624319860148, | |
| "learning_rate": 3.94827233746433e-06, | |
| "loss": 0.6349, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.21114864864864866, | |
| "grad_norm": 1.4826441921680285, | |
| "learning_rate": 3.94702394404558e-06, | |
| "loss": 0.7219, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.21283783783783783, | |
| "grad_norm": 1.4005282602108076, | |
| "learning_rate": 3.9457608674943945e-06, | |
| "loss": 0.7359, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.21452702702702703, | |
| "grad_norm": 1.5877469483748996, | |
| "learning_rate": 3.9444831173360406e-06, | |
| "loss": 0.693, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.21621621621621623, | |
| "grad_norm": 1.5442843013623662, | |
| "learning_rate": 3.94319070320644e-06, | |
| "loss": 0.7417, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.2179054054054054, | |
| "grad_norm": 1.4333541902470606, | |
| "learning_rate": 3.941883634852104e-06, | |
| "loss": 0.6699, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.2195945945945946, | |
| "grad_norm": 1.5080751183762988, | |
| "learning_rate": 3.940561922130054e-06, | |
| "loss": 0.6839, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.22128378378378377, | |
| "grad_norm": 1.3898529898507852, | |
| "learning_rate": 3.93922557500775e-06, | |
| "loss": 0.663, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.22297297297297297, | |
| "grad_norm": 1.3772254909736483, | |
| "learning_rate": 3.937874603563015e-06, | |
| "loss": 0.6593, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.22466216216216217, | |
| "grad_norm": 1.760208679944535, | |
| "learning_rate": 3.936509017983956e-06, | |
| "loss": 0.7082, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.22635135135135134, | |
| "grad_norm": 1.8196575759309455, | |
| "learning_rate": 3.935128828568896e-06, | |
| "loss": 0.6945, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.22804054054054054, | |
| "grad_norm": 1.385346989951205, | |
| "learning_rate": 3.933734045726283e-06, | |
| "loss": 0.6863, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.22972972972972974, | |
| "grad_norm": 1.3685547174486146, | |
| "learning_rate": 3.932324679974623e-06, | |
| "loss": 0.7477, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.23141891891891891, | |
| "grad_norm": 1.4295145309120525, | |
| "learning_rate": 3.930900741942396e-06, | |
| "loss": 0.6747, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.23310810810810811, | |
| "grad_norm": 1.3906500390926828, | |
| "learning_rate": 3.929462242367975e-06, | |
| "loss": 0.764, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.23479729729729729, | |
| "grad_norm": 1.384756160443465, | |
| "learning_rate": 3.928009192099548e-06, | |
| "loss": 0.6678, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.23648648648648649, | |
| "grad_norm": 1.7220512704516102, | |
| "learning_rate": 3.926541602095032e-06, | |
| "loss": 0.7969, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.23817567567567569, | |
| "grad_norm": 1.477284673606475, | |
| "learning_rate": 3.925059483421996e-06, | |
| "loss": 0.6866, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.23986486486486486, | |
| "grad_norm": 1.431239746187604, | |
| "learning_rate": 3.9235628472575705e-06, | |
| "loss": 0.6979, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.24155405405405406, | |
| "grad_norm": 1.604028789928828, | |
| "learning_rate": 3.92205170488837e-06, | |
| "loss": 0.6599, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.24324324324324326, | |
| "grad_norm": 1.3489137098771946, | |
| "learning_rate": 3.9205260677104055e-06, | |
| "loss": 0.7128, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.24493243243243243, | |
| "grad_norm": 1.3940741727669859, | |
| "learning_rate": 3.9189859472289945e-06, | |
| "loss": 0.6735, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.24662162162162163, | |
| "grad_norm": 1.3893021240993075, | |
| "learning_rate": 3.917431355058681e-06, | |
| "loss": 0.6915, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.2483108108108108, | |
| "grad_norm": 1.5080140713249557, | |
| "learning_rate": 3.915862302923143e-06, | |
| "loss": 0.7439, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.3355739307749024, | |
| "learning_rate": 3.914278802655106e-06, | |
| "loss": 0.7065, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.2516891891891892, | |
| "grad_norm": 1.6241902843973968, | |
| "learning_rate": 3.912680866196255e-06, | |
| "loss": 0.7081, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.2533783783783784, | |
| "grad_norm": 1.5261637427264052, | |
| "learning_rate": 3.9110685055971406e-06, | |
| "loss": 0.6994, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.25506756756756754, | |
| "grad_norm": 1.479630710856107, | |
| "learning_rate": 3.909441733017091e-06, | |
| "loss": 0.689, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.25675675675675674, | |
| "grad_norm": 1.4933663608483565, | |
| "learning_rate": 3.907800560724121e-06, | |
| "loss": 0.6942, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.25844594594594594, | |
| "grad_norm": 1.5302529362714137, | |
| "learning_rate": 3.906145001094839e-06, | |
| "loss": 0.6868, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.26013513513513514, | |
| "grad_norm": 1.4040014862243637, | |
| "learning_rate": 3.904475066614349e-06, | |
| "loss": 0.6988, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.26182432432432434, | |
| "grad_norm": 1.4226489729992533, | |
| "learning_rate": 3.902790769876164e-06, | |
| "loss": 0.7645, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.2635135135135135, | |
| "grad_norm": 1.5321366676822086, | |
| "learning_rate": 3.901092123582107e-06, | |
| "loss": 0.7381, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.2652027027027027, | |
| "grad_norm": 1.430585178455411, | |
| "learning_rate": 3.899379140542213e-06, | |
| "loss": 0.7184, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.2668918918918919, | |
| "grad_norm": 1.4320550163328156, | |
| "learning_rate": 3.897651833674639e-06, | |
| "loss": 0.6898, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.2685810810810811, | |
| "grad_norm": 1.4007180357015416, | |
| "learning_rate": 3.895910216005559e-06, | |
| "loss": 0.7204, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.2702702702702703, | |
| "grad_norm": 1.4760164587014226, | |
| "learning_rate": 3.894154300669071e-06, | |
| "loss": 0.6809, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2719594594594595, | |
| "grad_norm": 1.359805865137023, | |
| "learning_rate": 3.892384100907097e-06, | |
| "loss": 0.6664, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.27364864864864863, | |
| "grad_norm": 1.494271130133251, | |
| "learning_rate": 3.89059963006928e-06, | |
| "loss": 0.7812, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.27533783783783783, | |
| "grad_norm": 1.4693114375854741, | |
| "learning_rate": 3.888800901612889e-06, | |
| "loss": 0.6843, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.27702702702702703, | |
| "grad_norm": 1.3756308293515527, | |
| "learning_rate": 3.886987929102711e-06, | |
| "loss": 0.6712, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.27871621621621623, | |
| "grad_norm": 1.4414208695652082, | |
| "learning_rate": 3.885160726210954e-06, | |
| "loss": 0.6686, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.28040540540540543, | |
| "grad_norm": 1.6068914930388214, | |
| "learning_rate": 3.883319306717143e-06, | |
| "loss": 0.6479, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.28209459459459457, | |
| "grad_norm": 1.5676401171578263, | |
| "learning_rate": 3.881463684508011e-06, | |
| "loss": 0.7549, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.28378378378378377, | |
| "grad_norm": 1.370828841111984, | |
| "learning_rate": 3.879593873577402e-06, | |
| "loss": 0.7173, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.28547297297297297, | |
| "grad_norm": 1.3973893255241456, | |
| "learning_rate": 3.877709888026159e-06, | |
| "loss": 0.7046, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.28716216216216217, | |
| "grad_norm": 1.3059515131618362, | |
| "learning_rate": 3.875811742062024e-06, | |
| "loss": 0.6521, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.28885135135135137, | |
| "grad_norm": 1.3428841635145867, | |
| "learning_rate": 3.873899449999524e-06, | |
| "loss": 0.6404, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.2905405405405405, | |
| "grad_norm": 1.3845007444348196, | |
| "learning_rate": 3.871973026259865e-06, | |
| "loss": 0.6715, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.2922297297297297, | |
| "grad_norm": 1.4166123181291566, | |
| "learning_rate": 3.8700324853708295e-06, | |
| "loss": 0.7466, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.2939189189189189, | |
| "grad_norm": 1.3710082585316177, | |
| "learning_rate": 3.8680778419666576e-06, | |
| "loss": 0.6271, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.2956081081081081, | |
| "grad_norm": 1.3982194102229981, | |
| "learning_rate": 3.8661091107879434e-06, | |
| "loss": 0.7215, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.2972972972972973, | |
| "grad_norm": 1.7210038720236769, | |
| "learning_rate": 3.8641263066815205e-06, | |
| "loss": 0.721, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.2989864864864865, | |
| "grad_norm": 1.4097607569308712, | |
| "learning_rate": 3.862129444600349e-06, | |
| "loss": 0.6755, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.30067567567567566, | |
| "grad_norm": 1.4419897609366905, | |
| "learning_rate": 3.86011853960341e-06, | |
| "loss": 0.7348, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.30236486486486486, | |
| "grad_norm": 1.507066037469795, | |
| "learning_rate": 3.8580936068555815e-06, | |
| "loss": 0.7768, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.30405405405405406, | |
| "grad_norm": 1.4044353506211649, | |
| "learning_rate": 3.856054661627532e-06, | |
| "loss": 0.6697, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.30574324324324326, | |
| "grad_norm": 1.3011596028963899, | |
| "learning_rate": 3.854001719295601e-06, | |
| "loss": 0.6593, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.30743243243243246, | |
| "grad_norm": 1.4882426091073055, | |
| "learning_rate": 3.851934795341686e-06, | |
| "loss": 0.725, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.3091216216216216, | |
| "grad_norm": 1.412044516159466, | |
| "learning_rate": 3.849853905353123e-06, | |
| "loss": 0.7254, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.3108108108108108, | |
| "grad_norm": 1.3552048527285785, | |
| "learning_rate": 3.847759065022573e-06, | |
| "loss": 0.6612, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 1.4893729275292906, | |
| "learning_rate": 3.845650290147898e-06, | |
| "loss": 0.7761, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.3141891891891892, | |
| "grad_norm": 1.5227378731808672, | |
| "learning_rate": 3.843527596632047e-06, | |
| "loss": 0.7112, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.3158783783783784, | |
| "grad_norm": 1.3652595903876206, | |
| "learning_rate": 3.841391000482931e-06, | |
| "loss": 0.7052, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.31756756756756754, | |
| "grad_norm": 1.3170940458876272, | |
| "learning_rate": 3.839240517813311e-06, | |
| "loss": 0.621, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.31925675675675674, | |
| "grad_norm": 1.4119825821508496, | |
| "learning_rate": 3.837076164840663e-06, | |
| "loss": 0.6619, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.32094594594594594, | |
| "grad_norm": 1.7966706386985845, | |
| "learning_rate": 3.834897957887069e-06, | |
| "loss": 0.7411, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.32263513513513514, | |
| "grad_norm": 1.7260868732827916, | |
| "learning_rate": 3.832705913379087e-06, | |
| "loss": 0.6961, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.32432432432432434, | |
| "grad_norm": 1.417051027809352, | |
| "learning_rate": 3.830500047847628e-06, | |
| "loss": 0.681, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.3260135135135135, | |
| "grad_norm": 1.479899217335837, | |
| "learning_rate": 3.828280377927833e-06, | |
| "loss": 0.701, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.3277027027027027, | |
| "grad_norm": 1.4191602227151927, | |
| "learning_rate": 3.826046920358943e-06, | |
| "loss": 0.6454, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.3293918918918919, | |
| "grad_norm": 1.3454083625003137, | |
| "learning_rate": 3.82379969198418e-06, | |
| "loss": 0.6771, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3310810810810811, | |
| "grad_norm": 1.9842557988763356, | |
| "learning_rate": 3.821538709750614e-06, | |
| "loss": 0.7267, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.3327702702702703, | |
| "grad_norm": 1.4358235281752614, | |
| "learning_rate": 3.819263990709037e-06, | |
| "loss": 0.7401, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.3344594594594595, | |
| "grad_norm": 1.3515480343034862, | |
| "learning_rate": 3.816975552013836e-06, | |
| "loss": 0.672, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.33614864864864863, | |
| "grad_norm": 1.3859740753777527, | |
| "learning_rate": 3.814673410922861e-06, | |
| "loss": 0.6601, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.33783783783783783, | |
| "grad_norm": 1.4541566973376086, | |
| "learning_rate": 3.8123575847972977e-06, | |
| "loss": 0.7193, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.33952702702702703, | |
| "grad_norm": 1.480164342260533, | |
| "learning_rate": 3.8100280911015333e-06, | |
| "loss": 0.7088, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.34121621621621623, | |
| "grad_norm": 1.4289314706161513, | |
| "learning_rate": 3.8076849474030286e-06, | |
| "loss": 0.6853, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.34290540540540543, | |
| "grad_norm": 1.4130120631059673, | |
| "learning_rate": 3.8053281713721804e-06, | |
| "loss": 0.6854, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.34459459459459457, | |
| "grad_norm": 1.3964253471976507, | |
| "learning_rate": 3.802957780782195e-06, | |
| "loss": 0.6842, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.34628378378378377, | |
| "grad_norm": 1.2981821719803015, | |
| "learning_rate": 3.800573793508948e-06, | |
| "loss": 0.7186, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.34797297297297297, | |
| "grad_norm": 1.5058610876069536, | |
| "learning_rate": 3.7981762275308514e-06, | |
| "loss": 0.6836, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.34966216216216217, | |
| "grad_norm": 1.3543583050604886, | |
| "learning_rate": 3.7957651009287214e-06, | |
| "loss": 0.6814, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.35135135135135137, | |
| "grad_norm": 1.4591142445601828, | |
| "learning_rate": 3.7933404318856365e-06, | |
| "loss": 0.6472, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.3530405405405405, | |
| "grad_norm": 1.5357516266772646, | |
| "learning_rate": 3.7909022386868042e-06, | |
| "loss": 0.7252, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.3547297297297297, | |
| "grad_norm": 1.3504853847903922, | |
| "learning_rate": 3.7884505397194224e-06, | |
| "loss": 0.6627, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3564189189189189, | |
| "grad_norm": 1.4204456318259766, | |
| "learning_rate": 3.7859853534725393e-06, | |
| "loss": 0.7244, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.3581081081081081, | |
| "grad_norm": 1.3315700281844445, | |
| "learning_rate": 3.783506698536916e-06, | |
| "loss": 0.6536, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.3597972972972973, | |
| "grad_norm": 1.552425593407634, | |
| "learning_rate": 3.7810145936048846e-06, | |
| "loss": 0.6733, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.3614864864864865, | |
| "grad_norm": 1.407167273299544, | |
| "learning_rate": 3.778509057470208e-06, | |
| "loss": 0.6622, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.36317567567567566, | |
| "grad_norm": 1.4261346346811004, | |
| "learning_rate": 3.7759901090279385e-06, | |
| "loss": 0.7443, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.36486486486486486, | |
| "grad_norm": 1.370518370669021, | |
| "learning_rate": 3.7734577672742754e-06, | |
| "loss": 0.7047, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.36655405405405406, | |
| "grad_norm": 1.3021833652741044, | |
| "learning_rate": 3.7709120513064196e-06, | |
| "loss": 0.691, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.36824324324324326, | |
| "grad_norm": 1.3463537418495264, | |
| "learning_rate": 3.768352980322433e-06, | |
| "loss": 0.686, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.36993243243243246, | |
| "grad_norm": 1.5841455095635015, | |
| "learning_rate": 3.7657805736210905e-06, | |
| "loss": 0.7098, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.3716216216216216, | |
| "grad_norm": 1.2932754452322628, | |
| "learning_rate": 3.763194850601737e-06, | |
| "loss": 0.6325, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3733108108108108, | |
| "grad_norm": 1.3674162336593572, | |
| "learning_rate": 3.7605958307641393e-06, | |
| "loss": 0.7169, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 1.4335929535708738, | |
| "learning_rate": 3.7579835337083408e-06, | |
| "loss": 0.6958, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.3766891891891892, | |
| "grad_norm": 1.3707580832113364, | |
| "learning_rate": 3.755357979134511e-06, | |
| "loss": 0.6482, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.3783783783783784, | |
| "grad_norm": 1.4338771915089066, | |
| "learning_rate": 3.7527191868428003e-06, | |
| "loss": 0.7001, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.38006756756756754, | |
| "grad_norm": 1.4843289475059827, | |
| "learning_rate": 3.750067176733189e-06, | |
| "loss": 0.6732, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.38175675675675674, | |
| "grad_norm": 1.5775222455201545, | |
| "learning_rate": 3.7474019688053346e-06, | |
| "loss": 0.7174, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.38344594594594594, | |
| "grad_norm": 1.4415117197399947, | |
| "learning_rate": 3.744723583158427e-06, | |
| "loss": 0.7392, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.38513513513513514, | |
| "grad_norm": 1.343529072793593, | |
| "learning_rate": 3.742032039991031e-06, | |
| "loss": 0.6349, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.38682432432432434, | |
| "grad_norm": 2.10680046167969, | |
| "learning_rate": 3.739327359600938e-06, | |
| "loss": 0.6924, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.3885135135135135, | |
| "grad_norm": 1.332769490719328, | |
| "learning_rate": 3.736609562385011e-06, | |
| "loss": 0.6563, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3902027027027027, | |
| "grad_norm": 1.3133901303899669, | |
| "learning_rate": 3.73387866883903e-06, | |
| "loss": 0.6815, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.3918918918918919, | |
| "grad_norm": 1.358025933731894, | |
| "learning_rate": 3.731134699557541e-06, | |
| "loss": 0.7082, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.3935810810810811, | |
| "grad_norm": 1.3424987866213658, | |
| "learning_rate": 3.7283776752336966e-06, | |
| "loss": 0.6801, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.3952702702702703, | |
| "grad_norm": 1.4157554026587067, | |
| "learning_rate": 3.725607616659101e-06, | |
| "loss": 0.709, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.3969594594594595, | |
| "grad_norm": 1.3201455751629254, | |
| "learning_rate": 3.7228245447236565e-06, | |
| "loss": 0.7228, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.39864864864864863, | |
| "grad_norm": 1.327402943647592, | |
| "learning_rate": 3.7200284804154006e-06, | |
| "loss": 0.6788, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.40033783783783783, | |
| "grad_norm": 1.2643583008290484, | |
| "learning_rate": 3.717219444820353e-06, | |
| "loss": 0.7641, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.40202702702702703, | |
| "grad_norm": 1.409052594991303, | |
| "learning_rate": 3.7143974591223507e-06, | |
| "loss": 0.6654, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.40371621621621623, | |
| "grad_norm": 1.3890965869384935, | |
| "learning_rate": 3.711562544602895e-06, | |
| "loss": 0.7743, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.40540540540540543, | |
| "grad_norm": 1.2046354984122658, | |
| "learning_rate": 3.7087147226409854e-06, | |
| "loss": 0.6307, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.40709459459459457, | |
| "grad_norm": 1.3627485443451046, | |
| "learning_rate": 3.705854014712962e-06, | |
| "loss": 0.6762, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.40878378378378377, | |
| "grad_norm": 1.443105456149895, | |
| "learning_rate": 3.7029804423923405e-06, | |
| "loss": 0.6653, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.41047297297297297, | |
| "grad_norm": 1.3802124106502922, | |
| "learning_rate": 3.7000940273496526e-06, | |
| "loss": 0.6859, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.41216216216216217, | |
| "grad_norm": 1.3786708758892288, | |
| "learning_rate": 3.69719479135228e-06, | |
| "loss": 0.704, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.41385135135135137, | |
| "grad_norm": 1.6258577516719144, | |
| "learning_rate": 3.694282756264293e-06, | |
| "loss": 0.6779, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.4155405405405405, | |
| "grad_norm": 1.440767640944531, | |
| "learning_rate": 3.6913579440462813e-06, | |
| "loss": 0.6907, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.4172297297297297, | |
| "grad_norm": 1.5767111896131487, | |
| "learning_rate": 3.6884203767551933e-06, | |
| "loss": 0.7245, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.4189189189189189, | |
| "grad_norm": 1.4496934104421961, | |
| "learning_rate": 3.685470076544167e-06, | |
| "loss": 0.7094, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.4206081081081081, | |
| "grad_norm": 1.90004392560482, | |
| "learning_rate": 3.6825070656623624e-06, | |
| "loss": 0.6901, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.4222972972972973, | |
| "grad_norm": 1.4383292682665025, | |
| "learning_rate": 3.679531366454796e-06, | |
| "loss": 0.6375, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4239864864864865, | |
| "grad_norm": 1.2362701389465913, | |
| "learning_rate": 3.67654300136217e-06, | |
| "loss": 0.665, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.42567567567567566, | |
| "grad_norm": 1.46687594364853, | |
| "learning_rate": 3.6735419929207053e-06, | |
| "loss": 0.7318, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.42736486486486486, | |
| "grad_norm": 1.4321445965555986, | |
| "learning_rate": 3.670528363761969e-06, | |
| "loss": 0.7179, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.42905405405405406, | |
| "grad_norm": 1.8152115334660424, | |
| "learning_rate": 3.6675021366127065e-06, | |
| "loss": 0.7054, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.43074324324324326, | |
| "grad_norm": 1.429230095995432, | |
| "learning_rate": 3.6644633342946684e-06, | |
| "loss": 0.6142, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.43243243243243246, | |
| "grad_norm": 1.4522752487614075, | |
| "learning_rate": 3.6614119797244365e-06, | |
| "loss": 0.698, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.4341216216216216, | |
| "grad_norm": 1.5568752097630847, | |
| "learning_rate": 3.6583480959132564e-06, | |
| "loss": 0.7271, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.4358108108108108, | |
| "grad_norm": 1.3610557476395386, | |
| "learning_rate": 3.655271705966859e-06, | |
| "loss": 0.7206, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 1.4012888384359043, | |
| "learning_rate": 3.6521828330852876e-06, | |
| "loss": 0.7165, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.4391891891891892, | |
| "grad_norm": 1.4325647232787657, | |
| "learning_rate": 3.6490815005627244e-06, | |
| "loss": 0.7167, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.4408783783783784, | |
| "grad_norm": 1.3071946737965967, | |
| "learning_rate": 3.6459677317873127e-06, | |
| "loss": 0.7068, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.44256756756756754, | |
| "grad_norm": 1.3337930545508176, | |
| "learning_rate": 3.6428415502409832e-06, | |
| "loss": 0.6375, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.44425675675675674, | |
| "grad_norm": 1.374084685518983, | |
| "learning_rate": 3.6397029794992734e-06, | |
| "loss": 0.6604, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.44594594594594594, | |
| "grad_norm": 1.5129308344285353, | |
| "learning_rate": 3.6365520432311526e-06, | |
| "loss": 0.7708, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.44763513513513514, | |
| "grad_norm": 1.3480810073566278, | |
| "learning_rate": 3.633388765198843e-06, | |
| "loss": 0.672, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.44932432432432434, | |
| "grad_norm": 1.4142271399614021, | |
| "learning_rate": 3.6302131692576397e-06, | |
| "loss": 0.6493, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.4510135135135135, | |
| "grad_norm": 1.3926074050818393, | |
| "learning_rate": 3.62702527935573e-06, | |
| "loss": 0.7032, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.4527027027027027, | |
| "grad_norm": 1.3416220663606933, | |
| "learning_rate": 3.6238251195340146e-06, | |
| "loss": 0.6994, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.4543918918918919, | |
| "grad_norm": 1.4002752256004076, | |
| "learning_rate": 3.6206127139259264e-06, | |
| "loss": 0.713, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.4560810810810811, | |
| "grad_norm": 1.4228643045362366, | |
| "learning_rate": 3.6173880867572475e-06, | |
| "loss": 0.6981, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4577702702702703, | |
| "grad_norm": 1.3277514022496684, | |
| "learning_rate": 3.614151262345925e-06, | |
| "loss": 0.6278, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.4594594594594595, | |
| "grad_norm": 1.5614757390183474, | |
| "learning_rate": 3.610902265101892e-06, | |
| "loss": 0.6662, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.46114864864864863, | |
| "grad_norm": 1.377452591839154, | |
| "learning_rate": 3.607641119526878e-06, | |
| "loss": 0.7131, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.46283783783783783, | |
| "grad_norm": 1.3254757212732167, | |
| "learning_rate": 3.6043678502142293e-06, | |
| "loss": 0.6995, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.46452702702702703, | |
| "grad_norm": 1.361651841573283, | |
| "learning_rate": 3.6010824818487207e-06, | |
| "loss": 0.7173, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.46621621621621623, | |
| "grad_norm": 1.4283922402308293, | |
| "learning_rate": 3.5977850392063687e-06, | |
| "loss": 0.6889, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.46790540540540543, | |
| "grad_norm": 1.3050372631128504, | |
| "learning_rate": 3.5944755471542464e-06, | |
| "loss": 0.6299, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.46959459459459457, | |
| "grad_norm": 1.3310272595986719, | |
| "learning_rate": 3.591154030650296e-06, | |
| "loss": 0.7197, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.47128378378378377, | |
| "grad_norm": 1.462114533432708, | |
| "learning_rate": 3.587820514743139e-06, | |
| "loss": 0.6827, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.47297297297297297, | |
| "grad_norm": 1.360632157690388, | |
| "learning_rate": 3.5844750245718897e-06, | |
| "loss": 0.7245, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.47466216216216217, | |
| "grad_norm": 1.4456255398928297, | |
| "learning_rate": 3.5811175853659623e-06, | |
| "loss": 0.7162, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.47635135135135137, | |
| "grad_norm": 1.3510909857414901, | |
| "learning_rate": 3.5777482224448836e-06, | |
| "loss": 0.6797, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.4780405405405405, | |
| "grad_norm": 1.3321362166986153, | |
| "learning_rate": 3.5743669612181e-06, | |
| "loss": 0.6759, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.4797297297297297, | |
| "grad_norm": 1.379461432799908, | |
| "learning_rate": 3.570973827184789e-06, | |
| "loss": 0.7133, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.4814189189189189, | |
| "grad_norm": 1.4068419824919371, | |
| "learning_rate": 3.5675688459336623e-06, | |
| "loss": 0.6568, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.4831081081081081, | |
| "grad_norm": 1.3786876200293305, | |
| "learning_rate": 3.5641520431427766e-06, | |
| "loss": 0.6935, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.4847972972972973, | |
| "grad_norm": 1.3495616584593104, | |
| "learning_rate": 3.5607234445793387e-06, | |
| "loss": 0.6745, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.4864864864864865, | |
| "grad_norm": 1.3576777119745298, | |
| "learning_rate": 3.55728307609951e-06, | |
| "loss": 0.6784, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.48817567567567566, | |
| "grad_norm": 1.307035703069375, | |
| "learning_rate": 3.553830963648214e-06, | |
| "loss": 0.6857, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.48986486486486486, | |
| "grad_norm": 1.4002775832288137, | |
| "learning_rate": 3.5503671332589384e-06, | |
| "loss": 0.7473, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.49155405405405406, | |
| "grad_norm": 1.373483342786104, | |
| "learning_rate": 3.5468916110535397e-06, | |
| "loss": 0.7624, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.49324324324324326, | |
| "grad_norm": 1.301856349289806, | |
| "learning_rate": 3.5434044232420463e-06, | |
| "loss": 0.6667, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.49493243243243246, | |
| "grad_norm": 1.3278884354841156, | |
| "learning_rate": 3.539905596122461e-06, | |
| "loss": 0.6685, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.4966216216216216, | |
| "grad_norm": 1.3342318682934637, | |
| "learning_rate": 3.536395156080561e-06, | |
| "loss": 0.6051, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.4983108108108108, | |
| "grad_norm": 1.34236526276834, | |
| "learning_rate": 3.532873129589702e-06, | |
| "loss": 0.6658, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.3367750769250621, | |
| "learning_rate": 3.529339543210617e-06, | |
| "loss": 0.6505, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.5758998990058899, | |
| "eval_runtime": 949.2326, | |
| "eval_samples_per_second": 5.741, | |
| "eval_steps_per_second": 0.359, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.5016891891891891, | |
| "grad_norm": 1.5001207756046795, | |
| "learning_rate": 3.5257944235912133e-06, | |
| "loss": 0.7564, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.5033783783783784, | |
| "grad_norm": 1.3616930930034892, | |
| "learning_rate": 3.522237797466377e-06, | |
| "loss": 0.7024, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.5050675675675675, | |
| "grad_norm": 1.324985823959547, | |
| "learning_rate": 3.5186696916577665e-06, | |
| "loss": 0.6364, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.5067567567567568, | |
| "grad_norm": 1.3179657688973248, | |
| "learning_rate": 3.5150901330736132e-06, | |
| "loss": 0.6321, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5084459459459459, | |
| "grad_norm": 1.2671693135409419, | |
| "learning_rate": 3.5114991487085164e-06, | |
| "loss": 0.6639, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.5101351351351351, | |
| "grad_norm": 1.3146371625412312, | |
| "learning_rate": 3.5078967656432427e-06, | |
| "loss": 0.6528, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.5118243243243243, | |
| "grad_norm": 1.4881216764035425, | |
| "learning_rate": 3.5042830110445183e-06, | |
| "loss": 0.7018, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.5135135135135135, | |
| "grad_norm": 1.4149915220983738, | |
| "learning_rate": 3.5006579121648267e-06, | |
| "loss": 0.6319, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.5152027027027027, | |
| "grad_norm": 1.2678817742325494, | |
| "learning_rate": 3.497021496342202e-06, | |
| "loss": 0.6819, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.5168918918918919, | |
| "grad_norm": 1.4242737673502084, | |
| "learning_rate": 3.4933737910000226e-06, | |
| "loss": 0.6619, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.518581081081081, | |
| "grad_norm": 1.3987855120700334, | |
| "learning_rate": 3.489714823646806e-06, | |
| "loss": 0.6767, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.5202702702702703, | |
| "grad_norm": 1.3015873879741848, | |
| "learning_rate": 3.4860446218759982e-06, | |
| "loss": 0.6568, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.5219594594594594, | |
| "grad_norm": 1.2995415158579773, | |
| "learning_rate": 3.4823632133657698e-06, | |
| "loss": 0.6928, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.5236486486486487, | |
| "grad_norm": 1.3973146351155765, | |
| "learning_rate": 3.478670625878803e-06, | |
| "loss": 0.7464, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5253378378378378, | |
| "grad_norm": 1.3419676372680656, | |
| "learning_rate": 3.474966887262085e-06, | |
| "loss": 0.627, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.527027027027027, | |
| "grad_norm": 1.2957464216055319, | |
| "learning_rate": 3.4712520254466985e-06, | |
| "loss": 0.6509, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.5287162162162162, | |
| "grad_norm": 1.4041048903525055, | |
| "learning_rate": 3.4675260684476077e-06, | |
| "loss": 0.7146, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.5304054054054054, | |
| "grad_norm": 1.3505461190296533, | |
| "learning_rate": 3.4637890443634507e-06, | |
| "loss": 0.6383, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.5320945945945946, | |
| "grad_norm": 1.5396055781468738, | |
| "learning_rate": 3.460040981376325e-06, | |
| "loss": 0.6604, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.5337837837837838, | |
| "grad_norm": 1.448008568089465, | |
| "learning_rate": 3.4562819077515765e-06, | |
| "loss": 0.7216, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.535472972972973, | |
| "grad_norm": 1.286441416505065, | |
| "learning_rate": 3.4525118518375863e-06, | |
| "loss": 0.6402, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.5371621621621622, | |
| "grad_norm": 1.5964613265568737, | |
| "learning_rate": 3.4487308420655557e-06, | |
| "loss": 0.7228, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.5388513513513513, | |
| "grad_norm": 1.4345479080610504, | |
| "learning_rate": 3.444938906949293e-06, | |
| "loss": 0.7091, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.5405405405405406, | |
| "grad_norm": 1.3311234454544891, | |
| "learning_rate": 3.4411360750849973e-06, | |
| "loss": 0.7218, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5422297297297297, | |
| "grad_norm": 1.5087568673234424, | |
| "learning_rate": 3.437322375151045e-06, | |
| "loss": 0.7483, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.543918918918919, | |
| "grad_norm": 1.4737933458588186, | |
| "learning_rate": 3.433497835907771e-06, | |
| "loss": 0.7068, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.5456081081081081, | |
| "grad_norm": 1.397907808243913, | |
| "learning_rate": 3.4296624861972524e-06, | |
| "loss": 0.6457, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.5472972972972973, | |
| "grad_norm": 1.4048567528248315, | |
| "learning_rate": 3.425816354943094e-06, | |
| "loss": 0.6434, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.5489864864864865, | |
| "grad_norm": 1.4428709908043869, | |
| "learning_rate": 3.421959471150203e-06, | |
| "loss": 0.6821, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5506756756756757, | |
| "grad_norm": 1.446561332479648, | |
| "learning_rate": 3.418091863904582e-06, | |
| "loss": 0.6819, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.5523648648648649, | |
| "grad_norm": 1.4357382541799415, | |
| "learning_rate": 3.414213562373095e-06, | |
| "loss": 0.6778, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.5540540540540541, | |
| "grad_norm": 1.325570913409981, | |
| "learning_rate": 3.41032459580326e-06, | |
| "loss": 0.7177, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.5557432432432432, | |
| "grad_norm": 1.4430530715112349, | |
| "learning_rate": 3.4064249935230217e-06, | |
| "loss": 0.6964, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.5574324324324325, | |
| "grad_norm": 1.4498604144957874, | |
| "learning_rate": 3.4025147849405334e-06, | |
| "loss": 0.7229, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5591216216216216, | |
| "grad_norm": 1.4203134795507701, | |
| "learning_rate": 3.3985939995439314e-06, | |
| "loss": 0.718, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.5608108108108109, | |
| "grad_norm": 1.3911765358785673, | |
| "learning_rate": 3.3946626669011175e-06, | |
| "loss": 0.5948, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.5625, | |
| "grad_norm": 1.3184457107748184, | |
| "learning_rate": 3.3907208166595326e-06, | |
| "loss": 0.7103, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.5641891891891891, | |
| "grad_norm": 1.301270165189858, | |
| "learning_rate": 3.3867684785459353e-06, | |
| "loss": 0.6371, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.5658783783783784, | |
| "grad_norm": 1.454573205764328, | |
| "learning_rate": 3.3828056823661754e-06, | |
| "loss": 0.7071, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.5675675675675675, | |
| "grad_norm": 1.264644666852447, | |
| "learning_rate": 3.378832458004969e-06, | |
| "loss": 0.6851, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.5692567567567568, | |
| "grad_norm": 1.459099029368749, | |
| "learning_rate": 3.3748488354256786e-06, | |
| "loss": 0.6663, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.5709459459459459, | |
| "grad_norm": 1.5212632276053133, | |
| "learning_rate": 3.370854844670079e-06, | |
| "loss": 0.7275, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.5726351351351351, | |
| "grad_norm": 1.2987587957080753, | |
| "learning_rate": 3.3668505158581376e-06, | |
| "loss": 0.6521, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.5743243243243243, | |
| "grad_norm": 1.4304350175753886, | |
| "learning_rate": 3.3628358791877826e-06, | |
| "loss": 0.6774, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5760135135135135, | |
| "grad_norm": 1.3183042000590088, | |
| "learning_rate": 3.358810964934676e-06, | |
| "loss": 0.6695, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.5777027027027027, | |
| "grad_norm": 1.3865225075934426, | |
| "learning_rate": 3.3547758034519904e-06, | |
| "loss": 0.6528, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.5793918918918919, | |
| "grad_norm": 1.312113624994429, | |
| "learning_rate": 3.3507304251701724e-06, | |
| "loss": 0.6622, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.581081081081081, | |
| "grad_norm": 1.9551062964719512, | |
| "learning_rate": 3.3466748605967173e-06, | |
| "loss": 0.7228, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.5827702702702703, | |
| "grad_norm": 1.488075428878002, | |
| "learning_rate": 3.3426091403159404e-06, | |
| "loss": 0.7062, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.5844594594594594, | |
| "grad_norm": 1.2927191218758096, | |
| "learning_rate": 3.3385332949887426e-06, | |
| "loss": 0.6142, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.5861486486486487, | |
| "grad_norm": 1.418534591843739, | |
| "learning_rate": 3.334447355352381e-06, | |
| "loss": 0.6474, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.5878378378378378, | |
| "grad_norm": 1.3562653998164598, | |
| "learning_rate": 3.3303513522202396e-06, | |
| "loss": 0.7144, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.589527027027027, | |
| "grad_norm": 1.4149402373315325, | |
| "learning_rate": 3.3262453164815904e-06, | |
| "loss": 0.629, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.5912162162162162, | |
| "grad_norm": 1.4220368804725194, | |
| "learning_rate": 3.322129279101368e-06, | |
| "loss": 0.6083, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5929054054054054, | |
| "grad_norm": 1.4608723242530153, | |
| "learning_rate": 3.3180032711199305e-06, | |
| "loss": 0.7124, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.5945945945945946, | |
| "grad_norm": 1.3227326094937233, | |
| "learning_rate": 3.3138673236528285e-06, | |
| "loss": 0.7078, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.5962837837837838, | |
| "grad_norm": 1.379499039240384, | |
| "learning_rate": 3.3097214678905703e-06, | |
| "loss": 0.6387, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.597972972972973, | |
| "grad_norm": 1.2469548286035632, | |
| "learning_rate": 3.305565735098383e-06, | |
| "loss": 0.7052, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.5996621621621622, | |
| "grad_norm": 1.2848214472322932, | |
| "learning_rate": 3.3014001566159823e-06, | |
| "loss": 0.6886, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.6013513513513513, | |
| "grad_norm": 1.5153345614568845, | |
| "learning_rate": 3.2972247638573326e-06, | |
| "loss": 0.6655, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.6030405405405406, | |
| "grad_norm": 1.325640865438285, | |
| "learning_rate": 3.2930395883104106e-06, | |
| "loss": 0.7274, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.6047297297297297, | |
| "grad_norm": 1.3993390256389666, | |
| "learning_rate": 3.2888446615369684e-06, | |
| "loss": 0.6558, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.606418918918919, | |
| "grad_norm": 1.4018696537559865, | |
| "learning_rate": 3.284640015172294e-06, | |
| "loss": 0.71, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.6081081081081081, | |
| "grad_norm": 1.415708082259971, | |
| "learning_rate": 3.280425680924976e-06, | |
| "loss": 0.7008, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6097972972972973, | |
| "grad_norm": 1.3763409840970744, | |
| "learning_rate": 3.2762016905766614e-06, | |
| "loss": 0.7022, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.6114864864864865, | |
| "grad_norm": 1.325496237946089, | |
| "learning_rate": 3.271968075981817e-06, | |
| "loss": 0.6589, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.6131756756756757, | |
| "grad_norm": 1.2826988616253867, | |
| "learning_rate": 3.2677248690674903e-06, | |
| "loss": 0.6855, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.6148648648648649, | |
| "grad_norm": 1.2591029878970108, | |
| "learning_rate": 3.2634721018330638e-06, | |
| "loss": 0.6219, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.6165540540540541, | |
| "grad_norm": 1.3387803144672588, | |
| "learning_rate": 3.2592098063500222e-06, | |
| "loss": 0.7039, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.6182432432432432, | |
| "grad_norm": 1.309308982930036, | |
| "learning_rate": 3.2549380147617037e-06, | |
| "loss": 0.6709, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.6199324324324325, | |
| "grad_norm": 1.3431163959736079, | |
| "learning_rate": 3.2506567592830585e-06, | |
| "loss": 0.6324, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.6216216216216216, | |
| "grad_norm": 1.4979970516420926, | |
| "learning_rate": 3.246366072200409e-06, | |
| "loss": 0.6709, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.6233108108108109, | |
| "grad_norm": 1.306386734821211, | |
| "learning_rate": 3.2420659858712035e-06, | |
| "loss": 0.6706, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 1.8009179522234993, | |
| "learning_rate": 3.2377565327237727e-06, | |
| "loss": 0.7755, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6266891891891891, | |
| "grad_norm": 1.495289761525164, | |
| "learning_rate": 3.2334377452570866e-06, | |
| "loss": 0.6674, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.6283783783783784, | |
| "grad_norm": 1.348145783274342, | |
| "learning_rate": 3.2291096560405055e-06, | |
| "loss": 0.6731, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.6300675675675675, | |
| "grad_norm": 1.3341921892745692, | |
| "learning_rate": 3.2247722977135416e-06, | |
| "loss": 0.6764, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.6317567567567568, | |
| "grad_norm": 1.2841278378116288, | |
| "learning_rate": 3.2204257029856054e-06, | |
| "loss": 0.632, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.6334459459459459, | |
| "grad_norm": 1.720730722690385, | |
| "learning_rate": 3.216069904635762e-06, | |
| "loss": 0.7334, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.6351351351351351, | |
| "grad_norm": 1.3542485100514752, | |
| "learning_rate": 3.2117049355124853e-06, | |
| "loss": 0.7303, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.6368243243243243, | |
| "grad_norm": 1.2936690883966144, | |
| "learning_rate": 3.207330828533408e-06, | |
| "loss": 0.643, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.6385135135135135, | |
| "grad_norm": 1.3864749698122996, | |
| "learning_rate": 3.2029476166850754e-06, | |
| "loss": 0.705, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.6402027027027027, | |
| "grad_norm": 1.3256233676801643, | |
| "learning_rate": 3.1985553330226935e-06, | |
| "loss": 0.6869, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.6418918918918919, | |
| "grad_norm": 1.2944610218294335, | |
| "learning_rate": 3.1941540106698846e-06, | |
| "loss": 0.6514, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.643581081081081, | |
| "grad_norm": 1.501294959024127, | |
| "learning_rate": 3.189743682818432e-06, | |
| "loss": 0.6621, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.6452702702702703, | |
| "grad_norm": 1.3461382821528753, | |
| "learning_rate": 3.1853243827280337e-06, | |
| "loss": 0.6812, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.6469594594594594, | |
| "grad_norm": 1.3019577565523754, | |
| "learning_rate": 3.1808961437260504e-06, | |
| "loss": 0.7056, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.6486486486486487, | |
| "grad_norm": 1.3684415920358612, | |
| "learning_rate": 3.176458999207252e-06, | |
| "loss": 0.6613, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.6503378378378378, | |
| "grad_norm": 1.2594356862234934, | |
| "learning_rate": 3.1720129826335723e-06, | |
| "loss": 0.6383, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.652027027027027, | |
| "grad_norm": 1.3224364937940278, | |
| "learning_rate": 3.167558127533847e-06, | |
| "loss": 0.662, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.6537162162162162, | |
| "grad_norm": 1.361590839149971, | |
| "learning_rate": 3.163094467503568e-06, | |
| "loss": 0.6702, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.6554054054054054, | |
| "grad_norm": 1.3971962151286201, | |
| "learning_rate": 3.1586220362046296e-06, | |
| "loss": 0.6106, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.6570945945945946, | |
| "grad_norm": 1.3370080857838247, | |
| "learning_rate": 3.15414086736507e-06, | |
| "loss": 0.6868, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.6587837837837838, | |
| "grad_norm": 1.2272857766272367, | |
| "learning_rate": 3.1496509947788235e-06, | |
| "loss": 0.6709, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.660472972972973, | |
| "grad_norm": 1.3086690016897455, | |
| "learning_rate": 3.145152452305458e-06, | |
| "loss": 0.6632, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.6621621621621622, | |
| "grad_norm": 1.3851181280606764, | |
| "learning_rate": 3.140645273869928e-06, | |
| "loss": 0.6983, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.6638513513513513, | |
| "grad_norm": 1.5878523469615937, | |
| "learning_rate": 3.136129493462311e-06, | |
| "loss": 0.6712, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.6655405405405406, | |
| "grad_norm": 1.371150403456645, | |
| "learning_rate": 3.1316051451375583e-06, | |
| "loss": 0.684, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.6672297297297297, | |
| "grad_norm": 1.3555611551082325, | |
| "learning_rate": 3.127072263015231e-06, | |
| "loss": 0.7178, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.668918918918919, | |
| "grad_norm": 1.325289229316507, | |
| "learning_rate": 3.122530881279248e-06, | |
| "loss": 0.6923, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.6706081081081081, | |
| "grad_norm": 1.5525432364230671, | |
| "learning_rate": 3.1179810341776267e-06, | |
| "loss": 0.674, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.6722972972972973, | |
| "grad_norm": 1.3171395938666564, | |
| "learning_rate": 3.113422756022225e-06, | |
| "loss": 0.642, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.6739864864864865, | |
| "grad_norm": 1.3270242075423406, | |
| "learning_rate": 3.108856081188481e-06, | |
| "loss": 0.6707, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.6756756756756757, | |
| "grad_norm": 1.3484108066463232, | |
| "learning_rate": 3.1042810441151553e-06, | |
| "loss": 0.6379, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6773648648648649, | |
| "grad_norm": 1.3309851729704076, | |
| "learning_rate": 3.0996976793040695e-06, | |
| "loss": 0.6682, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.6790540540540541, | |
| "grad_norm": 1.2652506659655327, | |
| "learning_rate": 3.095106021319851e-06, | |
| "loss": 0.6847, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.6807432432432432, | |
| "grad_norm": 1.3081964276113105, | |
| "learning_rate": 3.0905061047896643e-06, | |
| "loss": 0.6441, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.6824324324324325, | |
| "grad_norm": 1.35647381776027, | |
| "learning_rate": 3.0858979644029575e-06, | |
| "loss": 0.7234, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.6841216216216216, | |
| "grad_norm": 1.372565244390501, | |
| "learning_rate": 3.0812816349111954e-06, | |
| "loss": 0.6534, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.6858108108108109, | |
| "grad_norm": 1.33328522768366, | |
| "learning_rate": 3.0766571511276002e-06, | |
| "loss": 0.6732, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.6875, | |
| "grad_norm": 1.320063333364407, | |
| "learning_rate": 3.0720245479268884e-06, | |
| "loss": 0.6463, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.6891891891891891, | |
| "grad_norm": 1.281578342443152, | |
| "learning_rate": 3.0673838602450085e-06, | |
| "loss": 0.6341, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.6908783783783784, | |
| "grad_norm": 1.81611714342352, | |
| "learning_rate": 3.0627351230788744e-06, | |
| "loss": 0.6715, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.6925675675675675, | |
| "grad_norm": 1.4517692849833734, | |
| "learning_rate": 3.0580783714861054e-06, | |
| "loss": 0.6293, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6942567567567568, | |
| "grad_norm": 1.2955581967933887, | |
| "learning_rate": 3.05341364058476e-06, | |
| "loss": 0.6706, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.6959459459459459, | |
| "grad_norm": 1.2989161044167714, | |
| "learning_rate": 3.0487409655530706e-06, | |
| "loss": 0.6485, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.6976351351351351, | |
| "grad_norm": 1.3016968351593294, | |
| "learning_rate": 3.0440603816291807e-06, | |
| "loss": 0.6685, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.6993243243243243, | |
| "grad_norm": 1.3145081529605454, | |
| "learning_rate": 3.0393719241108735e-06, | |
| "loss": 0.6469, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.7010135135135135, | |
| "grad_norm": 1.3639479980615052, | |
| "learning_rate": 3.0346756283553134e-06, | |
| "loss": 0.653, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.7027027027027027, | |
| "grad_norm": 1.318013354050016, | |
| "learning_rate": 3.0299715297787737e-06, | |
| "loss": 0.7239, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.7043918918918919, | |
| "grad_norm": 1.341591059459244, | |
| "learning_rate": 3.025259663856371e-06, | |
| "loss": 0.6943, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.706081081081081, | |
| "grad_norm": 1.3721630646268186, | |
| "learning_rate": 3.0205400661217995e-06, | |
| "loss": 0.6693, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.7077702702702703, | |
| "grad_norm": 1.3952679062107705, | |
| "learning_rate": 3.0158127721670584e-06, | |
| "loss": 0.7067, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.7094594594594594, | |
| "grad_norm": 1.262343902848384, | |
| "learning_rate": 3.0110778176421913e-06, | |
| "loss": 0.6788, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7111486486486487, | |
| "grad_norm": 1.3420851582874196, | |
| "learning_rate": 3.0063352382550074e-06, | |
| "loss": 0.6689, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.7128378378378378, | |
| "grad_norm": 1.3791855105828084, | |
| "learning_rate": 3.0015850697708217e-06, | |
| "loss": 0.6805, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.714527027027027, | |
| "grad_norm": 1.5875407253674527, | |
| "learning_rate": 2.996827348012178e-06, | |
| "loss": 0.687, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.7162162162162162, | |
| "grad_norm": 1.3650745130884876, | |
| "learning_rate": 2.992062108858584e-06, | |
| "loss": 0.6995, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.7179054054054054, | |
| "grad_norm": 1.4782912422272074, | |
| "learning_rate": 2.987289388246237e-06, | |
| "loss": 0.7213, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.7195945945945946, | |
| "grad_norm": 1.3549493700401416, | |
| "learning_rate": 2.9825092221677545e-06, | |
| "loss": 0.6851, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.7212837837837838, | |
| "grad_norm": 1.3065450320634688, | |
| "learning_rate": 2.9777216466719036e-06, | |
| "loss": 0.6733, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.722972972972973, | |
| "grad_norm": 1.3874606374901184, | |
| "learning_rate": 2.972926697863328e-06, | |
| "loss": 0.6551, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.7246621621621622, | |
| "grad_norm": 1.2251403325861798, | |
| "learning_rate": 2.968124411902275e-06, | |
| "loss": 0.6603, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.7263513513513513, | |
| "grad_norm": 1.4125671966195497, | |
| "learning_rate": 2.9633148250043236e-06, | |
| "loss": 0.6989, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7280405405405406, | |
| "grad_norm": 1.3582792827647256, | |
| "learning_rate": 2.9584979734401135e-06, | |
| "loss": 0.7313, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.7297297297297297, | |
| "grad_norm": 1.4040248422818056, | |
| "learning_rate": 2.953673893535067e-06, | |
| "loss": 0.6655, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.731418918918919, | |
| "grad_norm": 1.3542440874977937, | |
| "learning_rate": 2.9488426216691204e-06, | |
| "loss": 0.7208, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.7331081081081081, | |
| "grad_norm": 1.6421535799103113, | |
| "learning_rate": 2.9440041942764443e-06, | |
| "loss": 0.6847, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.7347972972972973, | |
| "grad_norm": 1.3015301627339435, | |
| "learning_rate": 2.9391586478451726e-06, | |
| "loss": 0.6793, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.7364864864864865, | |
| "grad_norm": 1.3756341557406138, | |
| "learning_rate": 2.934306018917126e-06, | |
| "loss": 0.6719, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.7381756756756757, | |
| "grad_norm": 1.3684325540715159, | |
| "learning_rate": 2.929446344087537e-06, | |
| "loss": 0.6926, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.7398648648648649, | |
| "grad_norm": 1.348921387693639, | |
| "learning_rate": 2.924579660004773e-06, | |
| "loss": 0.6538, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.7415540540540541, | |
| "grad_norm": 1.4451251249573025, | |
| "learning_rate": 2.9197060033700603e-06, | |
| "loss": 0.6952, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.7432432432432432, | |
| "grad_norm": 1.435659952122345, | |
| "learning_rate": 2.914825410937208e-06, | |
| "loss": 0.7609, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7449324324324325, | |
| "grad_norm": 1.463004294888883, | |
| "learning_rate": 2.90993791951233e-06, | |
| "loss": 0.6777, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.7466216216216216, | |
| "grad_norm": 1.2911426285111054, | |
| "learning_rate": 2.9050435659535678e-06, | |
| "loss": 0.6805, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.7483108108108109, | |
| "grad_norm": 1.380380264635941, | |
| "learning_rate": 2.900142387170812e-06, | |
| "loss": 0.7359, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.4287814635062361, | |
| "learning_rate": 2.895234420125425e-06, | |
| "loss": 0.6564, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.7516891891891891, | |
| "grad_norm": 1.364150218756707, | |
| "learning_rate": 2.8903197018299613e-06, | |
| "loss": 0.6779, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.7533783783783784, | |
| "grad_norm": 1.6302546987919042, | |
| "learning_rate": 2.8853982693478895e-06, | |
| "loss": 0.6686, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.7550675675675675, | |
| "grad_norm": 1.4176851311608176, | |
| "learning_rate": 2.8804701597933108e-06, | |
| "loss": 0.7193, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.7567567567567568, | |
| "grad_norm": 1.3980473477530322, | |
| "learning_rate": 2.8755354103306806e-06, | |
| "loss": 0.6763, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.7584459459459459, | |
| "grad_norm": 1.306040542269856, | |
| "learning_rate": 2.87059405817453e-06, | |
| "loss": 0.6565, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.7601351351351351, | |
| "grad_norm": 1.421190776938713, | |
| "learning_rate": 2.8656461405891794e-06, | |
| "loss": 0.6544, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7618243243243243, | |
| "grad_norm": 1.3558925930101302, | |
| "learning_rate": 2.8606916948884644e-06, | |
| "loss": 0.6992, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.7635135135135135, | |
| "grad_norm": 1.3597956638790247, | |
| "learning_rate": 2.85573075843545e-06, | |
| "loss": 0.7236, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.7652027027027027, | |
| "grad_norm": 1.2642452651402398, | |
| "learning_rate": 2.8507633686421496e-06, | |
| "loss": 0.665, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.7668918918918919, | |
| "grad_norm": 1.318361028472153, | |
| "learning_rate": 2.845789562969245e-06, | |
| "loss": 0.701, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.768581081081081, | |
| "grad_norm": 1.3360445051985224, | |
| "learning_rate": 2.8408093789258e-06, | |
| "loss": 0.6707, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.7702702702702703, | |
| "grad_norm": 1.2691241582564996, | |
| "learning_rate": 2.8358228540689812e-06, | |
| "loss": 0.6077, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.7719594594594594, | |
| "grad_norm": 1.323540568143679, | |
| "learning_rate": 2.830830026003773e-06, | |
| "loss": 0.7048, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.7736486486486487, | |
| "grad_norm": 1.2731559364058054, | |
| "learning_rate": 2.825830932382694e-06, | |
| "loss": 0.6104, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.7753378378378378, | |
| "grad_norm": 1.3118198520290825, | |
| "learning_rate": 2.820825610905514e-06, | |
| "loss": 0.6426, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.777027027027027, | |
| "grad_norm": 1.371723875417946, | |
| "learning_rate": 2.815814099318968e-06, | |
| "loss": 0.7205, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7787162162162162, | |
| "grad_norm": 1.2988247931459596, | |
| "learning_rate": 2.810796435416473e-06, | |
| "loss": 0.6985, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.7804054054054054, | |
| "grad_norm": 1.3027800359441715, | |
| "learning_rate": 2.8057726570378447e-06, | |
| "loss": 0.7064, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.7820945945945946, | |
| "grad_norm": 1.299894099802323, | |
| "learning_rate": 2.800742802069006e-06, | |
| "loss": 0.628, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.7837837837837838, | |
| "grad_norm": 1.4642656243160261, | |
| "learning_rate": 2.7957069084417093e-06, | |
| "loss": 0.6943, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.785472972972973, | |
| "grad_norm": 1.3308107990437001, | |
| "learning_rate": 2.7906650141332427e-06, | |
| "loss": 0.7128, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.7871621621621622, | |
| "grad_norm": 1.3874905549157548, | |
| "learning_rate": 2.7856171571661514e-06, | |
| "loss": 0.6766, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.7888513513513513, | |
| "grad_norm": 1.3259468751863204, | |
| "learning_rate": 2.7805633756079426e-06, | |
| "loss": 0.6743, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.7905405405405406, | |
| "grad_norm": 1.3215238136524157, | |
| "learning_rate": 2.775503707570808e-06, | |
| "loss": 0.6575, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.7922297297297297, | |
| "grad_norm": 1.2709398810755754, | |
| "learning_rate": 2.7704381912113245e-06, | |
| "loss": 0.6739, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.793918918918919, | |
| "grad_norm": 1.3037615494242192, | |
| "learning_rate": 2.7653668647301796e-06, | |
| "loss": 0.7025, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7956081081081081, | |
| "grad_norm": 1.2917843369183553, | |
| "learning_rate": 2.7602897663718725e-06, | |
| "loss": 0.6327, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.7972972972972973, | |
| "grad_norm": 1.309216686164246, | |
| "learning_rate": 2.755206934424431e-06, | |
| "loss": 0.6277, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.7989864864864865, | |
| "grad_norm": 1.371910975646893, | |
| "learning_rate": 2.7501184072191237e-06, | |
| "loss": 0.6407, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.8006756756756757, | |
| "grad_norm": 1.3727580581951706, | |
| "learning_rate": 2.7450242231301655e-06, | |
| "loss": 0.6536, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.8023648648648649, | |
| "grad_norm": 1.4020595724687033, | |
| "learning_rate": 2.7399244205744347e-06, | |
| "loss": 0.6807, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.8040540540540541, | |
| "grad_norm": 1.4148426490517583, | |
| "learning_rate": 2.734819038011179e-06, | |
| "loss": 0.7045, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.8057432432432432, | |
| "grad_norm": 1.3409500564912207, | |
| "learning_rate": 2.729708113941727e-06, | |
| "loss": 0.6991, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.8074324324324325, | |
| "grad_norm": 1.3823977849949307, | |
| "learning_rate": 2.724591686909196e-06, | |
| "loss": 0.6594, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.8091216216216216, | |
| "grad_norm": 1.2394815303889972, | |
| "learning_rate": 2.719469795498206e-06, | |
| "loss": 0.6467, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.8108108108108109, | |
| "grad_norm": 1.281104807999005, | |
| "learning_rate": 2.714342478334583e-06, | |
| "loss": 0.6547, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8125, | |
| "grad_norm": 1.3287359372486716, | |
| "learning_rate": 2.709209774085071e-06, | |
| "loss": 0.6273, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.8141891891891891, | |
| "grad_norm": 1.3379278170149733, | |
| "learning_rate": 2.7040717214570415e-06, | |
| "loss": 0.646, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.8158783783783784, | |
| "grad_norm": 1.2926999526730814, | |
| "learning_rate": 2.698928359198197e-06, | |
| "loss": 0.6677, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.8175675675675675, | |
| "grad_norm": 1.3127200141418005, | |
| "learning_rate": 2.693779726096283e-06, | |
| "loss": 0.6618, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.8192567567567568, | |
| "grad_norm": 1.3197962107489793, | |
| "learning_rate": 2.6886258609787946e-06, | |
| "loss": 0.6282, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.8209459459459459, | |
| "grad_norm": 1.2848795428712034, | |
| "learning_rate": 2.683466802712683e-06, | |
| "loss": 0.6269, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.8226351351351351, | |
| "grad_norm": 1.3093990553658448, | |
| "learning_rate": 2.678302590204062e-06, | |
| "loss": 0.6661, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.8243243243243243, | |
| "grad_norm": 1.3439659038952452, | |
| "learning_rate": 2.6731332623979154e-06, | |
| "loss": 0.6191, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.8260135135135135, | |
| "grad_norm": 1.3151017233480091, | |
| "learning_rate": 2.6679588582778024e-06, | |
| "loss": 0.692, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.8277027027027027, | |
| "grad_norm": 1.367520065635975, | |
| "learning_rate": 2.662779416865567e-06, | |
| "loss": 0.6235, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8293918918918919, | |
| "grad_norm": 1.2805691498538, | |
| "learning_rate": 2.6575949772210376e-06, | |
| "loss": 0.6885, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.831081081081081, | |
| "grad_norm": 1.3093657104644763, | |
| "learning_rate": 2.6524055784417386e-06, | |
| "loss": 0.6832, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.8327702702702703, | |
| "grad_norm": 1.2719434085445112, | |
| "learning_rate": 2.6472112596625912e-06, | |
| "loss": 0.6358, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.8344594594594594, | |
| "grad_norm": 1.3289440001002792, | |
| "learning_rate": 2.642012060055619e-06, | |
| "loss": 0.6951, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.8361486486486487, | |
| "grad_norm": 1.4234708370546796, | |
| "learning_rate": 2.6368080188296577e-06, | |
| "loss": 0.6366, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.8378378378378378, | |
| "grad_norm": 1.3379252399438568, | |
| "learning_rate": 2.63159917523005e-06, | |
| "loss": 0.6828, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.839527027027027, | |
| "grad_norm": 1.2971666902550303, | |
| "learning_rate": 2.626385568538358e-06, | |
| "loss": 0.6548, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.8412162162162162, | |
| "grad_norm": 1.319788315724304, | |
| "learning_rate": 2.6211672380720625e-06, | |
| "loss": 0.675, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.8429054054054054, | |
| "grad_norm": 1.3693649054808243, | |
| "learning_rate": 2.6159442231842693e-06, | |
| "loss": 0.6685, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.8445945945945946, | |
| "grad_norm": 1.2826721891793782, | |
| "learning_rate": 2.6107165632634098e-06, | |
| "loss": 0.6231, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8462837837837838, | |
| "grad_norm": 1.3927785373092278, | |
| "learning_rate": 2.605484297732944e-06, | |
| "loss": 0.696, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.847972972972973, | |
| "grad_norm": 1.2238179436862426, | |
| "learning_rate": 2.6002474660510665e-06, | |
| "loss": 0.599, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.8496621621621622, | |
| "grad_norm": 1.3395786217070758, | |
| "learning_rate": 2.595006107710406e-06, | |
| "loss": 0.6785, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.8513513513513513, | |
| "grad_norm": 1.3197500690461326, | |
| "learning_rate": 2.5897602622377272e-06, | |
| "loss": 0.6828, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.8530405405405406, | |
| "grad_norm": 1.2526702269554713, | |
| "learning_rate": 2.5845099691936343e-06, | |
| "loss": 0.6678, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.8547297297297297, | |
| "grad_norm": 1.284205313573379, | |
| "learning_rate": 2.579255268172273e-06, | |
| "loss": 0.6653, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.856418918918919, | |
| "grad_norm": 1.2411561149505208, | |
| "learning_rate": 2.573996198801029e-06, | |
| "loss": 0.6274, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.8581081081081081, | |
| "grad_norm": 1.2770163226003766, | |
| "learning_rate": 2.568732800740233e-06, | |
| "loss": 0.6527, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.8597972972972973, | |
| "grad_norm": 1.3037995073619448, | |
| "learning_rate": 2.5634651136828594e-06, | |
| "loss": 0.6832, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.8614864864864865, | |
| "grad_norm": 1.3305574812358596, | |
| "learning_rate": 2.5581931773542263e-06, | |
| "loss": 0.6716, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8631756756756757, | |
| "grad_norm": 1.2699825016633381, | |
| "learning_rate": 2.552917031511697e-06, | |
| "loss": 0.6807, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.8648648648648649, | |
| "grad_norm": 1.384721350284805, | |
| "learning_rate": 2.547636715944382e-06, | |
| "loss": 0.7128, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.8665540540540541, | |
| "grad_norm": 1.2962160985630566, | |
| "learning_rate": 2.542352270472834e-06, | |
| "loss": 0.7097, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.8682432432432432, | |
| "grad_norm": 1.3315884714121142, | |
| "learning_rate": 2.5370637349487537e-06, | |
| "loss": 0.7239, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.8699324324324325, | |
| "grad_norm": 1.29687671033792, | |
| "learning_rate": 2.5317711492546836e-06, | |
| "loss": 0.6688, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.8716216216216216, | |
| "grad_norm": 1.2711205250804978, | |
| "learning_rate": 2.5264745533037123e-06, | |
| "loss": 0.6742, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.8733108108108109, | |
| "grad_norm": 1.2996812621542553, | |
| "learning_rate": 2.521173987039169e-06, | |
| "loss": 0.6811, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 1.6950421844669081, | |
| "learning_rate": 2.5158694904343246e-06, | |
| "loss": 0.6618, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.8766891891891891, | |
| "grad_norm": 1.3193723681629466, | |
| "learning_rate": 2.510561103492091e-06, | |
| "loss": 0.6402, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.8783783783783784, | |
| "grad_norm": 1.2695628123088936, | |
| "learning_rate": 2.505248866244718e-06, | |
| "loss": 0.6831, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8800675675675675, | |
| "grad_norm": 1.2787241205061413, | |
| "learning_rate": 2.4999328187534915e-06, | |
| "loss": 0.6802, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.8817567567567568, | |
| "grad_norm": 1.300609736482222, | |
| "learning_rate": 2.4946130011084306e-06, | |
| "loss": 0.6546, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.8834459459459459, | |
| "grad_norm": 1.3468074007186794, | |
| "learning_rate": 2.489289453427989e-06, | |
| "loss": 0.6704, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.8851351351351351, | |
| "grad_norm": 1.3306700582716369, | |
| "learning_rate": 2.483962215858748e-06, | |
| "loss": 0.6587, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.8868243243243243, | |
| "grad_norm": 1.339892354934288, | |
| "learning_rate": 2.4786313285751155e-06, | |
| "loss": 0.6632, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.8885135135135135, | |
| "grad_norm": 1.4002140225966508, | |
| "learning_rate": 2.473296831779023e-06, | |
| "loss": 0.6419, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.8902027027027027, | |
| "grad_norm": 1.288664747488742, | |
| "learning_rate": 2.4679587656996235e-06, | |
| "loss": 0.6371, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.8918918918918919, | |
| "grad_norm": 1.3734784253236045, | |
| "learning_rate": 2.462617170592987e-06, | |
| "loss": 0.6561, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.893581081081081, | |
| "grad_norm": 1.2050784201474516, | |
| "learning_rate": 2.4572720867417945e-06, | |
| "loss": 0.632, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.8952702702702703, | |
| "grad_norm": 1.3132258094685767, | |
| "learning_rate": 2.4519235544550412e-06, | |
| "loss": 0.6538, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.8969594594594594, | |
| "grad_norm": 1.337760432140806, | |
| "learning_rate": 2.4465716140677234e-06, | |
| "loss": 0.7156, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.8986486486486487, | |
| "grad_norm": 1.2666914241716667, | |
| "learning_rate": 2.4412163059405435e-06, | |
| "loss": 0.6577, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.9003378378378378, | |
| "grad_norm": 1.2823601807164624, | |
| "learning_rate": 2.4358576704595965e-06, | |
| "loss": 0.697, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.902027027027027, | |
| "grad_norm": 1.3367451920880444, | |
| "learning_rate": 2.4304957480360744e-06, | |
| "loss": 0.6527, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.9037162162162162, | |
| "grad_norm": 1.3739268155417224, | |
| "learning_rate": 2.425130579105953e-06, | |
| "loss": 0.7149, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.9054054054054054, | |
| "grad_norm": 1.329350014321296, | |
| "learning_rate": 2.419762204129695e-06, | |
| "loss": 0.7081, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.9070945945945946, | |
| "grad_norm": 1.2638498827293603, | |
| "learning_rate": 2.414390663591938e-06, | |
| "loss": 0.6665, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.9087837837837838, | |
| "grad_norm": 1.2762258266716298, | |
| "learning_rate": 2.4090159980011934e-06, | |
| "loss": 0.6079, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.910472972972973, | |
| "grad_norm": 1.3002809021905946, | |
| "learning_rate": 2.4036382478895393e-06, | |
| "loss": 0.655, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.9121621621621622, | |
| "grad_norm": 1.3593466428722907, | |
| "learning_rate": 2.398257453812315e-06, | |
| "loss": 0.6471, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9138513513513513, | |
| "grad_norm": 1.3217770368871993, | |
| "learning_rate": 2.392873656347815e-06, | |
| "loss": 0.6376, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.9155405405405406, | |
| "grad_norm": 1.357348425007889, | |
| "learning_rate": 2.387486896096986e-06, | |
| "loss": 0.7056, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.9172297297297297, | |
| "grad_norm": 1.3138607334781762, | |
| "learning_rate": 2.382097213683114e-06, | |
| "loss": 0.7244, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.918918918918919, | |
| "grad_norm": 1.2943145130114848, | |
| "learning_rate": 2.3767046497515235e-06, | |
| "loss": 0.6455, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.9206081081081081, | |
| "grad_norm": 1.3329448183257002, | |
| "learning_rate": 2.3713092449692705e-06, | |
| "loss": 0.6655, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.9222972972972973, | |
| "grad_norm": 1.3827889113122052, | |
| "learning_rate": 2.365911040024835e-06, | |
| "loss": 0.6085, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.9239864864864865, | |
| "grad_norm": 1.2873550101593443, | |
| "learning_rate": 2.3605100756278114e-06, | |
| "loss": 0.6496, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.9256756756756757, | |
| "grad_norm": 1.4635505647607276, | |
| "learning_rate": 2.355106392508607e-06, | |
| "loss": 0.7255, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.9273648648648649, | |
| "grad_norm": 1.274562634214393, | |
| "learning_rate": 2.349700031418129e-06, | |
| "loss": 0.622, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.9290540540540541, | |
| "grad_norm": 1.2916834992250223, | |
| "learning_rate": 2.344291033127482e-06, | |
| "loss": 0.6483, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9307432432432432, | |
| "grad_norm": 1.2974562634280429, | |
| "learning_rate": 2.338879438427659e-06, | |
| "loss": 0.6368, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.9324324324324325, | |
| "grad_norm": 1.3244349894986713, | |
| "learning_rate": 2.333465288129231e-06, | |
| "loss": 0.6852, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.9341216216216216, | |
| "grad_norm": 1.3151764611776746, | |
| "learning_rate": 2.3280486230620433e-06, | |
| "loss": 0.6805, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.9358108108108109, | |
| "grad_norm": 1.2843621068360367, | |
| "learning_rate": 2.322629484074907e-06, | |
| "loss": 0.6746, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 1.3014490689776925, | |
| "learning_rate": 2.3172079120352865e-06, | |
| "loss": 0.6901, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.9391891891891891, | |
| "grad_norm": 1.6358853991358127, | |
| "learning_rate": 2.3117839478289983e-06, | |
| "loss": 0.6609, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.9408783783783784, | |
| "grad_norm": 1.4871449836737003, | |
| "learning_rate": 2.3063576323598955e-06, | |
| "loss": 0.663, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.9425675675675675, | |
| "grad_norm": 1.2851645298579233, | |
| "learning_rate": 2.3009290065495662e-06, | |
| "loss": 0.6648, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.9442567567567568, | |
| "grad_norm": 1.3989687257878538, | |
| "learning_rate": 2.2954981113370182e-06, | |
| "loss": 0.6431, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.9459459459459459, | |
| "grad_norm": 1.4550664360042096, | |
| "learning_rate": 2.290064987678377e-06, | |
| "loss": 0.6861, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.9476351351351351, | |
| "grad_norm": 1.3725831272335538, | |
| "learning_rate": 2.2846296765465706e-06, | |
| "loss": 0.6985, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.9493243243243243, | |
| "grad_norm": 1.3575443413890222, | |
| "learning_rate": 2.2791922189310244e-06, | |
| "loss": 0.6626, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.9510135135135135, | |
| "grad_norm": 1.256005428809596, | |
| "learning_rate": 2.2737526558373527e-06, | |
| "loss": 0.654, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.9527027027027027, | |
| "grad_norm": 1.4238550908429117, | |
| "learning_rate": 2.268311028287045e-06, | |
| "loss": 0.6715, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.9543918918918919, | |
| "grad_norm": 1.3317043187800373, | |
| "learning_rate": 2.262867377317163e-06, | |
| "loss": 0.65, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.956081081081081, | |
| "grad_norm": 1.3154947523106852, | |
| "learning_rate": 2.257421743980024e-06, | |
| "loss": 0.6567, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.9577702702702703, | |
| "grad_norm": 1.2964749823164652, | |
| "learning_rate": 2.2519741693428976e-06, | |
| "loss": 0.6065, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.9594594594594594, | |
| "grad_norm": 1.2614425014750057, | |
| "learning_rate": 2.246524694487692e-06, | |
| "loss": 0.6928, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.9611486486486487, | |
| "grad_norm": 1.3147013825054574, | |
| "learning_rate": 2.2410733605106456e-06, | |
| "loss": 0.6271, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.9628378378378378, | |
| "grad_norm": 1.2464850380254473, | |
| "learning_rate": 2.235620208522019e-06, | |
| "loss": 0.6187, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.964527027027027, | |
| "grad_norm": 1.3118794490711199, | |
| "learning_rate": 2.2301652796457807e-06, | |
| "loss": 0.6434, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.9662162162162162, | |
| "grad_norm": 1.3652152462118168, | |
| "learning_rate": 2.2247086150192997e-06, | |
| "loss": 0.6581, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.9679054054054054, | |
| "grad_norm": 1.4431845392319815, | |
| "learning_rate": 2.2192502557930343e-06, | |
| "loss": 0.6747, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.9695945945945946, | |
| "grad_norm": 1.333305348582229, | |
| "learning_rate": 2.213790243130226e-06, | |
| "loss": 0.6562, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.9712837837837838, | |
| "grad_norm": 1.3059868201157803, | |
| "learning_rate": 2.20832861820658e-06, | |
| "loss": 0.6073, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.972972972972973, | |
| "grad_norm": 1.255203354957442, | |
| "learning_rate": 2.202865422209963e-06, | |
| "loss": 0.6399, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.9746621621621622, | |
| "grad_norm": 1.2966274532372035, | |
| "learning_rate": 2.197400696340091e-06, | |
| "loss": 0.6904, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.9763513513513513, | |
| "grad_norm": 1.342746601472998, | |
| "learning_rate": 2.1919344818082144e-06, | |
| "loss": 0.7327, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.9780405405405406, | |
| "grad_norm": 1.271167602819476, | |
| "learning_rate": 2.1864668198368116e-06, | |
| "loss": 0.6235, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.9797297297297297, | |
| "grad_norm": 1.2704298842023867, | |
| "learning_rate": 2.1809977516592758e-06, | |
| "loss": 0.6397, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.981418918918919, | |
| "grad_norm": 1.4659602830194913, | |
| "learning_rate": 2.175527318519606e-06, | |
| "loss": 0.6375, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.9831081081081081, | |
| "grad_norm": 1.2837108523299343, | |
| "learning_rate": 2.1700555616720934e-06, | |
| "loss": 0.5946, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.9847972972972973, | |
| "grad_norm": 1.3258335291339656, | |
| "learning_rate": 2.1645825223810135e-06, | |
| "loss": 0.646, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.9864864864864865, | |
| "grad_norm": 1.381249010402763, | |
| "learning_rate": 2.159108241920312e-06, | |
| "loss": 0.6993, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.9881756756756757, | |
| "grad_norm": 1.3321089783532771, | |
| "learning_rate": 2.1536327615732937e-06, | |
| "loss": 0.637, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.9898648648648649, | |
| "grad_norm": 1.3022198415919726, | |
| "learning_rate": 2.148156122632314e-06, | |
| "loss": 0.6623, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.9915540540540541, | |
| "grad_norm": 1.2277665889364764, | |
| "learning_rate": 2.1426783663984645e-06, | |
| "loss": 0.6595, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.9932432432432432, | |
| "grad_norm": 1.3098439041046739, | |
| "learning_rate": 2.1371995341812636e-06, | |
| "loss": 0.6805, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.9949324324324325, | |
| "grad_norm": 1.3300371685472199, | |
| "learning_rate": 2.1317196672983425e-06, | |
| "loss": 0.6783, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.9966216216216216, | |
| "grad_norm": 1.3496135502588562, | |
| "learning_rate": 2.126238807075137e-06, | |
| "loss": 0.6528, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.9983108108108109, | |
| "grad_norm": 1.361261277820641, | |
| "learning_rate": 2.120756994844572e-06, | |
| "loss": 0.6694, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.3306654265805096, | |
| "learning_rate": 2.115274271946754e-06, | |
| "loss": 0.7216, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.5579404234886169, | |
| "eval_runtime": 948.1228, | |
| "eval_samples_per_second": 5.748, | |
| "eval_steps_per_second": 0.36, | |
| "step": 592 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1184, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 296, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.25646117018665e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
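
A minimal sketch of how a log like the one above could be inspected, assuming it is saved as a plain `trainer_state.json` next to a checkpoint and that matplotlib is available (both the path and the plotting choice are assumptions, not part of the original state file). It reads `log_history` and separates the per-step training records (`loss`) from the periodic evaluation records (`eval_loss`).

```python
# Sketch: load a Trainer state log and plot train/eval loss over steps.
# Assumptions: the file path "trainer_state.json" and use of matplotlib
# are illustrative; adjust them to your own run.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # hypothetical location of this log
    state = json.load(f)

train_steps, train_loss = [], []
eval_steps, eval_loss = [], []
for record in state["log_history"]:
    if "loss" in record:        # per-step training entry (logging_steps = 1)
        train_steps.append(record["step"])
        train_loss.append(record["loss"])
    if "eval_loss" in record:   # periodic evaluation entry (eval_steps = 296)
        eval_steps.append(record["step"])
        eval_loss.append(record["eval_loss"])

plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, "o--", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.tight_layout()
plt.savefig("loss_curves.png")
```

With the values logged here, such a plot would show the training loss drifting from roughly 0.85-0.93 at the first steps down to the 0.6-0.7 range by the end of epoch 1, with eval loss at 0.558 at step 592.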