| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.7807417046193884, |
| "eval_steps": 500, |
| "global_step": 1200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006506180871828237, |
| "grad_norm": 3.778571605682373, |
| "learning_rate": 0.0001, |
| "loss": 4.706, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0013012361743656475, |
| "grad_norm": 0.7331739068031311, |
| "learning_rate": 0.0001, |
| "loss": 2.6402, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.001951854261548471, |
| "grad_norm": 0.5679969191551208, |
| "learning_rate": 0.0001, |
| "loss": 2.5315, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.002602472348731295, |
| "grad_norm": 0.6543067693710327, |
| "learning_rate": 0.0001, |
| "loss": 2.5226, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0032530904359141183, |
| "grad_norm": 0.42487671971321106, |
| "learning_rate": 0.0001, |
| "loss": 2.1375, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.003903708523096942, |
| "grad_norm": 0.48795655369758606, |
| "learning_rate": 0.0001, |
| "loss": 2.253, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.004554326610279766, |
| "grad_norm": 0.6054234504699707, |
| "learning_rate": 0.0001, |
| "loss": 2.3411, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.00520494469746259, |
| "grad_norm": 0.3039970397949219, |
| "learning_rate": 0.0001, |
| "loss": 2.1293, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.005855562784645413, |
| "grad_norm": 0.6592361330986023, |
| "learning_rate": 0.0001, |
| "loss": 3.1615, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.006506180871828237, |
| "grad_norm": 0.4017999470233917, |
| "learning_rate": 0.0001, |
| "loss": 2.5068, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0071567989590110605, |
| "grad_norm": 0.31507641077041626, |
| "learning_rate": 0.0001, |
| "loss": 2.1894, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.007807417046193884, |
| "grad_norm": 0.33226895332336426, |
| "learning_rate": 0.0001, |
| "loss": 2.2006, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.008458035133376708, |
| "grad_norm": 0.2632739841938019, |
| "learning_rate": 0.0001, |
| "loss": 2.0998, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.009108653220559532, |
| "grad_norm": 0.2794795036315918, |
| "learning_rate": 0.0001, |
| "loss": 2.113, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.009759271307742356, |
| "grad_norm": 0.29168492555618286, |
| "learning_rate": 0.0001, |
| "loss": 2.354, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.01040988939492518, |
| "grad_norm": 0.2537970244884491, |
| "learning_rate": 0.0001, |
| "loss": 2.2939, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.011060507482108002, |
| "grad_norm": 0.5140053033828735, |
| "learning_rate": 0.0001, |
| "loss": 2.6237, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.011711125569290826, |
| "grad_norm": 0.3093675971031189, |
| "learning_rate": 0.0001, |
| "loss": 2.3502, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.01236174365647365, |
| "grad_norm": 0.29241421818733215, |
| "learning_rate": 0.0001, |
| "loss": 2.5365, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.013012361743656473, |
| "grad_norm": 0.3164322078227997, |
| "learning_rate": 0.0001, |
| "loss": 2.396, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.013662979830839297, |
| "grad_norm": 0.24512743949890137, |
| "learning_rate": 0.0001, |
| "loss": 2.2759, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.014313597918022121, |
| "grad_norm": 0.24328342080116272, |
| "learning_rate": 0.0001, |
| "loss": 2.2103, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.014964216005204945, |
| "grad_norm": 0.2563220262527466, |
| "learning_rate": 0.0001, |
| "loss": 2.4836, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.015614834092387769, |
| "grad_norm": 0.33601588010787964, |
| "learning_rate": 0.0001, |
| "loss": 2.4446, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.01626545217957059, |
| "grad_norm": 0.28699007630348206, |
| "learning_rate": 0.0001, |
| "loss": 2.8504, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.016916070266753416, |
| "grad_norm": 0.3181653618812561, |
| "learning_rate": 0.0001, |
| "loss": 2.3042, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.01756668835393624, |
| "grad_norm": 0.2349390834569931, |
| "learning_rate": 0.0001, |
| "loss": 2.1024, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.018217306441119064, |
| "grad_norm": 0.2751820981502533, |
| "learning_rate": 0.0001, |
| "loss": 2.2646, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.018867924528301886, |
| "grad_norm": 0.25547271966934204, |
| "learning_rate": 0.0001, |
| "loss": 2.1928, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.01951854261548471, |
| "grad_norm": 0.283507764339447, |
| "learning_rate": 0.0001, |
| "loss": 2.3073, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.020169160702667534, |
| "grad_norm": 0.3354213237762451, |
| "learning_rate": 0.0001, |
| "loss": 2.6273, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.02081977878985036, |
| "grad_norm": 0.40484553575515747, |
| "learning_rate": 0.0001, |
| "loss": 2.4919, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.02147039687703318, |
| "grad_norm": 0.34319421648979187, |
| "learning_rate": 0.0001, |
| "loss": 2.8381, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.022121014964216004, |
| "grad_norm": 0.32958984375, |
| "learning_rate": 0.0001, |
| "loss": 2.3062, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.02277163305139883, |
| "grad_norm": 0.4503105878829956, |
| "learning_rate": 0.0001, |
| "loss": 2.4647, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.02342225113858165, |
| "grad_norm": 0.5084238052368164, |
| "learning_rate": 0.0001, |
| "loss": 3.0047, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.024072869225764477, |
| "grad_norm": 0.5192400813102722, |
| "learning_rate": 0.0001, |
| "loss": 2.2899, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.0247234873129473, |
| "grad_norm": 0.4197874665260315, |
| "learning_rate": 0.0001, |
| "loss": 2.4057, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.025374105400130124, |
| "grad_norm": 0.5170285105705261, |
| "learning_rate": 0.0001, |
| "loss": 3.2918, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.026024723487312947, |
| "grad_norm": 0.2491147667169571, |
| "learning_rate": 0.0001, |
| "loss": 2.1957, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.026675341574495772, |
| "grad_norm": 0.6597635746002197, |
| "learning_rate": 0.0001, |
| "loss": 2.7474, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.027325959661678594, |
| "grad_norm": 0.40205034613609314, |
| "learning_rate": 0.0001, |
| "loss": 2.4561, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.02797657774886142, |
| "grad_norm": 0.27388331294059753, |
| "learning_rate": 0.0001, |
| "loss": 2.0477, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.028627195836044242, |
| "grad_norm": 0.9163908958435059, |
| "learning_rate": 0.0001, |
| "loss": 3.334, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.029277813923227064, |
| "grad_norm": 0.2747696042060852, |
| "learning_rate": 0.0001, |
| "loss": 2.1604, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.02992843201040989, |
| "grad_norm": 0.36308085918426514, |
| "learning_rate": 0.0001, |
| "loss": 2.693, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.03057905009759271, |
| "grad_norm": 0.6159886121749878, |
| "learning_rate": 0.0001, |
| "loss": 2.5515, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.031229668184775537, |
| "grad_norm": 0.4801373779773712, |
| "learning_rate": 0.0001, |
| "loss": 2.809, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.03188028627195836, |
| "grad_norm": 0.32580915093421936, |
| "learning_rate": 0.0001, |
| "loss": 2.5236, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.03253090435914118, |
| "grad_norm": 0.3028671443462372, |
| "learning_rate": 0.0001, |
| "loss": 2.2685, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03318152244632401, |
| "grad_norm": 0.5660931468009949, |
| "learning_rate": 0.0001, |
| "loss": 2.2564, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.03383214053350683, |
| "grad_norm": 0.24634602665901184, |
| "learning_rate": 0.0001, |
| "loss": 2.1355, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.034482758620689655, |
| "grad_norm": 0.24830913543701172, |
| "learning_rate": 0.0001, |
| "loss": 2.0425, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.03513337670787248, |
| "grad_norm": 0.23614570498466492, |
| "learning_rate": 0.0001, |
| "loss": 2.1975, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.035783994795055306, |
| "grad_norm": 0.2624325156211853, |
| "learning_rate": 0.0001, |
| "loss": 2.3071, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.03643461288223813, |
| "grad_norm": 0.3967755436897278, |
| "learning_rate": 0.0001, |
| "loss": 2.6088, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.03708523096942095, |
| "grad_norm": 0.22147373855113983, |
| "learning_rate": 0.0001, |
| "loss": 2.003, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.03773584905660377, |
| "grad_norm": 0.47795867919921875, |
| "learning_rate": 0.0001, |
| "loss": 2.1473, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.038386467143786594, |
| "grad_norm": 0.43953707814216614, |
| "learning_rate": 0.0001, |
| "loss": 2.6595, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.03903708523096942, |
| "grad_norm": 0.29031845927238464, |
| "learning_rate": 0.0001, |
| "loss": 2.3173, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.039687703318152245, |
| "grad_norm": 0.2491024285554886, |
| "learning_rate": 0.0001, |
| "loss": 2.0575, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.04033832140533507, |
| "grad_norm": 0.3025687634944916, |
| "learning_rate": 0.0001, |
| "loss": 2.0965, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.04098893949251789, |
| "grad_norm": 0.26097819209098816, |
| "learning_rate": 0.0001, |
| "loss": 2.2583, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.04163955757970072, |
| "grad_norm": 0.2413238286972046, |
| "learning_rate": 0.0001, |
| "loss": 2.2441, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.04229017566688354, |
| "grad_norm": 0.2332315295934677, |
| "learning_rate": 0.0001, |
| "loss": 2.185, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.04294079375406636, |
| "grad_norm": 0.4037252366542816, |
| "learning_rate": 0.0001, |
| "loss": 2.3875, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.043591411841249185, |
| "grad_norm": 0.34149354696273804, |
| "learning_rate": 0.0001, |
| "loss": 2.3835, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.04424202992843201, |
| "grad_norm": 0.23793481290340424, |
| "learning_rate": 0.0001, |
| "loss": 2.3521, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.044892648015614836, |
| "grad_norm": 0.24252744019031525, |
| "learning_rate": 0.0001, |
| "loss": 2.0984, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.04554326610279766, |
| "grad_norm": 0.2870447635650635, |
| "learning_rate": 0.0001, |
| "loss": 2.5408, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.04619388418998048, |
| "grad_norm": 0.5050077438354492, |
| "learning_rate": 0.0001, |
| "loss": 2.7091, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0468445022771633, |
| "grad_norm": 0.2391565591096878, |
| "learning_rate": 0.0001, |
| "loss": 2.1601, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.04749512036434613, |
| "grad_norm": 0.20647507905960083, |
| "learning_rate": 0.0001, |
| "loss": 1.9582, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.048145738451528954, |
| "grad_norm": 0.26072338223457336, |
| "learning_rate": 0.0001, |
| "loss": 2.3577, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.048796356538711776, |
| "grad_norm": 0.28378504514694214, |
| "learning_rate": 0.0001, |
| "loss": 2.349, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0494469746258946, |
| "grad_norm": 0.2536943256855011, |
| "learning_rate": 0.0001, |
| "loss": 2.375, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.05009759271307743, |
| "grad_norm": 0.29276445508003235, |
| "learning_rate": 0.0001, |
| "loss": 2.5003, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.05074821080026025, |
| "grad_norm": 0.2649310231208801, |
| "learning_rate": 0.0001, |
| "loss": 2.3247, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.05139882888744307, |
| "grad_norm": 0.38125383853912354, |
| "learning_rate": 0.0001, |
| "loss": 2.5405, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.05204944697462589, |
| "grad_norm": 0.40980008244514465, |
| "learning_rate": 0.0001, |
| "loss": 2.212, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.052700065061808715, |
| "grad_norm": 0.5363492965698242, |
| "learning_rate": 0.0001, |
| "loss": 2.6499, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.053350683148991544, |
| "grad_norm": 0.34647300839424133, |
| "learning_rate": 0.0001, |
| "loss": 2.6302, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.054001301236174366, |
| "grad_norm": 0.27607980370521545, |
| "learning_rate": 0.0001, |
| "loss": 2.1819, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.05465191932335719, |
| "grad_norm": 0.27654680609703064, |
| "learning_rate": 0.0001, |
| "loss": 2.1763, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.05530253741054001, |
| "grad_norm": 0.24596217274665833, |
| "learning_rate": 0.0001, |
| "loss": 2.2585, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.05595315549772284, |
| "grad_norm": 0.24279890954494476, |
| "learning_rate": 0.0001, |
| "loss": 2.4247, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.05660377358490566, |
| "grad_norm": 0.2918747365474701, |
| "learning_rate": 0.0001, |
| "loss": 2.3986, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.057254391672088484, |
| "grad_norm": 0.26778745651245117, |
| "learning_rate": 0.0001, |
| "loss": 2.3592, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.057905009759271306, |
| "grad_norm": 0.39637815952301025, |
| "learning_rate": 0.0001, |
| "loss": 2.8006, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.05855562784645413, |
| "grad_norm": 0.2676962614059448, |
| "learning_rate": 0.0001, |
| "loss": 2.2384, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.05920624593363696, |
| "grad_norm": 0.3044937252998352, |
| "learning_rate": 0.0001, |
| "loss": 2.7762, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.05985686402081978, |
| "grad_norm": 0.23922136425971985, |
| "learning_rate": 0.0001, |
| "loss": 2.0873, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.0605074821080026, |
| "grad_norm": 0.25385046005249023, |
| "learning_rate": 0.0001, |
| "loss": 2.2708, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.06115810019518542, |
| "grad_norm": 0.378401517868042, |
| "learning_rate": 0.0001, |
| "loss": 3.0583, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.06180871828236825, |
| "grad_norm": 0.37193092703819275, |
| "learning_rate": 0.0001, |
| "loss": 2.3632, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.062459336369551074, |
| "grad_norm": 0.3757643699645996, |
| "learning_rate": 0.0001, |
| "loss": 2.4071, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.0631099544567339, |
| "grad_norm": 0.272833913564682, |
| "learning_rate": 0.0001, |
| "loss": 2.3989, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.06376057254391672, |
| "grad_norm": 0.26533326506614685, |
| "learning_rate": 0.0001, |
| "loss": 2.1716, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.06441119063109954, |
| "grad_norm": 0.5787199139595032, |
| "learning_rate": 0.0001, |
| "loss": 2.9445, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.06506180871828236, |
| "grad_norm": 0.29046157002449036, |
| "learning_rate": 0.0001, |
| "loss": 2.3325, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06571242680546518, |
| "grad_norm": 0.531452476978302, |
| "learning_rate": 0.0001, |
| "loss": 2.7445, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.06636304489264802, |
| "grad_norm": 0.3969165086746216, |
| "learning_rate": 0.0001, |
| "loss": 2.7126, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.06701366297983084, |
| "grad_norm": 0.24183356761932373, |
| "learning_rate": 0.0001, |
| "loss": 1.9971, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.06766428106701367, |
| "grad_norm": 0.3268399238586426, |
| "learning_rate": 0.0001, |
| "loss": 2.1055, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.06831489915419649, |
| "grad_norm": 0.2625877559185028, |
| "learning_rate": 0.0001, |
| "loss": 1.9946, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.06896551724137931, |
| "grad_norm": 0.2720443308353424, |
| "learning_rate": 0.0001, |
| "loss": 2.0764, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.06961613532856213, |
| "grad_norm": 0.20969334244728088, |
| "learning_rate": 0.0001, |
| "loss": 1.8687, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.07026675341574495, |
| "grad_norm": 0.26211223006248474, |
| "learning_rate": 0.0001, |
| "loss": 2.2042, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.07091737150292778, |
| "grad_norm": 0.27889683842658997, |
| "learning_rate": 0.0001, |
| "loss": 2.3146, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.07156798959011061, |
| "grad_norm": 0.2657179832458496, |
| "learning_rate": 0.0001, |
| "loss": 2.1021, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.07221860767729343, |
| "grad_norm": 0.26620885729789734, |
| "learning_rate": 0.0001, |
| "loss": 2.3488, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.07286922576447626, |
| "grad_norm": 0.4223373830318451, |
| "learning_rate": 0.0001, |
| "loss": 2.5289, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.07351984385165908, |
| "grad_norm": 0.35398781299591064, |
| "learning_rate": 0.0001, |
| "loss": 2.5702, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.0741704619388419, |
| "grad_norm": 0.23328129947185516, |
| "learning_rate": 0.0001, |
| "loss": 2.1292, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.07482108002602472, |
| "grad_norm": 0.33508536219596863, |
| "learning_rate": 0.0001, |
| "loss": 2.2049, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.07547169811320754, |
| "grad_norm": 0.2646953761577606, |
| "learning_rate": 0.0001, |
| "loss": 2.3445, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.07612231620039037, |
| "grad_norm": 0.27866706252098083, |
| "learning_rate": 0.0001, |
| "loss": 2.2472, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.07677293428757319, |
| "grad_norm": 0.35688602924346924, |
| "learning_rate": 0.0001, |
| "loss": 2.5045, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.07742355237475602, |
| "grad_norm": 0.24262933433055878, |
| "learning_rate": 0.0001, |
| "loss": 2.4565, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.07807417046193885, |
| "grad_norm": 0.44757333397865295, |
| "learning_rate": 0.0001, |
| "loss": 2.1619, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.07872478854912167, |
| "grad_norm": 0.3279111385345459, |
| "learning_rate": 0.0001, |
| "loss": 2.3996, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.07937540663630449, |
| "grad_norm": 0.25862693786621094, |
| "learning_rate": 0.0001, |
| "loss": 2.3214, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.08002602472348731, |
| "grad_norm": 0.30093592405319214, |
| "learning_rate": 0.0001, |
| "loss": 2.6446, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.08067664281067013, |
| "grad_norm": 0.25440871715545654, |
| "learning_rate": 0.0001, |
| "loss": 2.1181, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.08132726089785296, |
| "grad_norm": 0.19935627281665802, |
| "learning_rate": 0.0001, |
| "loss": 2.0904, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.08197787898503578, |
| "grad_norm": 0.27385473251342773, |
| "learning_rate": 0.0001, |
| "loss": 2.0829, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.0826284970722186, |
| "grad_norm": 0.24417711794376373, |
| "learning_rate": 0.0001, |
| "loss": 2.0019, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.08327911515940144, |
| "grad_norm": 0.27386653423309326, |
| "learning_rate": 0.0001, |
| "loss": 2.2743, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.08392973324658426, |
| "grad_norm": 0.22413575649261475, |
| "learning_rate": 0.0001, |
| "loss": 2.1584, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.08458035133376708, |
| "grad_norm": 0.27748343348503113, |
| "learning_rate": 0.0001, |
| "loss": 2.1428, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0852309694209499, |
| "grad_norm": 0.18890976905822754, |
| "learning_rate": 0.0001, |
| "loss": 1.9474, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.08588158750813273, |
| "grad_norm": 0.3067719340324402, |
| "learning_rate": 0.0001, |
| "loss": 2.287, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.08653220559531555, |
| "grad_norm": 0.35126858949661255, |
| "learning_rate": 0.0001, |
| "loss": 2.5086, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.08718282368249837, |
| "grad_norm": 0.19619591534137726, |
| "learning_rate": 0.0001, |
| "loss": 2.0132, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.08783344176968119, |
| "grad_norm": 0.360569566488266, |
| "learning_rate": 0.0001, |
| "loss": 2.607, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.08848405985686401, |
| "grad_norm": 0.22566738724708557, |
| "learning_rate": 0.0001, |
| "loss": 2.0942, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.08913467794404685, |
| "grad_norm": 0.27346086502075195, |
| "learning_rate": 0.0001, |
| "loss": 2.3139, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.08978529603122967, |
| "grad_norm": 0.2500152289867401, |
| "learning_rate": 0.0001, |
| "loss": 2.0815, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.0904359141184125, |
| "grad_norm": 0.22101153433322906, |
| "learning_rate": 0.0001, |
| "loss": 2.374, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.09108653220559532, |
| "grad_norm": 0.2173723727464676, |
| "learning_rate": 0.0001, |
| "loss": 2.0084, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.09173715029277814, |
| "grad_norm": 0.28956499695777893, |
| "learning_rate": 0.0001, |
| "loss": 2.6283, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.09238776837996096, |
| "grad_norm": 0.27032795548439026, |
| "learning_rate": 0.0001, |
| "loss": 2.142, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.09303838646714378, |
| "grad_norm": 0.24320480227470398, |
| "learning_rate": 0.0001, |
| "loss": 2.1402, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0936890045543266, |
| "grad_norm": 0.3127799332141876, |
| "learning_rate": 0.0001, |
| "loss": 2.6671, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.09433962264150944, |
| "grad_norm": 0.30706024169921875, |
| "learning_rate": 0.0001, |
| "loss": 2.3026, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.09499024072869226, |
| "grad_norm": 0.2378646731376648, |
| "learning_rate": 0.0001, |
| "loss": 2.0422, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.09564085881587508, |
| "grad_norm": 0.24755406379699707, |
| "learning_rate": 0.0001, |
| "loss": 2.2574, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.09629147690305791, |
| "grad_norm": 0.34464696049690247, |
| "learning_rate": 0.0001, |
| "loss": 2.2817, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.09694209499024073, |
| "grad_norm": 0.30485469102859497, |
| "learning_rate": 0.0001, |
| "loss": 2.7303, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.09759271307742355, |
| "grad_norm": 0.1860698163509369, |
| "learning_rate": 0.0001, |
| "loss": 1.8582, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09824333116460637, |
| "grad_norm": 0.23853841423988342, |
| "learning_rate": 0.0001, |
| "loss": 2.1378, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.0988939492517892, |
| "grad_norm": 0.20248261094093323, |
| "learning_rate": 0.0001, |
| "loss": 2.1888, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.09954456733897202, |
| "grad_norm": 0.3582792282104492, |
| "learning_rate": 0.0001, |
| "loss": 2.6726, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.10019518542615485, |
| "grad_norm": 0.2576686441898346, |
| "learning_rate": 0.0001, |
| "loss": 2.4494, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.10084580351333768, |
| "grad_norm": 0.306029349565506, |
| "learning_rate": 0.0001, |
| "loss": 2.2273, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.1014964216005205, |
| "grad_norm": 0.31375500559806824, |
| "learning_rate": 0.0001, |
| "loss": 2.2474, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.10214703968770332, |
| "grad_norm": 0.253250390291214, |
| "learning_rate": 0.0001, |
| "loss": 2.0142, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.10279765777488614, |
| "grad_norm": 0.3098273277282715, |
| "learning_rate": 0.0001, |
| "loss": 2.2516, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.10344827586206896, |
| "grad_norm": 0.3239591717720032, |
| "learning_rate": 0.0001, |
| "loss": 2.2432, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.10409889394925179, |
| "grad_norm": 0.24929773807525635, |
| "learning_rate": 0.0001, |
| "loss": 2.2495, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.10474951203643461, |
| "grad_norm": 0.3203783929347992, |
| "learning_rate": 0.0001, |
| "loss": 2.68, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.10540013012361743, |
| "grad_norm": 0.38844674825668335, |
| "learning_rate": 0.0001, |
| "loss": 2.7457, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.10605074821080027, |
| "grad_norm": 0.21753644943237305, |
| "learning_rate": 0.0001, |
| "loss": 2.1284, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.10670136629798309, |
| "grad_norm": 0.20610418915748596, |
| "learning_rate": 0.0001, |
| "loss": 1.8377, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.10735198438516591, |
| "grad_norm": 0.3555772304534912, |
| "learning_rate": 0.0001, |
| "loss": 2.3599, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.10800260247234873, |
| "grad_norm": 0.3971005380153656, |
| "learning_rate": 0.0001, |
| "loss": 2.2771, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.10865322055953155, |
| "grad_norm": 0.28628769516944885, |
| "learning_rate": 0.0001, |
| "loss": 2.2438, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.10930383864671438, |
| "grad_norm": 0.38728833198547363, |
| "learning_rate": 0.0001, |
| "loss": 2.4103, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.1099544567338972, |
| "grad_norm": 0.26340189576148987, |
| "learning_rate": 0.0001, |
| "loss": 2.6832, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.11060507482108002, |
| "grad_norm": 0.20119386911392212, |
| "learning_rate": 0.0001, |
| "loss": 1.9622, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.11125569290826284, |
| "grad_norm": 0.2929171621799469, |
| "learning_rate": 0.0001, |
| "loss": 2.2762, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.11190631099544568, |
| "grad_norm": 0.422146201133728, |
| "learning_rate": 0.0001, |
| "loss": 2.4015, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.1125569290826285, |
| "grad_norm": 0.29050537943840027, |
| "learning_rate": 0.0001, |
| "loss": 2.4399, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.11320754716981132, |
| "grad_norm": 0.2646816074848175, |
| "learning_rate": 0.0001, |
| "loss": 2.3058, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.11385816525699415, |
| "grad_norm": 0.2643061578273773, |
| "learning_rate": 0.0001, |
| "loss": 2.1892, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.11450878334417697, |
| "grad_norm": 0.5878323316574097, |
| "learning_rate": 0.0001, |
| "loss": 3.2198, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.11515940143135979, |
| "grad_norm": 0.36881884932518005, |
| "learning_rate": 0.0001, |
| "loss": 2.4112, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.11581001951854261, |
| "grad_norm": 0.25198304653167725, |
| "learning_rate": 0.0001, |
| "loss": 2.1667, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.11646063760572543, |
| "grad_norm": 0.34164664149284363, |
| "learning_rate": 0.0001, |
| "loss": 2.6248, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.11711125569290826, |
| "grad_norm": 0.41471973061561584, |
| "learning_rate": 0.0001, |
| "loss": 2.5616, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.11776187378009109, |
| "grad_norm": 0.26372480392456055, |
| "learning_rate": 0.0001, |
| "loss": 2.2904, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.11841249186727391, |
| "grad_norm": 0.2271176278591156, |
| "learning_rate": 0.0001, |
| "loss": 2.0312, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.11906310995445674, |
| "grad_norm": 0.2106996774673462, |
| "learning_rate": 0.0001, |
| "loss": 1.9661, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.11971372804163956, |
| "grad_norm": 0.22870291769504547, |
| "learning_rate": 0.0001, |
| "loss": 1.9052, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.12036434612882238, |
| "grad_norm": 0.41253864765167236, |
| "learning_rate": 0.0001, |
| "loss": 2.3747, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.1210149642160052, |
| "grad_norm": 0.3258817791938782, |
| "learning_rate": 0.0001, |
| "loss": 2.5401, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.12166558230318802, |
| "grad_norm": 0.3461870551109314, |
| "learning_rate": 0.0001, |
| "loss": 2.8027, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.12231620039037085, |
| "grad_norm": 0.3704046607017517, |
| "learning_rate": 0.0001, |
| "loss": 2.799, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.12296681847755368, |
| "grad_norm": 0.30265969038009644, |
| "learning_rate": 0.0001, |
| "loss": 2.4287, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.1236174365647365, |
| "grad_norm": 0.4215582013130188, |
| "learning_rate": 0.0001, |
| "loss": 2.6857, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.12426805465191933, |
| "grad_norm": 0.3003520965576172, |
| "learning_rate": 0.0001, |
| "loss": 2.4155, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.12491867273910215, |
| "grad_norm": 0.412749320268631, |
| "learning_rate": 0.0001, |
| "loss": 2.6352, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.12556929082628496, |
| "grad_norm": 0.2772350013256073, |
| "learning_rate": 0.0001, |
| "loss": 2.2452, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.1262199089134678, |
| "grad_norm": 0.21457143127918243, |
| "learning_rate": 0.0001, |
| "loss": 2.0172, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.12687052700065063, |
| "grad_norm": 0.40995845198631287, |
| "learning_rate": 0.0001, |
| "loss": 2.6218, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.12752114508783344, |
| "grad_norm": 0.2253209501504898, |
| "learning_rate": 0.0001, |
| "loss": 2.2319, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.12817176317501627, |
| "grad_norm": 0.36564287543296814, |
| "learning_rate": 0.0001, |
| "loss": 2.4585, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.12882238126219908, |
| "grad_norm": 0.41084784269332886, |
| "learning_rate": 0.0001, |
| "loss": 2.6326, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.12947299934938192, |
| "grad_norm": 0.36012157797813416, |
| "learning_rate": 0.0001, |
| "loss": 2.0168, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.13012361743656473, |
| "grad_norm": 0.5138425230979919, |
| "learning_rate": 0.0001, |
| "loss": 2.3377, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13077423552374756, |
| "grad_norm": 0.2799031436443329, |
| "learning_rate": 0.0001, |
| "loss": 2.532, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.13142485361093037, |
| "grad_norm": 0.3078779876232147, |
| "learning_rate": 0.0001, |
| "loss": 2.044, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.1320754716981132, |
| "grad_norm": 0.31270912289619446, |
| "learning_rate": 0.0001, |
| "loss": 1.8576, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.13272608978529604, |
| "grad_norm": 0.23117204010486603, |
| "learning_rate": 0.0001, |
| "loss": 2.1908, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.13337670787247885, |
| "grad_norm": 0.2531285285949707, |
| "learning_rate": 0.0001, |
| "loss": 2.143, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.1340273259596617, |
| "grad_norm": 0.28053218126296997, |
| "learning_rate": 0.0001, |
| "loss": 2.6902, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.1346779440468445, |
| "grad_norm": 0.2600589692592621, |
| "learning_rate": 0.0001, |
| "loss": 2.0355, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.13532856213402733, |
| "grad_norm": 0.2725912630558014, |
| "learning_rate": 0.0001, |
| "loss": 2.3949, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.13597918022121014, |
| "grad_norm": 0.6166338324546814, |
| "learning_rate": 0.0001, |
| "loss": 2.8146, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.13662979830839297, |
| "grad_norm": 0.4028575122356415, |
| "learning_rate": 0.0001, |
| "loss": 2.888, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.1372804163955758, |
| "grad_norm": 0.23181548714637756, |
| "learning_rate": 0.0001, |
| "loss": 2.1406, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.13793103448275862, |
| "grad_norm": 0.24338063597679138, |
| "learning_rate": 0.0001, |
| "loss": 2.1564, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.13858165256994145, |
| "grad_norm": 0.233146533370018, |
| "learning_rate": 0.0001, |
| "loss": 2.1695, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.13923227065712426, |
| "grad_norm": 0.21236726641654968, |
| "learning_rate": 0.0001, |
| "loss": 1.9272, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.1398828887443071, |
| "grad_norm": 0.25471317768096924, |
| "learning_rate": 0.0001, |
| "loss": 2.3447, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.1405335068314899, |
| "grad_norm": 0.35532835125923157, |
| "learning_rate": 0.0001, |
| "loss": 2.4328, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.14118412491867274, |
| "grad_norm": 0.32900944352149963, |
| "learning_rate": 0.0001, |
| "loss": 2.385, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.14183474300585555, |
| "grad_norm": 0.45404863357543945, |
| "learning_rate": 0.0001, |
| "loss": 2.8053, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.1424853610930384, |
| "grad_norm": 0.33968400955200195, |
| "learning_rate": 0.0001, |
| "loss": 2.4524, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.14313597918022122, |
| "grad_norm": 0.3250170946121216, |
| "learning_rate": 0.0001, |
| "loss": 2.6173, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.14378659726740403, |
| "grad_norm": 0.34765559434890747, |
| "learning_rate": 0.0001, |
| "loss": 2.8468, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.14443721535458687, |
| "grad_norm": 0.2274564653635025, |
| "learning_rate": 0.0001, |
| "loss": 2.1305, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.14508783344176968, |
| "grad_norm": 0.42719507217407227, |
| "learning_rate": 0.0001, |
| "loss": 2.3682, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.1457384515289525, |
| "grad_norm": 0.2848481833934784, |
| "learning_rate": 0.0001, |
| "loss": 2.0923, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.14638906961613532, |
| "grad_norm": 0.266548752784729, |
| "learning_rate": 0.0001, |
| "loss": 2.0393, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.14703968770331816, |
| "grad_norm": 0.24076099693775177, |
| "learning_rate": 0.0001, |
| "loss": 2.2674, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.14769030579050096, |
| "grad_norm": 0.23347622156143188, |
| "learning_rate": 0.0001, |
| "loss": 1.9455, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.1483409238776838, |
| "grad_norm": 0.3925648033618927, |
| "learning_rate": 0.0001, |
| "loss": 2.7117, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.14899154196486664, |
| "grad_norm": 0.27654924988746643, |
| "learning_rate": 0.0001, |
| "loss": 2.1306, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.14964216005204944, |
| "grad_norm": 0.2853853702545166, |
| "learning_rate": 0.0001, |
| "loss": 2.4369, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.15029277813923228, |
| "grad_norm": 0.4509859085083008, |
| "learning_rate": 0.0001, |
| "loss": 2.6047, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.1509433962264151, |
| "grad_norm": 0.2515909671783447, |
| "learning_rate": 0.0001, |
| "loss": 2.2065, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.15159401431359792, |
| "grad_norm": 0.5977367162704468, |
| "learning_rate": 0.0001, |
| "loss": 2.7133, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.15224463240078073, |
| "grad_norm": 0.30381399393081665, |
| "learning_rate": 0.0001, |
| "loss": 2.343, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.15289525048796357, |
| "grad_norm": 0.27204832434654236, |
| "learning_rate": 0.0001, |
| "loss": 2.2908, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.15354586857514638, |
| "grad_norm": 0.6246710419654846, |
| "learning_rate": 0.0001, |
| "loss": 2.7862, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.1541964866623292, |
| "grad_norm": 0.4803178012371063, |
| "learning_rate": 0.0001, |
| "loss": 3.4388, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.15484710474951205, |
| "grad_norm": 0.3038940727710724, |
| "learning_rate": 0.0001, |
| "loss": 2.7409, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.15549772283669486, |
| "grad_norm": 0.2494591474533081, |
| "learning_rate": 0.0001, |
| "loss": 2.2601, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.1561483409238777, |
| "grad_norm": 0.23808616399765015, |
| "learning_rate": 0.0001, |
| "loss": 2.1319, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1567989590110605, |
| "grad_norm": 0.3111306130886078, |
| "learning_rate": 0.0001, |
| "loss": 2.7414, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.15744957709824334, |
| "grad_norm": 0.22197599709033966, |
| "learning_rate": 0.0001, |
| "loss": 2.1346, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.15810019518542615, |
| "grad_norm": 0.2681500315666199, |
| "learning_rate": 0.0001, |
| "loss": 2.3779, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.15875081327260898, |
| "grad_norm": 0.2612643241882324, |
| "learning_rate": 0.0001, |
| "loss": 2.5743, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.1594014313597918, |
| "grad_norm": 0.201397106051445, |
| "learning_rate": 0.0001, |
| "loss": 2.0312, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.16005204944697463, |
| "grad_norm": 0.25662410259246826, |
| "learning_rate": 0.0001, |
| "loss": 2.5085, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.16070266753415746, |
| "grad_norm": 0.21460294723510742, |
| "learning_rate": 0.0001, |
| "loss": 2.1099, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.16135328562134027, |
| "grad_norm": 0.19971312582492828, |
| "learning_rate": 0.0001, |
| "loss": 2.1024, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.1620039037085231, |
| "grad_norm": 0.1986059844493866, |
| "learning_rate": 0.0001, |
| "loss": 1.9306, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.16265452179570591, |
| "grad_norm": 0.21961884200572968, |
| "learning_rate": 0.0001, |
| "loss": 2.1218, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.16330513988288875, |
| "grad_norm": 0.20071017742156982, |
| "learning_rate": 0.0001, |
| "loss": 2.0581, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.16395575797007156, |
| "grad_norm": 0.32734909653663635, |
| "learning_rate": 0.0001, |
| "loss": 2.6229, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.1646063760572544, |
| "grad_norm": 0.21822451055049896, |
| "learning_rate": 0.0001, |
| "loss": 1.9954, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.1652569941444372, |
| "grad_norm": 0.3013177216053009, |
| "learning_rate": 0.0001, |
| "loss": 2.454, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.16590761223162004, |
| "grad_norm": 0.31199347972869873, |
| "learning_rate": 0.0001, |
| "loss": 2.815, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.16655823031880287, |
| "grad_norm": 0.2255464345216751, |
| "learning_rate": 0.0001, |
| "loss": 2.0232, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.16720884840598568, |
| "grad_norm": 0.21208804845809937, |
| "learning_rate": 0.0001, |
| "loss": 1.9663, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.16785946649316852, |
| "grad_norm": 0.2432132512331009, |
| "learning_rate": 0.0001, |
| "loss": 2.4189, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.16851008458035133, |
| "grad_norm": 0.21116623282432556, |
| "learning_rate": 0.0001, |
| "loss": 2.0761, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.16916070266753416, |
| "grad_norm": 0.18722975254058838, |
| "learning_rate": 0.0001, |
| "loss": 1.9537, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.16981132075471697, |
| "grad_norm": 0.2683362662792206, |
| "learning_rate": 0.0001, |
| "loss": 2.4483, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.1704619388418998, |
| "grad_norm": 0.2739648222923279, |
| "learning_rate": 0.0001, |
| "loss": 2.3754, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.17111255692908262, |
| "grad_norm": 0.1836375594139099, |
| "learning_rate": 0.0001, |
| "loss": 2.0103, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.17176317501626545, |
| "grad_norm": 0.34002602100372314, |
| "learning_rate": 0.0001, |
| "loss": 2.2626, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.1724137931034483, |
| "grad_norm": 0.19341516494750977, |
| "learning_rate": 0.0001, |
| "loss": 1.9751, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.1730644111906311, |
| "grad_norm": 0.25080743432044983, |
| "learning_rate": 0.0001, |
| "loss": 2.2162, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.17371502927781393, |
| "grad_norm": 0.2362661212682724, |
| "learning_rate": 0.0001, |
| "loss": 2.0226, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.17436564736499674, |
| "grad_norm": 0.25844064354896545, |
| "learning_rate": 0.0001, |
| "loss": 2.3176, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.17501626545217958, |
| "grad_norm": 0.3904498517513275, |
| "learning_rate": 0.0001, |
| "loss": 2.4871, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.17566688353936238, |
| "grad_norm": 0.22143317759037018, |
| "learning_rate": 0.0001, |
| "loss": 2.2073, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.17631750162654522, |
| "grad_norm": 0.20974211394786835, |
| "learning_rate": 0.0001, |
| "loss": 2.1393, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.17696811971372803, |
| "grad_norm": 0.24463056027889252, |
| "learning_rate": 0.0001, |
| "loss": 2.0203, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.17761873780091086, |
| "grad_norm": 0.23296399414539337, |
| "learning_rate": 0.0001, |
| "loss": 2.1096, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.1782693558880937, |
| "grad_norm": 0.4122619926929474, |
| "learning_rate": 0.0001, |
| "loss": 3.1512, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.1789199739752765, |
| "grad_norm": 0.2744470536708832, |
| "learning_rate": 0.0001, |
| "loss": 2.2211, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.17957059206245934, |
| "grad_norm": 0.21010619401931763, |
| "learning_rate": 0.0001, |
| "loss": 2.2203, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.18022121014964215, |
| "grad_norm": 0.27855056524276733, |
| "learning_rate": 0.0001, |
| "loss": 2.2903, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.180871828236825, |
| "grad_norm": 0.2909989058971405, |
| "learning_rate": 0.0001, |
| "loss": 2.237, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.1815224463240078, |
| "grad_norm": 0.21754448115825653, |
| "learning_rate": 0.0001, |
| "loss": 2.0138, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.18217306441119063, |
| "grad_norm": 0.35209745168685913, |
| "learning_rate": 0.0001, |
| "loss": 2.652, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.18282368249837344, |
| "grad_norm": 0.29994750022888184, |
| "learning_rate": 0.0001, |
| "loss": 2.1868, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.18347430058555628, |
| "grad_norm": 0.2645902633666992, |
| "learning_rate": 0.0001, |
| "loss": 2.2925, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.1841249186727391, |
| "grad_norm": 0.3492202162742615, |
| "learning_rate": 0.0001, |
| "loss": 2.4176, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.18477553675992192, |
| "grad_norm": 0.256651371717453, |
| "learning_rate": 0.0001, |
| "loss": 2.3414, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.18542615484710476, |
| "grad_norm": 0.23287786543369293, |
| "learning_rate": 0.0001, |
| "loss": 2.5488, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.18607677293428757, |
| "grad_norm": 0.26059290766716003, |
| "learning_rate": 0.0001, |
| "loss": 2.4551, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.1867273910214704, |
| "grad_norm": 0.2482365071773529, |
| "learning_rate": 0.0001, |
| "loss": 2.0818, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.1873780091086532, |
| "grad_norm": 0.23024773597717285, |
| "learning_rate": 0.0001, |
| "loss": 2.2592, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.18802862719583605, |
| "grad_norm": 0.2590011656284332, |
| "learning_rate": 0.0001, |
| "loss": 2.4177, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.18867924528301888, |
| "grad_norm": 0.19760870933532715, |
| "learning_rate": 0.0001, |
| "loss": 2.0731, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.1893298633702017, |
| "grad_norm": 0.20266428589820862, |
| "learning_rate": 0.0001, |
| "loss": 2.1221, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.18998048145738453, |
| "grad_norm": 0.20199884474277496, |
| "learning_rate": 0.0001, |
| "loss": 2.0489, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.19063109954456733, |
| "grad_norm": 0.23876360058784485, |
| "learning_rate": 0.0001, |
| "loss": 2.1392, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.19128171763175017, |
| "grad_norm": 0.23555997014045715, |
| "learning_rate": 0.0001, |
| "loss": 2.4116, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.19193233571893298, |
| "grad_norm": 0.5010725259780884, |
| "learning_rate": 0.0001, |
| "loss": 2.7444, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.19258295380611581, |
| "grad_norm": 0.37809622287750244, |
| "learning_rate": 0.0001, |
| "loss": 2.2635, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.19323357189329862, |
| "grad_norm": 0.499888151884079, |
| "learning_rate": 0.0001, |
| "loss": 2.1984, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.19388418998048146, |
| "grad_norm": 0.43810585141181946, |
| "learning_rate": 0.0001, |
| "loss": 3.084, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.1945348080676643, |
| "grad_norm": 0.35633769631385803, |
| "learning_rate": 0.0001, |
| "loss": 2.0351, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.1951854261548471, |
| "grad_norm": 0.3693079650402069, |
| "learning_rate": 0.0001, |
| "loss": 1.9525, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.19583604424202994, |
| "grad_norm": 0.36550503969192505, |
| "learning_rate": 0.0001, |
| "loss": 2.2469, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.19648666232921275, |
| "grad_norm": 0.2579827308654785, |
| "learning_rate": 0.0001, |
| "loss": 2.3585, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.19713728041639558, |
| "grad_norm": 0.2603841722011566, |
| "learning_rate": 0.0001, |
| "loss": 2.3959, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.1977878985035784, |
| "grad_norm": 0.33103683590888977, |
| "learning_rate": 0.0001, |
| "loss": 2.2197, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.19843851659076123, |
| "grad_norm": 0.2977697551250458, |
| "learning_rate": 0.0001, |
| "loss": 2.2569, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.19908913467794404, |
| "grad_norm": 0.2085130512714386, |
| "learning_rate": 0.0001, |
| "loss": 2.2284, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.19973975276512687, |
| "grad_norm": 0.409212201833725, |
| "learning_rate": 0.0001, |
| "loss": 2.7014, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.2003903708523097, |
| "grad_norm": 0.2447553277015686, |
| "learning_rate": 0.0001, |
| "loss": 2.2826, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.20104098893949252, |
| "grad_norm": 0.21881726384162903, |
| "learning_rate": 0.0001, |
| "loss": 1.8573, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.20169160702667535, |
| "grad_norm": 0.24484936892986298, |
| "learning_rate": 0.0001, |
| "loss": 2.318, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.20234222511385816, |
| "grad_norm": 0.3251173198223114, |
| "learning_rate": 0.0001, |
| "loss": 2.3346, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.202992843201041, |
| "grad_norm": 0.22313712537288666, |
| "learning_rate": 0.0001, |
| "loss": 1.9119, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.2036434612882238, |
| "grad_norm": 0.3086949288845062, |
| "learning_rate": 0.0001, |
| "loss": 2.1809, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.20429407937540664, |
| "grad_norm": 0.28272122144699097, |
| "learning_rate": 0.0001, |
| "loss": 2.3335, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.20494469746258945, |
| "grad_norm": 0.208637535572052, |
| "learning_rate": 0.0001, |
| "loss": 2.1947, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.20559531554977228, |
| "grad_norm": 0.2913041114807129, |
| "learning_rate": 0.0001, |
| "loss": 2.3009, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.20624593363695512, |
| "grad_norm": 0.2813785970211029, |
| "learning_rate": 0.0001, |
| "loss": 2.0133, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.20689655172413793, |
| "grad_norm": 0.2324337363243103, |
| "learning_rate": 0.0001, |
| "loss": 2.0827, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.20754716981132076, |
| "grad_norm": 0.25195491313934326, |
| "learning_rate": 0.0001, |
| "loss": 2.5201, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.20819778789850357, |
| "grad_norm": 0.3435034453868866, |
| "learning_rate": 0.0001, |
| "loss": 2.321, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.2088484059856864, |
| "grad_norm": 0.2735581696033478, |
| "learning_rate": 0.0001, |
| "loss": 2.2218, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.20949902407286922, |
| "grad_norm": 0.2250661551952362, |
| "learning_rate": 0.0001, |
| "loss": 1.9416, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.21014964216005205, |
| "grad_norm": 0.3160262107849121, |
| "learning_rate": 0.0001, |
| "loss": 2.5494, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.21080026024723486, |
| "grad_norm": 0.3669279217720032, |
| "learning_rate": 0.0001, |
| "loss": 2.7751, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.2114508783344177, |
| "grad_norm": 0.2052752673625946, |
| "learning_rate": 0.0001, |
| "loss": 2.0139, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.21210149642160053, |
| "grad_norm": 0.2906612455844879, |
| "learning_rate": 0.0001, |
| "loss": 2.227, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.21275211450878334, |
| "grad_norm": 0.30327048897743225, |
| "learning_rate": 0.0001, |
| "loss": 2.2905, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.21340273259596618, |
| "grad_norm": 0.33950623869895935, |
| "learning_rate": 0.0001, |
| "loss": 3.0731, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.21405335068314899, |
| "grad_norm": 0.31319788098335266, |
| "learning_rate": 0.0001, |
| "loss": 2.1374, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.21470396877033182, |
| "grad_norm": 0.21442054212093353, |
| "learning_rate": 0.0001, |
| "loss": 1.7588, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.21535458685751463, |
| "grad_norm": 0.23125174641609192, |
| "learning_rate": 0.0001, |
| "loss": 1.9295, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.21600520494469747, |
| "grad_norm": 0.23220308125019073, |
| "learning_rate": 0.0001, |
| "loss": 2.2606, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.21665582303188027, |
| "grad_norm": 0.24599219858646393, |
| "learning_rate": 0.0001, |
| "loss": 2.2687, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.2173064411190631, |
| "grad_norm": 0.22226236760616302, |
| "learning_rate": 0.0001, |
| "loss": 2.1428, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.21795705920624595, |
| "grad_norm": 0.2653510570526123, |
| "learning_rate": 0.0001, |
| "loss": 2.4381, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.21860767729342875, |
| "grad_norm": 0.23770929872989655, |
| "learning_rate": 0.0001, |
| "loss": 1.9655, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.2192582953806116, |
| "grad_norm": 0.1932332068681717, |
| "learning_rate": 0.0001, |
| "loss": 1.9465, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.2199089134677944, |
| "grad_norm": 0.181661456823349, |
| "learning_rate": 0.0001, |
| "loss": 1.9912, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.22055953155497723, |
| "grad_norm": 0.22275297343730927, |
| "learning_rate": 0.0001, |
| "loss": 2.1964, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.22121014964216004, |
| "grad_norm": 0.22086840867996216, |
| "learning_rate": 0.0001, |
| "loss": 2.2216, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.22186076772934288, |
| "grad_norm": 0.22807130217552185, |
| "learning_rate": 0.0001, |
| "loss": 2.2434, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.2225113858165257, |
| "grad_norm": 0.26616647839546204, |
| "learning_rate": 0.0001, |
| "loss": 2.442, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.22316200390370852, |
| "grad_norm": 0.2841719388961792, |
| "learning_rate": 0.0001, |
| "loss": 2.2358, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.22381262199089136, |
| "grad_norm": 0.23251943290233612, |
| "learning_rate": 0.0001, |
| "loss": 2.3436, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.22446324007807417, |
| "grad_norm": 0.20406994223594666, |
| "learning_rate": 0.0001, |
| "loss": 2.101, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.225113858165257, |
| "grad_norm": 0.18677304685115814, |
| "learning_rate": 0.0001, |
| "loss": 2.0596, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.2257644762524398, |
| "grad_norm": 0.22367873787879944, |
| "learning_rate": 0.0001, |
| "loss": 2.2051, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.22641509433962265, |
| "grad_norm": 0.2521246671676636, |
| "learning_rate": 0.0001, |
| "loss": 2.1718, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.22706571242680545, |
| "grad_norm": 0.23043319582939148, |
| "learning_rate": 0.0001, |
| "loss": 2.2818, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.2277163305139883, |
| "grad_norm": 0.22021251916885376, |
| "learning_rate": 0.0001, |
| "loss": 2.0337, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2283669486011711, |
| "grad_norm": 0.18043603003025055, |
| "learning_rate": 0.0001, |
| "loss": 1.9434, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.22901756668835394, |
| "grad_norm": 0.4757142961025238, |
| "learning_rate": 0.0001, |
| "loss": 2.2467, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.22966818477553677, |
| "grad_norm": 0.30740290880203247, |
| "learning_rate": 0.0001, |
| "loss": 2.5296, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.23031880286271958, |
| "grad_norm": 0.23037666082382202, |
| "learning_rate": 0.0001, |
| "loss": 2.311, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.23096942094990242, |
| "grad_norm": 0.22314564883708954, |
| "learning_rate": 0.0001, |
| "loss": 2.0494, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.23162003903708522, |
| "grad_norm": 0.21417242288589478, |
| "learning_rate": 0.0001, |
| "loss": 2.2459, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.23227065712426806, |
| "grad_norm": 0.2895831763744354, |
| "learning_rate": 0.0001, |
| "loss": 2.2705, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.23292127521145087, |
| "grad_norm": 0.2110838145017624, |
| "learning_rate": 0.0001, |
| "loss": 2.1175, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.2335718932986337, |
| "grad_norm": 0.3999682664871216, |
| "learning_rate": 0.0001, |
| "loss": 2.6891, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.2342225113858165, |
| "grad_norm": 0.5169201493263245, |
| "learning_rate": 0.0001, |
| "loss": 2.5764, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.23487312947299935, |
| "grad_norm": 0.24382548034191132, |
| "learning_rate": 0.0001, |
| "loss": 2.1065, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.23552374756018218, |
| "grad_norm": 0.2830081582069397, |
| "learning_rate": 0.0001, |
| "loss": 2.1186, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.236174365647365, |
| "grad_norm": 0.23680554330348969, |
| "learning_rate": 0.0001, |
| "loss": 2.118, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.23682498373454783, |
| "grad_norm": 0.3790690302848816, |
| "learning_rate": 0.0001, |
| "loss": 2.3566, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.23747560182173064, |
| "grad_norm": 0.2664685845375061, |
| "learning_rate": 0.0001, |
| "loss": 2.2118, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.23812621990891347, |
| "grad_norm": 0.22439126670360565, |
| "learning_rate": 0.0001, |
| "loss": 2.0897, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.23877683799609628, |
| "grad_norm": 0.2559892237186432, |
| "learning_rate": 0.0001, |
| "loss": 2.2559, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.23942745608327912, |
| "grad_norm": 0.43989577889442444, |
| "learning_rate": 0.0001, |
| "loss": 2.5208, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.24007807417046195, |
| "grad_norm": 0.24543894827365875, |
| "learning_rate": 0.0001, |
| "loss": 2.1692, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.24072869225764476, |
| "grad_norm": 0.37020954489707947, |
| "learning_rate": 0.0001, |
| "loss": 2.1287, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.2413793103448276, |
| "grad_norm": 0.41815564036369324, |
| "learning_rate": 0.0001, |
| "loss": 2.5952, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.2420299284320104, |
| "grad_norm": 0.22579136490821838, |
| "learning_rate": 0.0001, |
| "loss": 2.2427, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.24268054651919324, |
| "grad_norm": 0.3004798889160156, |
| "learning_rate": 0.0001, |
| "loss": 2.2767, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.24333116460637605, |
| "grad_norm": 0.27470141649246216, |
| "learning_rate": 0.0001, |
| "loss": 2.092, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.24398178269355889, |
| "grad_norm": 0.25301867723464966, |
| "learning_rate": 0.0001, |
| "loss": 2.1816, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.2446324007807417, |
| "grad_norm": 0.21194620430469513, |
| "learning_rate": 0.0001, |
| "loss": 2.1322, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.24528301886792453, |
| "grad_norm": 0.28737103939056396, |
| "learning_rate": 0.0001, |
| "loss": 2.6685, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.24593363695510737, |
| "grad_norm": 0.28857922554016113, |
| "learning_rate": 0.0001, |
| "loss": 2.2219, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.24658425504229017, |
| "grad_norm": 0.29493409395217896, |
| "learning_rate": 0.0001, |
| "loss": 2.717, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.247234873129473, |
| "grad_norm": 0.33975929021835327, |
| "learning_rate": 0.0001, |
| "loss": 2.3499, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.24788549121665582, |
| "grad_norm": 0.21486152708530426, |
| "learning_rate": 0.0001, |
| "loss": 2.306, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.24853610930383865, |
| "grad_norm": 0.2686431109905243, |
| "learning_rate": 0.0001, |
| "loss": 2.0942, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.24918672739102146, |
| "grad_norm": 0.2812007963657379, |
| "learning_rate": 0.0001, |
| "loss": 2.3729, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.2498373454782043, |
| "grad_norm": 0.31875330209732056, |
| "learning_rate": 0.0001, |
| "loss": 2.5766, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.2504879635653871, |
| "grad_norm": 0.2624376714229584, |
| "learning_rate": 0.0001, |
| "loss": 2.2057, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.2511385816525699, |
| "grad_norm": 0.265286386013031, |
| "learning_rate": 0.0001, |
| "loss": 2.2405, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.2517891997397528, |
| "grad_norm": 0.3202246129512787, |
| "learning_rate": 0.0001, |
| "loss": 2.2817, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.2524398178269356, |
| "grad_norm": 0.22770161926746368, |
| "learning_rate": 0.0001, |
| "loss": 1.9564, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.2530904359141184, |
| "grad_norm": 0.3313138484954834, |
| "learning_rate": 0.0001, |
| "loss": 2.4424, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.25374105400130126, |
| "grad_norm": 0.2961839437484741, |
| "learning_rate": 0.0001, |
| "loss": 2.4122, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.25439167208848407, |
| "grad_norm": 0.24270308017730713, |
| "learning_rate": 0.0001, |
| "loss": 1.99, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.2550422901756669, |
| "grad_norm": 0.2306670844554901, |
| "learning_rate": 0.0001, |
| "loss": 2.3529, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.2556929082628497, |
| "grad_norm": 0.28387176990509033, |
| "learning_rate": 0.0001, |
| "loss": 2.0824, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.25634352635003255, |
| "grad_norm": 0.3105824291706085, |
| "learning_rate": 0.0001, |
| "loss": 2.437, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.25699414443721535, |
| "grad_norm": 0.1932361125946045, |
| "learning_rate": 0.0001, |
| "loss": 1.9747, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.25764476252439816, |
| "grad_norm": 0.31146278977394104, |
| "learning_rate": 0.0001, |
| "loss": 2.263, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.258295380611581, |
| "grad_norm": 0.24420365691184998, |
| "learning_rate": 0.0001, |
| "loss": 2.015, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.25894599869876384, |
| "grad_norm": 0.24144989252090454, |
| "learning_rate": 0.0001, |
| "loss": 2.2536, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.25959661678594664, |
| "grad_norm": 0.3478517532348633, |
| "learning_rate": 0.0001, |
| "loss": 2.5835, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.26024723487312945, |
| "grad_norm": 0.24381348490715027, |
| "learning_rate": 0.0001, |
| "loss": 2.2439, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2608978529603123, |
| "grad_norm": 0.2834983468055725, |
| "learning_rate": 0.0001, |
| "loss": 2.3991, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.2615484710474951, |
| "grad_norm": 0.28689858317375183, |
| "learning_rate": 0.0001, |
| "loss": 1.9156, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.26219908913467793, |
| "grad_norm": 0.23692357540130615, |
| "learning_rate": 0.0001, |
| "loss": 2.0189, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.26284970722186074, |
| "grad_norm": 0.30104926228523254, |
| "learning_rate": 0.0001, |
| "loss": 2.4945, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.2635003253090436, |
| "grad_norm": 0.23472270369529724, |
| "learning_rate": 0.0001, |
| "loss": 1.8892, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.2641509433962264, |
| "grad_norm": 0.31508034467697144, |
| "learning_rate": 0.0001, |
| "loss": 2.4935, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.2648015614834092, |
| "grad_norm": 0.25103551149368286, |
| "learning_rate": 0.0001, |
| "loss": 2.4428, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.2654521795705921, |
| "grad_norm": 0.2387259602546692, |
| "learning_rate": 0.0001, |
| "loss": 2.0989, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.2661027976577749, |
| "grad_norm": 0.2606028616428375, |
| "learning_rate": 0.0001, |
| "loss": 1.9494, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.2667534157449577, |
| "grad_norm": 0.25114724040031433, |
| "learning_rate": 0.0001, |
| "loss": 2.2432, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2674040338321405, |
| "grad_norm": 0.3072582483291626, |
| "learning_rate": 0.0001, |
| "loss": 2.3506, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.2680546519193234, |
| "grad_norm": 0.23917561769485474, |
| "learning_rate": 0.0001, |
| "loss": 2.2665, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.2687052700065062, |
| "grad_norm": 0.2120814174413681, |
| "learning_rate": 0.0001, |
| "loss": 1.9625, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.269355888093689, |
| "grad_norm": 0.22003813087940216, |
| "learning_rate": 0.0001, |
| "loss": 2.1179, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.27000650618087185, |
| "grad_norm": 0.33217060565948486, |
| "learning_rate": 0.0001, |
| "loss": 2.6353, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.27065712426805466, |
| "grad_norm": 0.2260630577802658, |
| "learning_rate": 0.0001, |
| "loss": 2.0355, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.27130774235523747, |
| "grad_norm": 0.30081093311309814, |
| "learning_rate": 0.0001, |
| "loss": 2.1825, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.2719583604424203, |
| "grad_norm": 0.27275893092155457, |
| "learning_rate": 0.0001, |
| "loss": 2.6183, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.27260897852960314, |
| "grad_norm": 0.4902358651161194, |
| "learning_rate": 0.0001, |
| "loss": 3.0888, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.27325959661678595, |
| "grad_norm": 0.21213112771511078, |
| "learning_rate": 0.0001, |
| "loss": 2.1172, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.27391021470396876, |
| "grad_norm": 0.35953450202941895, |
| "learning_rate": 0.0001, |
| "loss": 2.5109, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.2745608327911516, |
| "grad_norm": 0.2081584334373474, |
| "learning_rate": 0.0001, |
| "loss": 2.0894, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.27521145087833443, |
| "grad_norm": 0.20892906188964844, |
| "learning_rate": 0.0001, |
| "loss": 1.9643, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.27586206896551724, |
| "grad_norm": 0.30058735609054565, |
| "learning_rate": 0.0001, |
| "loss": 2.6503, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.27651268705270005, |
| "grad_norm": 0.32902124524116516, |
| "learning_rate": 0.0001, |
| "loss": 2.3271, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.2771633051398829, |
| "grad_norm": 0.2003614902496338, |
| "learning_rate": 0.0001, |
| "loss": 1.9881, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.2778139232270657, |
| "grad_norm": 0.33349111676216125, |
| "learning_rate": 0.0001, |
| "loss": 2.7625, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.2784645413142485, |
| "grad_norm": 0.25051257014274597, |
| "learning_rate": 0.0001, |
| "loss": 2.0825, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.27911515940143133, |
| "grad_norm": 0.3301559388637543, |
| "learning_rate": 0.0001, |
| "loss": 2.85, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.2797657774886142, |
| "grad_norm": 0.18224254250526428, |
| "learning_rate": 0.0001, |
| "loss": 1.9687, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.280416395575797, |
| "grad_norm": 0.21809989213943481, |
| "learning_rate": 0.0001, |
| "loss": 2.2596, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.2810670136629798, |
| "grad_norm": 0.2473779171705246, |
| "learning_rate": 0.0001, |
| "loss": 2.2042, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.2817176317501627, |
| "grad_norm": 0.20744885504245758, |
| "learning_rate": 0.0001, |
| "loss": 2.1546, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.2823682498373455, |
| "grad_norm": 0.2620698809623718, |
| "learning_rate": 0.0001, |
| "loss": 2.5195, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.2830188679245283, |
| "grad_norm": 0.291421115398407, |
| "learning_rate": 0.0001, |
| "loss": 2.4983, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.2836694860117111, |
| "grad_norm": 0.3294708728790283, |
| "learning_rate": 0.0001, |
| "loss": 2.3146, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.28432010409889397, |
| "grad_norm": 0.26191362738609314, |
| "learning_rate": 0.0001, |
| "loss": 2.2818, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.2849707221860768, |
| "grad_norm": 0.29155483841896057, |
| "learning_rate": 0.0001, |
| "loss": 2.4888, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.2856213402732596, |
| "grad_norm": 0.19482360780239105, |
| "learning_rate": 0.0001, |
| "loss": 2.0061, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.28627195836044245, |
| "grad_norm": 0.2594612240791321, |
| "learning_rate": 0.0001, |
| "loss": 2.1891, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.28692257644762525, |
| "grad_norm": 0.21656309068202972, |
| "learning_rate": 0.0001, |
| "loss": 1.7911, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.28757319453480806, |
| "grad_norm": 0.18664829432964325, |
| "learning_rate": 0.0001, |
| "loss": 1.9634, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.28822381262199087, |
| "grad_norm": 0.2178332507610321, |
| "learning_rate": 0.0001, |
| "loss": 2.32, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.28887443070917374, |
| "grad_norm": 0.351418673992157, |
| "learning_rate": 0.0001, |
| "loss": 3.0873, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.28952504879635654, |
| "grad_norm": 0.23604457080364227, |
| "learning_rate": 0.0001, |
| "loss": 2.46, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.29017566688353935, |
| "grad_norm": 0.2599848806858063, |
| "learning_rate": 0.0001, |
| "loss": 2.0207, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.29082628497072216, |
| "grad_norm": 0.340314120054245, |
| "learning_rate": 0.0001, |
| "loss": 2.279, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.291476903057905, |
| "grad_norm": 0.23228399455547333, |
| "learning_rate": 0.0001, |
| "loss": 2.3561, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.29212752114508783, |
| "grad_norm": 0.25504687428474426, |
| "learning_rate": 0.0001, |
| "loss": 2.2251, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.29277813923227064, |
| "grad_norm": 0.2465014010667801, |
| "learning_rate": 0.0001, |
| "loss": 2.1031, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2934287573194535, |
| "grad_norm": 0.2188328504562378, |
| "learning_rate": 0.0001, |
| "loss": 2.1483, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.2940793754066363, |
| "grad_norm": 0.24546551704406738, |
| "learning_rate": 0.0001, |
| "loss": 2.2334, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.2947299934938191, |
| "grad_norm": 0.23416215181350708, |
| "learning_rate": 0.0001, |
| "loss": 2.1846, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.29538061158100193, |
| "grad_norm": 0.25267231464385986, |
| "learning_rate": 0.0001, |
| "loss": 2.2134, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.2960312296681848, |
| "grad_norm": 0.26632416248321533, |
| "learning_rate": 0.0001, |
| "loss": 2.5012, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.2966818477553676, |
| "grad_norm": 0.18289139866828918, |
| "learning_rate": 0.0001, |
| "loss": 2.0524, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.2973324658425504, |
| "grad_norm": 0.19033563137054443, |
| "learning_rate": 0.0001, |
| "loss": 2.0165, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.2979830839297333, |
| "grad_norm": 0.200730562210083, |
| "learning_rate": 0.0001, |
| "loss": 1.8021, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.2986337020169161, |
| "grad_norm": 0.2109062522649765, |
| "learning_rate": 0.0001, |
| "loss": 2.0655, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.2992843201040989, |
| "grad_norm": 0.23461318016052246, |
| "learning_rate": 0.0001, |
| "loss": 2.3335, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.2999349381912817, |
| "grad_norm": 0.2085726112127304, |
| "learning_rate": 0.0001, |
| "loss": 2.0061, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.30058555627846456, |
| "grad_norm": 0.2938329875469208, |
| "learning_rate": 0.0001, |
| "loss": 2.5245, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.30123617436564737, |
| "grad_norm": 0.22131232917308807, |
| "learning_rate": 0.0001, |
| "loss": 2.4115, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.3018867924528302, |
| "grad_norm": 0.3459152579307556, |
| "learning_rate": 0.0001, |
| "loss": 2.3896, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.302537410540013, |
| "grad_norm": 0.27464184165000916, |
| "learning_rate": 0.0001, |
| "loss": 2.6592, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.30318802862719585, |
| "grad_norm": 0.28379327058792114, |
| "learning_rate": 0.0001, |
| "loss": 2.1453, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.30383864671437866, |
| "grad_norm": 0.28283926844596863, |
| "learning_rate": 0.0001, |
| "loss": 2.1704, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.30448926480156147, |
| "grad_norm": 0.22243599593639374, |
| "learning_rate": 0.0001, |
| "loss": 2.1175, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.30513988288874433, |
| "grad_norm": 0.22331124544143677, |
| "learning_rate": 0.0001, |
| "loss": 1.8857, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.30579050097592714, |
| "grad_norm": 0.21995989978313446, |
| "learning_rate": 0.0001, |
| "loss": 2.1316, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.30644111906310995, |
| "grad_norm": 0.21140341460704803, |
| "learning_rate": 0.0001, |
| "loss": 2.0742, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.30709173715029275, |
| "grad_norm": 0.31053757667541504, |
| "learning_rate": 0.0001, |
| "loss": 2.615, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.3077423552374756, |
| "grad_norm": 0.2768484354019165, |
| "learning_rate": 0.0001, |
| "loss": 2.713, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.3083929733246584, |
| "grad_norm": 0.2538318336009979, |
| "learning_rate": 0.0001, |
| "loss": 2.1917, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.30904359141184123, |
| "grad_norm": 0.2105240672826767, |
| "learning_rate": 0.0001, |
| "loss": 2.2741, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.3096942094990241, |
| "grad_norm": 0.2915903925895691, |
| "learning_rate": 0.0001, |
| "loss": 2.115, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.3103448275862069, |
| "grad_norm": 0.30282047390937805, |
| "learning_rate": 0.0001, |
| "loss": 2.7806, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.3109954456733897, |
| "grad_norm": 0.2707601487636566, |
| "learning_rate": 0.0001, |
| "loss": 2.6137, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.3116460637605725, |
| "grad_norm": 0.34574300050735474, |
| "learning_rate": 0.0001, |
| "loss": 2.5957, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.3122966818477554, |
| "grad_norm": 0.22767509520053864, |
| "learning_rate": 0.0001, |
| "loss": 2.3543, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.3129472999349382, |
| "grad_norm": 0.25194215774536133, |
| "learning_rate": 0.0001, |
| "loss": 2.6586, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.313597918022121, |
| "grad_norm": 0.20427219569683075, |
| "learning_rate": 0.0001, |
| "loss": 1.9091, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.3142485361093038, |
| "grad_norm": 0.2993704378604889, |
| "learning_rate": 0.0001, |
| "loss": 2.4704, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.3148991541964867, |
| "grad_norm": 0.18951758742332458, |
| "learning_rate": 0.0001, |
| "loss": 2.1108, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.3155497722836695, |
| "grad_norm": 0.2622709572315216, |
| "learning_rate": 0.0001, |
| "loss": 2.4144, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.3162003903708523, |
| "grad_norm": 0.20735126733779907, |
| "learning_rate": 0.0001, |
| "loss": 2.3065, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.31685100845803515, |
| "grad_norm": 0.22782085835933685, |
| "learning_rate": 0.0001, |
| "loss": 2.4377, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.31750162654521796, |
| "grad_norm": 0.2568935453891754, |
| "learning_rate": 0.0001, |
| "loss": 2.1199, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.31815224463240077, |
| "grad_norm": 0.23917409777641296, |
| "learning_rate": 0.0001, |
| "loss": 2.2457, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.3188028627195836, |
| "grad_norm": 0.21531902253627777, |
| "learning_rate": 0.0001, |
| "loss": 2.0489, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.31945348080676644, |
| "grad_norm": 0.21461109817028046, |
| "learning_rate": 0.0001, |
| "loss": 2.1915, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.32010409889394925, |
| "grad_norm": 0.2458680123090744, |
| "learning_rate": 0.0001, |
| "loss": 2.3939, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.32075471698113206, |
| "grad_norm": 0.2617323696613312, |
| "learning_rate": 0.0001, |
| "loss": 2.5611, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.3214053350683149, |
| "grad_norm": 0.22562618553638458, |
| "learning_rate": 0.0001, |
| "loss": 2.2703, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.32205595315549773, |
| "grad_norm": 0.2290688008069992, |
| "learning_rate": 0.0001, |
| "loss": 2.3049, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.32270657124268054, |
| "grad_norm": 0.4118833541870117, |
| "learning_rate": 0.0001, |
| "loss": 2.9194, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.32335718932986335, |
| "grad_norm": 0.22502999007701874, |
| "learning_rate": 0.0001, |
| "loss": 2.2362, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.3240078074170462, |
| "grad_norm": 0.23599191009998322, |
| "learning_rate": 0.0001, |
| "loss": 2.35, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.324658425504229, |
| "grad_norm": 0.3065047860145569, |
| "learning_rate": 0.0001, |
| "loss": 2.3984, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.32530904359141183, |
| "grad_norm": 0.19241982698440552, |
| "learning_rate": 0.0001, |
| "loss": 1.8787, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3259596616785947, |
| "grad_norm": 0.20695632696151733, |
| "learning_rate": 0.0001, |
| "loss": 1.9397, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.3266102797657775, |
| "grad_norm": 0.1998564749956131, |
| "learning_rate": 0.0001, |
| "loss": 2.1463, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.3272608978529603, |
| "grad_norm": 0.27775317430496216, |
| "learning_rate": 0.0001, |
| "loss": 2.7956, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.3279115159401431, |
| "grad_norm": 0.2393936961889267, |
| "learning_rate": 0.0001, |
| "loss": 2.3785, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.328562134027326, |
| "grad_norm": 0.20921163260936737, |
| "learning_rate": 0.0001, |
| "loss": 2.1909, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.3292127521145088, |
| "grad_norm": 0.25875911116600037, |
| "learning_rate": 0.0001, |
| "loss": 2.129, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.3298633702016916, |
| "grad_norm": 0.2382909208536148, |
| "learning_rate": 0.0001, |
| "loss": 2.3786, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.3305139882888744, |
| "grad_norm": 0.19657136499881744, |
| "learning_rate": 0.0001, |
| "loss": 1.951, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.33116460637605727, |
| "grad_norm": 0.23688004910945892, |
| "learning_rate": 0.0001, |
| "loss": 2.4348, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.3318152244632401, |
| "grad_norm": 0.1988734006881714, |
| "learning_rate": 0.0001, |
| "loss": 2.2352, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3324658425504229, |
| "grad_norm": 0.2078763097524643, |
| "learning_rate": 0.0001, |
| "loss": 2.1376, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.33311646063760575, |
| "grad_norm": 0.18860888481140137, |
| "learning_rate": 0.0001, |
| "loss": 1.9367, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.33376707872478856, |
| "grad_norm": 0.30205249786376953, |
| "learning_rate": 0.0001, |
| "loss": 2.6822, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.33441769681197137, |
| "grad_norm": 0.2146618664264679, |
| "learning_rate": 0.0001, |
| "loss": 2.1927, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.3350683148991542, |
| "grad_norm": 0.19332504272460938, |
| "learning_rate": 0.0001, |
| "loss": 2.0442, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.33571893298633704, |
| "grad_norm": 0.2289431244134903, |
| "learning_rate": 0.0001, |
| "loss": 2.0152, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.33636955107351985, |
| "grad_norm": 0.21815945208072662, |
| "learning_rate": 0.0001, |
| "loss": 2.0015, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.33702016916070265, |
| "grad_norm": 0.2226189821958542, |
| "learning_rate": 0.0001, |
| "loss": 2.2989, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.3376707872478855, |
| "grad_norm": 0.22195078432559967, |
| "learning_rate": 0.0001, |
| "loss": 2.2237, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.3383214053350683, |
| "grad_norm": 0.1946515589952469, |
| "learning_rate": 0.0001, |
| "loss": 1.9459, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.33897202342225113, |
| "grad_norm": 0.21510568261146545, |
| "learning_rate": 0.0001, |
| "loss": 2.1305, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.33962264150943394, |
| "grad_norm": 0.23448903858661652, |
| "learning_rate": 0.0001, |
| "loss": 2.1838, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.3402732595966168, |
| "grad_norm": 0.19046911597251892, |
| "learning_rate": 0.0001, |
| "loss": 1.9739, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.3409238776837996, |
| "grad_norm": 0.2314033806324005, |
| "learning_rate": 0.0001, |
| "loss": 2.2053, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.3415744957709824, |
| "grad_norm": 0.2206612378358841, |
| "learning_rate": 0.0001, |
| "loss": 2.2566, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.34222511385816523, |
| "grad_norm": 0.19578076899051666, |
| "learning_rate": 0.0001, |
| "loss": 2.045, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.3428757319453481, |
| "grad_norm": 0.1787755936384201, |
| "learning_rate": 0.0001, |
| "loss": 1.8942, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.3435263500325309, |
| "grad_norm": 0.20091751217842102, |
| "learning_rate": 0.0001, |
| "loss": 2.1576, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.3441769681197137, |
| "grad_norm": 0.21869762241840363, |
| "learning_rate": 0.0001, |
| "loss": 2.1938, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.3448275862068966, |
| "grad_norm": 0.26101449131965637, |
| "learning_rate": 0.0001, |
| "loss": 2.3642, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.3454782042940794, |
| "grad_norm": 0.21874766051769257, |
| "learning_rate": 0.0001, |
| "loss": 2.4553, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.3461288223812622, |
| "grad_norm": 0.224325492978096, |
| "learning_rate": 0.0001, |
| "loss": 2.2959, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.346779440468445, |
| "grad_norm": 0.21268363296985626, |
| "learning_rate": 0.0001, |
| "loss": 2.1021, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.34743005855562786, |
| "grad_norm": 0.20979231595993042, |
| "learning_rate": 0.0001, |
| "loss": 2.0304, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.34808067664281067, |
| "grad_norm": 0.19552691280841827, |
| "learning_rate": 0.0001, |
| "loss": 1.9747, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.3487312947299935, |
| "grad_norm": 0.27929842472076416, |
| "learning_rate": 0.0001, |
| "loss": 2.445, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.34938191281717634, |
| "grad_norm": 0.19953188300132751, |
| "learning_rate": 0.0001, |
| "loss": 1.9766, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.35003253090435915, |
| "grad_norm": 0.29898926615715027, |
| "learning_rate": 0.0001, |
| "loss": 2.4818, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.35068314899154196, |
| "grad_norm": 0.18719644844532013, |
| "learning_rate": 0.0001, |
| "loss": 1.9046, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.35133376707872477, |
| "grad_norm": 0.2602563798427582, |
| "learning_rate": 0.0001, |
| "loss": 2.1539, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.35198438516590763, |
| "grad_norm": 0.23460406064987183, |
| "learning_rate": 0.0001, |
| "loss": 2.3826, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.35263500325309044, |
| "grad_norm": 0.2821134328842163, |
| "learning_rate": 0.0001, |
| "loss": 2.223, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.35328562134027325, |
| "grad_norm": 0.2641044557094574, |
| "learning_rate": 0.0001, |
| "loss": 2.2402, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.35393623942745606, |
| "grad_norm": 0.21963565051555634, |
| "learning_rate": 0.0001, |
| "loss": 2.3988, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.3545868575146389, |
| "grad_norm": 0.26475685834884644, |
| "learning_rate": 0.0001, |
| "loss": 2.3046, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.35523747560182173, |
| "grad_norm": 0.27148157358169556, |
| "learning_rate": 0.0001, |
| "loss": 2.5076, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.35588809368900454, |
| "grad_norm": 0.28925588726997375, |
| "learning_rate": 0.0001, |
| "loss": 2.8395, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.3565387117761874, |
| "grad_norm": 0.22953632473945618, |
| "learning_rate": 0.0001, |
| "loss": 2.1198, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.3571893298633702, |
| "grad_norm": 0.23960557579994202, |
| "learning_rate": 0.0001, |
| "loss": 2.3064, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.357839947950553, |
| "grad_norm": 0.3133333921432495, |
| "learning_rate": 0.0001, |
| "loss": 2.6034, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3584905660377358, |
| "grad_norm": 0.21745215356349945, |
| "learning_rate": 0.0001, |
| "loss": 2.4553, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.3591411841249187, |
| "grad_norm": 0.23547130823135376, |
| "learning_rate": 0.0001, |
| "loss": 2.0469, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.3597918022121015, |
| "grad_norm": 0.2646094262599945, |
| "learning_rate": 0.0001, |
| "loss": 1.9016, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.3604424202992843, |
| "grad_norm": 0.3079530596733093, |
| "learning_rate": 0.0001, |
| "loss": 2.8979, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.36109303838646717, |
| "grad_norm": 0.38223740458488464, |
| "learning_rate": 0.0001, |
| "loss": 3.066, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.36174365647365, |
| "grad_norm": 0.2535337209701538, |
| "learning_rate": 0.0001, |
| "loss": 2.1327, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.3623942745608328, |
| "grad_norm": 0.2373637855052948, |
| "learning_rate": 0.0001, |
| "loss": 2.1141, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.3630448926480156, |
| "grad_norm": 0.19437271356582642, |
| "learning_rate": 0.0001, |
| "loss": 1.9753, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.36369551073519846, |
| "grad_norm": 0.20236878097057343, |
| "learning_rate": 0.0001, |
| "loss": 2.2516, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.36434612882238127, |
| "grad_norm": 0.21252363920211792, |
| "learning_rate": 0.0001, |
| "loss": 2.3645, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3649967469095641, |
| "grad_norm": 0.21689258515834808, |
| "learning_rate": 0.0001, |
| "loss": 2.1145, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.3656473649967469, |
| "grad_norm": 0.22365228831768036, |
| "learning_rate": 0.0001, |
| "loss": 2.3083, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.36629798308392975, |
| "grad_norm": 0.21607807278633118, |
| "learning_rate": 0.0001, |
| "loss": 2.3199, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.36694860117111255, |
| "grad_norm": 0.1885683536529541, |
| "learning_rate": 0.0001, |
| "loss": 1.9303, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.36759921925829536, |
| "grad_norm": 0.20064905285835266, |
| "learning_rate": 0.0001, |
| "loss": 2.0661, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.3682498373454782, |
| "grad_norm": 0.23532240092754364, |
| "learning_rate": 0.0001, |
| "loss": 2.6942, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.36890045543266103, |
| "grad_norm": 0.22937807440757751, |
| "learning_rate": 0.0001, |
| "loss": 2.1962, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.36955107351984384, |
| "grad_norm": 0.2540866732597351, |
| "learning_rate": 0.0001, |
| "loss": 2.5012, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.37020169160702665, |
| "grad_norm": 0.23405294120311737, |
| "learning_rate": 0.0001, |
| "loss": 2.2439, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.3708523096942095, |
| "grad_norm": 0.24394820630550385, |
| "learning_rate": 0.0001, |
| "loss": 2.0741, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.3715029277813923, |
| "grad_norm": 0.2063736468553543, |
| "learning_rate": 0.0001, |
| "loss": 2.0864, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.37215354586857513, |
| "grad_norm": 0.3300686180591583, |
| "learning_rate": 0.0001, |
| "loss": 2.4983, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.372804163955758, |
| "grad_norm": 0.21294772624969482, |
| "learning_rate": 0.0001, |
| "loss": 2.2273, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.3734547820429408, |
| "grad_norm": 0.2629190981388092, |
| "learning_rate": 0.0001, |
| "loss": 2.1732, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.3741054001301236, |
| "grad_norm": 0.2141999751329422, |
| "learning_rate": 0.0001, |
| "loss": 2.3038, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.3747560182173064, |
| "grad_norm": 0.3467566668987274, |
| "learning_rate": 0.0001, |
| "loss": 2.7748, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.3754066363044893, |
| "grad_norm": 0.3112248182296753, |
| "learning_rate": 0.0001, |
| "loss": 2.2376, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.3760572543916721, |
| "grad_norm": 0.21217738091945648, |
| "learning_rate": 0.0001, |
| "loss": 1.9146, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.3767078724788549, |
| "grad_norm": 0.19359458982944489, |
| "learning_rate": 0.0001, |
| "loss": 2.0913, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 0.27635738253593445, |
| "learning_rate": 0.0001, |
| "loss": 2.2855, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.37800910865322057, |
| "grad_norm": 0.19366882741451263, |
| "learning_rate": 0.0001, |
| "loss": 2.0194, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.3786597267404034, |
| "grad_norm": 0.2016839236021042, |
| "learning_rate": 0.0001, |
| "loss": 2.1519, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.3793103448275862, |
| "grad_norm": 0.22154097259044647, |
| "learning_rate": 0.0001, |
| "loss": 1.9849, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.37996096291476905, |
| "grad_norm": 0.2089187502861023, |
| "learning_rate": 0.0001, |
| "loss": 2.3624, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.38061158100195186, |
| "grad_norm": 0.25050756335258484, |
| "learning_rate": 0.0001, |
| "loss": 2.1773, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.38126219908913467, |
| "grad_norm": 0.23007918894290924, |
| "learning_rate": 0.0001, |
| "loss": 2.2054, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.3819128171763175, |
| "grad_norm": 0.25022968649864197, |
| "learning_rate": 0.0001, |
| "loss": 2.219, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.38256343526350034, |
| "grad_norm": 0.2205193042755127, |
| "learning_rate": 0.0001, |
| "loss": 2.2049, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.38321405335068315, |
| "grad_norm": 0.21454961597919464, |
| "learning_rate": 0.0001, |
| "loss": 2.0683, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.38386467143786596, |
| "grad_norm": 0.2088347226381302, |
| "learning_rate": 0.0001, |
| "loss": 2.1301, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3845152895250488, |
| "grad_norm": 0.20322394371032715, |
| "learning_rate": 0.0001, |
| "loss": 2.2098, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.38516590761223163, |
| "grad_norm": 0.231514111161232, |
| "learning_rate": 0.0001, |
| "loss": 2.5523, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.38581652569941444, |
| "grad_norm": 0.24791982769966125, |
| "learning_rate": 0.0001, |
| "loss": 2.2259, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.38646714378659724, |
| "grad_norm": 0.21148578822612762, |
| "learning_rate": 0.0001, |
| "loss": 2.0834, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.3871177618737801, |
| "grad_norm": 0.263713538646698, |
| "learning_rate": 0.0001, |
| "loss": 2.3101, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.3877683799609629, |
| "grad_norm": 0.22197774052619934, |
| "learning_rate": 0.0001, |
| "loss": 2.1173, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.3884189980481457, |
| "grad_norm": 0.2237439900636673, |
| "learning_rate": 0.0001, |
| "loss": 2.1109, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.3890696161353286, |
| "grad_norm": 0.27451419830322266, |
| "learning_rate": 0.0001, |
| "loss": 2.5311, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.3897202342225114, |
| "grad_norm": 0.18475750088691711, |
| "learning_rate": 0.0001, |
| "loss": 1.9241, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.3903708523096942, |
| "grad_norm": 0.20120149850845337, |
| "learning_rate": 0.0001, |
| "loss": 2.1033, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.391021470396877, |
| "grad_norm": 0.19626259803771973, |
| "learning_rate": 0.0001, |
| "loss": 2.1223, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.3916720884840599, |
| "grad_norm": 0.22795897722244263, |
| "learning_rate": 0.0001, |
| "loss": 2.2021, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.3923227065712427, |
| "grad_norm": 0.5195867419242859, |
| "learning_rate": 0.0001, |
| "loss": 3.1849, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.3929733246584255, |
| "grad_norm": 0.2636241614818573, |
| "learning_rate": 0.0001, |
| "loss": 2.0739, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.3936239427456083, |
| "grad_norm": 0.33922895789146423, |
| "learning_rate": 0.0001, |
| "loss": 2.31, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.39427456083279117, |
| "grad_norm": 0.17467042803764343, |
| "learning_rate": 0.0001, |
| "loss": 1.9201, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.394925178919974, |
| "grad_norm": 0.22457371652126312, |
| "learning_rate": 0.0001, |
| "loss": 1.9783, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.3955757970071568, |
| "grad_norm": 0.5104444026947021, |
| "learning_rate": 0.0001, |
| "loss": 2.3777, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.39622641509433965, |
| "grad_norm": 0.4531616270542145, |
| "learning_rate": 0.0001, |
| "loss": 2.8208, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.39687703318152245, |
| "grad_norm": 0.20649151504039764, |
| "learning_rate": 0.0001, |
| "loss": 2.1377, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.39752765126870526, |
| "grad_norm": 0.39769667387008667, |
| "learning_rate": 0.0001, |
| "loss": 2.2228, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.39817826935588807, |
| "grad_norm": 0.2832731008529663, |
| "learning_rate": 0.0001, |
| "loss": 1.9664, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.39882888744307093, |
| "grad_norm": 0.2754386067390442, |
| "learning_rate": 0.0001, |
| "loss": 2.5595, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.39947950553025374, |
| "grad_norm": 0.404364675283432, |
| "learning_rate": 0.0001, |
| "loss": 2.8133, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.40013012361743655, |
| "grad_norm": 0.30304789543151855, |
| "learning_rate": 0.0001, |
| "loss": 2.2729, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.4007807417046194, |
| "grad_norm": 0.2519910931587219, |
| "learning_rate": 0.0001, |
| "loss": 2.3655, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.4014313597918022, |
| "grad_norm": 0.2863995134830475, |
| "learning_rate": 0.0001, |
| "loss": 2.0774, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.40208197787898503, |
| "grad_norm": 0.393622487783432, |
| "learning_rate": 0.0001, |
| "loss": 2.5082, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.40273259596616784, |
| "grad_norm": 0.21836060285568237, |
| "learning_rate": 0.0001, |
| "loss": 1.9548, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.4033832140533507, |
| "grad_norm": 0.358052521944046, |
| "learning_rate": 0.0001, |
| "loss": 2.5158, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.4040338321405335, |
| "grad_norm": 0.237140953540802, |
| "learning_rate": 0.0001, |
| "loss": 2.2111, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.4046844502277163, |
| "grad_norm": 0.20998883247375488, |
| "learning_rate": 0.0001, |
| "loss": 2.1351, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.4053350683148991, |
| "grad_norm": 0.18059247732162476, |
| "learning_rate": 0.0001, |
| "loss": 1.9451, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.405985686402082, |
| "grad_norm": 0.17532669007778168, |
| "learning_rate": 0.0001, |
| "loss": 1.8591, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.4066363044892648, |
| "grad_norm": 0.24097976088523865, |
| "learning_rate": 0.0001, |
| "loss": 2.6534, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.4072869225764476, |
| "grad_norm": 0.19505445659160614, |
| "learning_rate": 0.0001, |
| "loss": 1.8952, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.40793754066363047, |
| "grad_norm": 0.232722207903862, |
| "learning_rate": 0.0001, |
| "loss": 2.2055, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.4085881587508133, |
| "grad_norm": 0.23899732530117035, |
| "learning_rate": 0.0001, |
| "loss": 2.5848, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.4092387768379961, |
| "grad_norm": 0.2411729097366333, |
| "learning_rate": 0.0001, |
| "loss": 2.5315, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.4098893949251789, |
| "grad_norm": 0.25042012333869934, |
| "learning_rate": 0.0001, |
| "loss": 2.4154, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.41054001301236176, |
| "grad_norm": 0.2764488160610199, |
| "learning_rate": 0.0001, |
| "loss": 2.0564, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.41119063109954457, |
| "grad_norm": 0.24761155247688293, |
| "learning_rate": 0.0001, |
| "loss": 2.3245, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.4118412491867274, |
| "grad_norm": 0.22376200556755066, |
| "learning_rate": 0.0001, |
| "loss": 2.1881, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.41249186727391024, |
| "grad_norm": 0.19060148298740387, |
| "learning_rate": 0.0001, |
| "loss": 1.9588, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.41314248536109305, |
| "grad_norm": 0.4157400131225586, |
| "learning_rate": 0.0001, |
| "loss": 2.9024, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.41379310344827586, |
| "grad_norm": 0.2557002007961273, |
| "learning_rate": 0.0001, |
| "loss": 1.9819, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.41444372153545866, |
| "grad_norm": 0.2908417880535126, |
| "learning_rate": 0.0001, |
| "loss": 2.112, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.41509433962264153, |
| "grad_norm": 0.32937270402908325, |
| "learning_rate": 0.0001, |
| "loss": 2.4976, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.41574495770982434, |
| "grad_norm": 0.20382268726825714, |
| "learning_rate": 0.0001, |
| "loss": 2.0448, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.41639557579700714, |
| "grad_norm": 0.23484939336776733, |
| "learning_rate": 0.0001, |
| "loss": 1.9514, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.41704619388418995, |
| "grad_norm": 0.23023058474063873, |
| "learning_rate": 0.0001, |
| "loss": 2.0768, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.4176968119713728, |
| "grad_norm": 0.22951190173625946, |
| "learning_rate": 0.0001, |
| "loss": 2.0764, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.4183474300585556, |
| "grad_norm": 0.18971513211727142, |
| "learning_rate": 0.0001, |
| "loss": 1.9693, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.41899804814573843, |
| "grad_norm": 0.24955709278583527, |
| "learning_rate": 0.0001, |
| "loss": 2.4898, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.4196486662329213, |
| "grad_norm": 0.3344306945800781, |
| "learning_rate": 0.0001, |
| "loss": 2.4779, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.4202992843201041, |
| "grad_norm": 0.21661825478076935, |
| "learning_rate": 0.0001, |
| "loss": 2.0472, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.4209499024072869, |
| "grad_norm": 0.1972419023513794, |
| "learning_rate": 0.0001, |
| "loss": 2.1712, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.4216005204944697, |
| "grad_norm": 0.21619470417499542, |
| "learning_rate": 0.0001, |
| "loss": 2.0739, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.4222511385816526, |
| "grad_norm": 0.2329091727733612, |
| "learning_rate": 0.0001, |
| "loss": 2.1362, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.4229017566688354, |
| "grad_norm": 0.22971969842910767, |
| "learning_rate": 0.0001, |
| "loss": 1.9898, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.4235523747560182, |
| "grad_norm": 0.20185063779354095, |
| "learning_rate": 0.0001, |
| "loss": 2.1008, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.42420299284320107, |
| "grad_norm": 0.2658546566963196, |
| "learning_rate": 0.0001, |
| "loss": 2.5734, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.4248536109303839, |
| "grad_norm": 0.23109374940395355, |
| "learning_rate": 0.0001, |
| "loss": 2.2569, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.4255042290175667, |
| "grad_norm": 0.25115352869033813, |
| "learning_rate": 0.0001, |
| "loss": 2.5967, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.4261548471047495, |
| "grad_norm": 0.20470669865608215, |
| "learning_rate": 0.0001, |
| "loss": 2.0302, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.42680546519193235, |
| "grad_norm": 0.2151513546705246, |
| "learning_rate": 0.0001, |
| "loss": 2.5183, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.42745608327911516, |
| "grad_norm": 0.2571411728858948, |
| "learning_rate": 0.0001, |
| "loss": 2.255, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.42810670136629797, |
| "grad_norm": 0.2414022833108902, |
| "learning_rate": 0.0001, |
| "loss": 2.4076, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.42875731945348083, |
| "grad_norm": 0.21041014790534973, |
| "learning_rate": 0.0001, |
| "loss": 2.0091, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.42940793754066364, |
| "grad_norm": 0.21241822838783264, |
| "learning_rate": 0.0001, |
| "loss": 2.355, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.43005855562784645, |
| "grad_norm": 0.21031403541564941, |
| "learning_rate": 0.0001, |
| "loss": 1.9887, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.43070917371502926, |
| "grad_norm": 0.19765952229499817, |
| "learning_rate": 0.0001, |
| "loss": 2.1555, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.4313597918022121, |
| "grad_norm": 0.24740834534168243, |
| "learning_rate": 0.0001, |
| "loss": 2.2349, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.43201040988939493, |
| "grad_norm": 0.22086234390735626, |
| "learning_rate": 0.0001, |
| "loss": 2.0948, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.43266102797657774, |
| "grad_norm": 0.21949239075183868, |
| "learning_rate": 0.0001, |
| "loss": 2.3905, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.43331164606376055, |
| "grad_norm": 0.20536834001541138, |
| "learning_rate": 0.0001, |
| "loss": 2.0547, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.4339622641509434, |
| "grad_norm": 0.2570655941963196, |
| "learning_rate": 0.0001, |
| "loss": 2.0261, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.4346128822381262, |
| "grad_norm": 0.3293687701225281, |
| "learning_rate": 0.0001, |
| "loss": 2.344, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.435263500325309, |
| "grad_norm": 0.22947120666503906, |
| "learning_rate": 0.0001, |
| "loss": 2.232, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.4359141184124919, |
| "grad_norm": 0.2425599992275238, |
| "learning_rate": 0.0001, |
| "loss": 2.309, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.4365647364996747, |
| "grad_norm": 0.2506352663040161, |
| "learning_rate": 0.0001, |
| "loss": 2.1249, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.4372153545868575, |
| "grad_norm": 0.19457192718982697, |
| "learning_rate": 0.0001, |
| "loss": 1.9461, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.4378659726740403, |
| "grad_norm": 0.3749271035194397, |
| "learning_rate": 0.0001, |
| "loss": 2.8532, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.4385165907612232, |
| "grad_norm": 0.25384366512298584, |
| "learning_rate": 0.0001, |
| "loss": 2.6495, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.439167208848406, |
| "grad_norm": 0.21413469314575195, |
| "learning_rate": 0.0001, |
| "loss": 2.084, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.4398178269355888, |
| "grad_norm": 0.228125661611557, |
| "learning_rate": 0.0001, |
| "loss": 2.2175, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.44046844502277166, |
| "grad_norm": 0.1948491632938385, |
| "learning_rate": 0.0001, |
| "loss": 1.9702, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.44111906310995447, |
| "grad_norm": 0.307992547750473, |
| "learning_rate": 0.0001, |
| "loss": 2.5884, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.4417696811971373, |
| "grad_norm": 0.23681728541851044, |
| "learning_rate": 0.0001, |
| "loss": 2.2104, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.4424202992843201, |
| "grad_norm": 0.23185166716575623, |
| "learning_rate": 0.0001, |
| "loss": 2.0823, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.44307091737150295, |
| "grad_norm": 0.2772667109966278, |
| "learning_rate": 0.0001, |
| "loss": 2.3729, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.44372153545868576, |
| "grad_norm": 0.18908965587615967, |
| "learning_rate": 0.0001, |
| "loss": 2.0585, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.44437215354586856, |
| "grad_norm": 0.2063988745212555, |
| "learning_rate": 0.0001, |
| "loss": 1.9474, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.4450227716330514, |
| "grad_norm": 0.19444917142391205, |
| "learning_rate": 0.0001, |
| "loss": 1.9269, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.44567338972023424, |
| "grad_norm": 0.2866727113723755, |
| "learning_rate": 0.0001, |
| "loss": 2.5145, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.44632400780741704, |
| "grad_norm": 0.24801641702651978, |
| "learning_rate": 0.0001, |
| "loss": 2.2954, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.44697462589459985, |
| "grad_norm": 0.2115658074617386, |
| "learning_rate": 0.0001, |
| "loss": 2.1956, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.4476252439817827, |
| "grad_norm": 0.3155558109283447, |
| "learning_rate": 0.0001, |
| "loss": 2.7396, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.4482758620689655, |
| "grad_norm": 0.22418133914470673, |
| "learning_rate": 0.0001, |
| "loss": 2.1066, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.44892648015614833, |
| "grad_norm": 0.2707614600658417, |
| "learning_rate": 0.0001, |
| "loss": 2.3353, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.44957709824333114, |
| "grad_norm": 0.22262880206108093, |
| "learning_rate": 0.0001, |
| "loss": 2.2143, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.450227716330514, |
| "grad_norm": 0.25256767868995667, |
| "learning_rate": 0.0001, |
| "loss": 2.2786, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.4508783344176968, |
| "grad_norm": 0.20360921323299408, |
| "learning_rate": 0.0001, |
| "loss": 2.0059, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.4515289525048796, |
| "grad_norm": 0.20573420822620392, |
| "learning_rate": 0.0001, |
| "loss": 2.0884, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.4521795705920625, |
| "grad_norm": 0.31812623143196106, |
| "learning_rate": 0.0001, |
| "loss": 2.5905, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.4528301886792453, |
| "grad_norm": 0.24690969288349152, |
| "learning_rate": 0.0001, |
| "loss": 2.5157, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.4534808067664281, |
| "grad_norm": 0.256793737411499, |
| "learning_rate": 0.0001, |
| "loss": 2.1548, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.4541314248536109, |
| "grad_norm": 0.2659960985183716, |
| "learning_rate": 0.0001, |
| "loss": 2.2977, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.4547820429407938, |
| "grad_norm": 0.23824195563793182, |
| "learning_rate": 0.0001, |
| "loss": 2.5946, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.4554326610279766, |
| "grad_norm": 0.2580608129501343, |
| "learning_rate": 0.0001, |
| "loss": 2.2608, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4560832791151594, |
| "grad_norm": 0.270622193813324, |
| "learning_rate": 0.0001, |
| "loss": 2.5848, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.4567338972023422, |
| "grad_norm": 0.2170489877462387, |
| "learning_rate": 0.0001, |
| "loss": 2.4315, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.45738451528952506, |
| "grad_norm": 0.20716050267219543, |
| "learning_rate": 0.0001, |
| "loss": 2.1592, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.45803513337670787, |
| "grad_norm": 0.24847671389579773, |
| "learning_rate": 0.0001, |
| "loss": 2.3202, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.4586857514638907, |
| "grad_norm": 0.24049146473407745, |
| "learning_rate": 0.0001, |
| "loss": 2.1968, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.45933636955107354, |
| "grad_norm": 0.2079533487558365, |
| "learning_rate": 0.0001, |
| "loss": 2.2966, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.45998698763825635, |
| "grad_norm": 0.18255428969860077, |
| "learning_rate": 0.0001, |
| "loss": 1.9931, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.46063760572543916, |
| "grad_norm": 0.28015655279159546, |
| "learning_rate": 0.0001, |
| "loss": 2.2605, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.46128822381262197, |
| "grad_norm": 0.27453094720840454, |
| "learning_rate": 0.0001, |
| "loss": 2.2835, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.46193884189980483, |
| "grad_norm": 0.2751506268978119, |
| "learning_rate": 0.0001, |
| "loss": 2.665, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.46258945998698764, |
| "grad_norm": 0.2759210169315338, |
| "learning_rate": 0.0001, |
| "loss": 2.3593, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.46324007807417045, |
| "grad_norm": 0.2902829051017761, |
| "learning_rate": 0.0001, |
| "loss": 2.7421, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.4638906961613533, |
| "grad_norm": 0.24083854258060455, |
| "learning_rate": 0.0001, |
| "loss": 2.4644, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.4645413142485361, |
| "grad_norm": 0.23614934086799622, |
| "learning_rate": 0.0001, |
| "loss": 2.2939, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.4651919323357189, |
| "grad_norm": 0.1972537487745285, |
| "learning_rate": 0.0001, |
| "loss": 1.9391, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.46584255042290174, |
| "grad_norm": 0.2227838933467865, |
| "learning_rate": 0.0001, |
| "loss": 1.9396, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.4664931685100846, |
| "grad_norm": 0.3672918379306793, |
| "learning_rate": 0.0001, |
| "loss": 2.7508, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.4671437865972674, |
| "grad_norm": 0.2712246775627136, |
| "learning_rate": 0.0001, |
| "loss": 2.2838, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.4677944046844502, |
| "grad_norm": 0.2337927669286728, |
| "learning_rate": 0.0001, |
| "loss": 1.9807, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.468445022771633, |
| "grad_norm": 0.2051180601119995, |
| "learning_rate": 0.0001, |
| "loss": 2.0311, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4690956408588159, |
| "grad_norm": 0.1965889185667038, |
| "learning_rate": 0.0001, |
| "loss": 2.1114, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.4697462589459987, |
| "grad_norm": 0.2106337547302246, |
| "learning_rate": 0.0001, |
| "loss": 2.0792, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.4703968770331815, |
| "grad_norm": 0.19918356835842133, |
| "learning_rate": 0.0001, |
| "loss": 2.1323, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.47104749512036437, |
| "grad_norm": 0.20124401152133942, |
| "learning_rate": 0.0001, |
| "loss": 2.0008, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.4716981132075472, |
| "grad_norm": 0.2172473967075348, |
| "learning_rate": 0.0001, |
| "loss": 2.3891, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.47234873129473, |
| "grad_norm": 0.2524811029434204, |
| "learning_rate": 0.0001, |
| "loss": 2.3343, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.4729993493819128, |
| "grad_norm": 0.22882957756519318, |
| "learning_rate": 0.0001, |
| "loss": 2.6723, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.47364996746909566, |
| "grad_norm": 0.2434161901473999, |
| "learning_rate": 0.0001, |
| "loss": 1.9549, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.47430058555627846, |
| "grad_norm": 0.19140364229679108, |
| "learning_rate": 0.0001, |
| "loss": 2.0468, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.4749512036434613, |
| "grad_norm": 0.22166937589645386, |
| "learning_rate": 0.0001, |
| "loss": 2.3432, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.47560182173064414, |
| "grad_norm": 0.2005748748779297, |
| "learning_rate": 0.0001, |
| "loss": 2.0616, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.47625243981782694, |
| "grad_norm": 0.3115980923175812, |
| "learning_rate": 0.0001, |
| "loss": 2.6153, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.47690305790500975, |
| "grad_norm": 0.27135169506073, |
| "learning_rate": 0.0001, |
| "loss": 2.3225, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.47755367599219256, |
| "grad_norm": 0.20748727023601532, |
| "learning_rate": 0.0001, |
| "loss": 1.834, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.4782042940793754, |
| "grad_norm": 0.4031495153903961, |
| "learning_rate": 0.0001, |
| "loss": 2.8177, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.47885491216655823, |
| "grad_norm": 0.2978368401527405, |
| "learning_rate": 0.0001, |
| "loss": 2.6178, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.47950553025374104, |
| "grad_norm": 0.3466270864009857, |
| "learning_rate": 0.0001, |
| "loss": 2.6031, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.4801561483409239, |
| "grad_norm": 0.20074127614498138, |
| "learning_rate": 0.0001, |
| "loss": 2.247, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.4808067664281067, |
| "grad_norm": 0.2393479198217392, |
| "learning_rate": 0.0001, |
| "loss": 2.1265, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.4814573845152895, |
| "grad_norm": 0.27758634090423584, |
| "learning_rate": 0.0001, |
| "loss": 2.5025, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.48210800260247233, |
| "grad_norm": 0.20123820006847382, |
| "learning_rate": 0.0001, |
| "loss": 2.0083, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.4827586206896552, |
| "grad_norm": 0.19012506306171417, |
| "learning_rate": 0.0001, |
| "loss": 2.0212, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.483409238776838, |
| "grad_norm": 0.19451047480106354, |
| "learning_rate": 0.0001, |
| "loss": 2.0295, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.4840598568640208, |
| "grad_norm": 0.3339052200317383, |
| "learning_rate": 0.0001, |
| "loss": 2.4813, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.4847104749512036, |
| "grad_norm": 0.2646152973175049, |
| "learning_rate": 0.0001, |
| "loss": 2.4302, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.4853610930383865, |
| "grad_norm": 0.23590324819087982, |
| "learning_rate": 0.0001, |
| "loss": 2.1723, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.4860117111255693, |
| "grad_norm": 0.28924039006233215, |
| "learning_rate": 0.0001, |
| "loss": 2.8005, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.4866623292127521, |
| "grad_norm": 0.21145464479923248, |
| "learning_rate": 0.0001, |
| "loss": 2.3501, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.48731294729993496, |
| "grad_norm": 0.22815656661987305, |
| "learning_rate": 0.0001, |
| "loss": 2.1997, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.48796356538711777, |
| "grad_norm": 0.24325215816497803, |
| "learning_rate": 0.0001, |
| "loss": 2.039, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.4886141834743006, |
| "grad_norm": 0.3235335052013397, |
| "learning_rate": 0.0001, |
| "loss": 2.4533, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.4892648015614834, |
| "grad_norm": 0.25513559579849243, |
| "learning_rate": 0.0001, |
| "loss": 2.3779, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.48991541964866625, |
| "grad_norm": 0.2905427813529968, |
| "learning_rate": 0.0001, |
| "loss": 1.9843, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.49056603773584906, |
| "grad_norm": 0.23760183155536652, |
| "learning_rate": 0.0001, |
| "loss": 2.1825, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.49121665582303187, |
| "grad_norm": 0.2170071303844452, |
| "learning_rate": 0.0001, |
| "loss": 1.9877, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.49186727391021473, |
| "grad_norm": 0.2555190920829773, |
| "learning_rate": 0.0001, |
| "loss": 2.457, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.49251789199739754, |
| "grad_norm": 0.2571033835411072, |
| "learning_rate": 0.0001, |
| "loss": 2.1152, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.49316851008458035, |
| "grad_norm": 0.23969238996505737, |
| "learning_rate": 0.0001, |
| "loss": 2.3439, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.49381912817176316, |
| "grad_norm": 0.1900262087583542, |
| "learning_rate": 0.0001, |
| "loss": 1.8999, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.494469746258946, |
| "grad_norm": 0.19621430337429047, |
| "learning_rate": 0.0001, |
| "loss": 2.0658, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.4951203643461288, |
| "grad_norm": 0.21956481039524078, |
| "learning_rate": 0.0001, |
| "loss": 2.5427, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.49577098243331164, |
| "grad_norm": 0.22567258775234222, |
| "learning_rate": 0.0001, |
| "loss": 2.2777, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.49642160052049444, |
| "grad_norm": 0.20233570039272308, |
| "learning_rate": 0.0001, |
| "loss": 2.0342, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.4970722186076773, |
| "grad_norm": 0.23662947118282318, |
| "learning_rate": 0.0001, |
| "loss": 2.3668, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.4977228366948601, |
| "grad_norm": 0.2625278830528259, |
| "learning_rate": 0.0001, |
| "loss": 2.6536, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.4983734547820429, |
| "grad_norm": 0.23235228657722473, |
| "learning_rate": 0.0001, |
| "loss": 2.1891, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.4990240728692258, |
| "grad_norm": 0.19439217448234558, |
| "learning_rate": 0.0001, |
| "loss": 1.9647, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.4996746909564086, |
| "grad_norm": 0.19810114800930023, |
| "learning_rate": 0.0001, |
| "loss": 1.9965, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.5003253090435914, |
| "grad_norm": 0.2525380253791809, |
| "learning_rate": 0.0001, |
| "loss": 2.2444, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.5009759271307742, |
| "grad_norm": 0.2409314513206482, |
| "learning_rate": 0.0001, |
| "loss": 2.1717, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.501626545217957, |
| "grad_norm": 0.25244686007499695, |
| "learning_rate": 0.0001, |
| "loss": 2.0126, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.5022771633051398, |
| "grad_norm": 0.19767141342163086, |
| "learning_rate": 0.0001, |
| "loss": 2.1384, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.5029277813923227, |
| "grad_norm": 0.39446812868118286, |
| "learning_rate": 0.0001, |
| "loss": 2.8039, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.5035783994795056, |
| "grad_norm": 0.2643390893936157, |
| "learning_rate": 0.0001, |
| "loss": 2.1524, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.5042290175666884, |
| "grad_norm": 0.27606508135795593, |
| "learning_rate": 0.0001, |
| "loss": 2.1802, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.5048796356538712, |
| "grad_norm": 0.364106148481369, |
| "learning_rate": 0.0001, |
| "loss": 2.9694, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.505530253741054, |
| "grad_norm": 0.23091645538806915, |
| "learning_rate": 0.0001, |
| "loss": 2.5471, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.5061808718282368, |
| "grad_norm": 0.19318193197250366, |
| "learning_rate": 0.0001, |
| "loss": 2.2082, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.5068314899154196, |
| "grad_norm": 0.28997862339019775, |
| "learning_rate": 0.0001, |
| "loss": 2.4399, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.5074821080026025, |
| "grad_norm": 0.22487197816371918, |
| "learning_rate": 0.0001, |
| "loss": 2.1946, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.5081327260897853, |
| "grad_norm": 0.24430596828460693, |
| "learning_rate": 0.0001, |
| "loss": 2.4456, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.5087833441769681, |
| "grad_norm": 0.21677151322364807, |
| "learning_rate": 0.0001, |
| "loss": 2.2082, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.5094339622641509, |
| "grad_norm": 0.47995632886886597, |
| "learning_rate": 0.0001, |
| "loss": 3.1358, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.5100845803513337, |
| "grad_norm": 0.19044414162635803, |
| "learning_rate": 0.0001, |
| "loss": 1.8924, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.5107351984385166, |
| "grad_norm": 0.19143608212471008, |
| "learning_rate": 0.0001, |
| "loss": 2.0459, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.5113858165256994, |
| "grad_norm": 0.22588413953781128, |
| "learning_rate": 0.0001, |
| "loss": 2.1369, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.5120364346128823, |
| "grad_norm": 0.2786167860031128, |
| "learning_rate": 0.0001, |
| "loss": 2.2029, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.5126870527000651, |
| "grad_norm": 0.24471627175807953, |
| "learning_rate": 0.0001, |
| "loss": 2.1248, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.5133376707872479, |
| "grad_norm": 0.17795225977897644, |
| "learning_rate": 0.0001, |
| "loss": 1.7926, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.5139882888744307, |
| "grad_norm": 0.2173709124326706, |
| "learning_rate": 0.0001, |
| "loss": 2.0538, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5146389069616135, |
| "grad_norm": 0.2027692049741745, |
| "learning_rate": 0.0001, |
| "loss": 1.8568, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.5152895250487963, |
| "grad_norm": 0.2013595849275589, |
| "learning_rate": 0.0001, |
| "loss": 2.0501, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.5159401431359791, |
| "grad_norm": 0.21996662020683289, |
| "learning_rate": 0.0001, |
| "loss": 2.0374, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.516590761223162, |
| "grad_norm": 0.21435722708702087, |
| "learning_rate": 0.0001, |
| "loss": 2.1907, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.5172413793103449, |
| "grad_norm": 0.21512284874916077, |
| "learning_rate": 0.0001, |
| "loss": 2.315, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.5178919973975277, |
| "grad_norm": 0.19432400166988373, |
| "learning_rate": 0.0001, |
| "loss": 2.103, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.5185426154847105, |
| "grad_norm": 0.23112992942333221, |
| "learning_rate": 0.0001, |
| "loss": 2.328, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.5191932335718933, |
| "grad_norm": 0.19719737768173218, |
| "learning_rate": 0.0001, |
| "loss": 1.9569, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.5198438516590761, |
| "grad_norm": 0.2115892618894577, |
| "learning_rate": 0.0001, |
| "loss": 2.2533, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.5204944697462589, |
| "grad_norm": 0.24321842193603516, |
| "learning_rate": 0.0001, |
| "loss": 2.6597, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5211450878334418, |
| "grad_norm": 0.18219350278377533, |
| "learning_rate": 0.0001, |
| "loss": 1.8709, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.5217957059206246, |
| "grad_norm": 0.18715021014213562, |
| "learning_rate": 0.0001, |
| "loss": 2.0021, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.5224463240078074, |
| "grad_norm": 0.25940024852752686, |
| "learning_rate": 0.0001, |
| "loss": 2.3742, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.5230969420949902, |
| "grad_norm": 0.18714728951454163, |
| "learning_rate": 0.0001, |
| "loss": 2.211, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.523747560182173, |
| "grad_norm": 0.20145951211452484, |
| "learning_rate": 0.0001, |
| "loss": 2.0047, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.5243981782693559, |
| "grad_norm": 0.18992845714092255, |
| "learning_rate": 0.0001, |
| "loss": 1.8559, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.5250487963565387, |
| "grad_norm": 0.2682324945926666, |
| "learning_rate": 0.0001, |
| "loss": 2.4791, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.5256994144437215, |
| "grad_norm": 0.33034664392471313, |
| "learning_rate": 0.0001, |
| "loss": 2.3089, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.5263500325309044, |
| "grad_norm": 0.18838956952095032, |
| "learning_rate": 0.0001, |
| "loss": 1.9462, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.5270006506180872, |
| "grad_norm": 0.42872169613838196, |
| "learning_rate": 0.0001, |
| "loss": 2.6874, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.52765126870527, |
| "grad_norm": 0.2108643501996994, |
| "learning_rate": 0.0001, |
| "loss": 2.3627, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.5283018867924528, |
| "grad_norm": 0.21745599806308746, |
| "learning_rate": 0.0001, |
| "loss": 2.1204, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.5289525048796356, |
| "grad_norm": 0.2577585279941559, |
| "learning_rate": 0.0001, |
| "loss": 1.9746, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.5296031229668184, |
| "grad_norm": 0.372471421957016, |
| "learning_rate": 0.0001, |
| "loss": 2.688, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.5302537410540012, |
| "grad_norm": 0.2425181120634079, |
| "learning_rate": 0.0001, |
| "loss": 2.1377, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.5309043591411842, |
| "grad_norm": 0.2638307511806488, |
| "learning_rate": 0.0001, |
| "loss": 2.1088, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.531554977228367, |
| "grad_norm": 0.2356933355331421, |
| "learning_rate": 0.0001, |
| "loss": 2.2291, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.5322055953155498, |
| "grad_norm": 0.23714864253997803, |
| "learning_rate": 0.0001, |
| "loss": 2.0929, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.5328562134027326, |
| "grad_norm": 0.19541950523853302, |
| "learning_rate": 0.0001, |
| "loss": 2.0883, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.5335068314899154, |
| "grad_norm": 0.3091617822647095, |
| "learning_rate": 0.0001, |
| "loss": 3.0127, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5341574495770982, |
| "grad_norm": 0.2592740058898926, |
| "learning_rate": 0.0001, |
| "loss": 1.8307, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.534808067664281, |
| "grad_norm": 0.22505807876586914, |
| "learning_rate": 0.0001, |
| "loss": 2.462, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.5354586857514639, |
| "grad_norm": 0.22032824158668518, |
| "learning_rate": 0.0001, |
| "loss": 2.2718, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.5361093038386467, |
| "grad_norm": 0.2457459270954132, |
| "learning_rate": 0.0001, |
| "loss": 2.4213, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.5367599219258296, |
| "grad_norm": 0.24181683361530304, |
| "learning_rate": 0.0001, |
| "loss": 1.9347, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.5374105400130124, |
| "grad_norm": 0.29988738894462585, |
| "learning_rate": 0.0001, |
| "loss": 2.7697, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.5380611581001952, |
| "grad_norm": 0.24946388602256775, |
| "learning_rate": 0.0001, |
| "loss": 2.2117, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.538711776187378, |
| "grad_norm": 0.20339331030845642, |
| "learning_rate": 0.0001, |
| "loss": 1.9936, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.5393623942745608, |
| "grad_norm": 0.22250457108020782, |
| "learning_rate": 0.0001, |
| "loss": 2.0785, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.5400130123617437, |
| "grad_norm": 0.1869298666715622, |
| "learning_rate": 0.0001, |
| "loss": 2.0406, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5406636304489265, |
| "grad_norm": 0.1873755156993866, |
| "learning_rate": 0.0001, |
| "loss": 1.9126, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.5413142485361093, |
| "grad_norm": 0.3135535418987274, |
| "learning_rate": 0.0001, |
| "loss": 2.2881, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.5419648666232921, |
| "grad_norm": 0.20596185326576233, |
| "learning_rate": 0.0001, |
| "loss": 2.0682, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.5426154847104749, |
| "grad_norm": 0.25786712765693665, |
| "learning_rate": 0.0001, |
| "loss": 2.0591, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.5432661027976577, |
| "grad_norm": 0.2592066824436188, |
| "learning_rate": 0.0001, |
| "loss": 2.052, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.5439167208848406, |
| "grad_norm": 0.20738951861858368, |
| "learning_rate": 0.0001, |
| "loss": 1.9726, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.5445673389720235, |
| "grad_norm": 0.21384763717651367, |
| "learning_rate": 0.0001, |
| "loss": 2.1897, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.5452179570592063, |
| "grad_norm": 0.22050943970680237, |
| "learning_rate": 0.0001, |
| "loss": 2.3597, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.5458685751463891, |
| "grad_norm": 0.1996280699968338, |
| "learning_rate": 0.0001, |
| "loss": 2.0492, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.5465191932335719, |
| "grad_norm": 0.2430533468723297, |
| "learning_rate": 0.0001, |
| "loss": 2.2774, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5471698113207547, |
| "grad_norm": 0.22777177393436432, |
| "learning_rate": 0.0001, |
| "loss": 2.0779, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.5478204294079375, |
| "grad_norm": 0.22464539110660553, |
| "learning_rate": 0.0001, |
| "loss": 2.3316, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.5484710474951203, |
| "grad_norm": 0.17759400606155396, |
| "learning_rate": 0.0001, |
| "loss": 1.8407, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.5491216655823032, |
| "grad_norm": 0.22264355421066284, |
| "learning_rate": 0.0001, |
| "loss": 2.2869, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.549772283669486, |
| "grad_norm": 0.20819737017154694, |
| "learning_rate": 0.0001, |
| "loss": 2.1209, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.5504229017566689, |
| "grad_norm": 0.2194463461637497, |
| "learning_rate": 0.0001, |
| "loss": 2.1457, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.5510735198438517, |
| "grad_norm": 0.19314661622047424, |
| "learning_rate": 0.0001, |
| "loss": 2.1063, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.5517241379310345, |
| "grad_norm": 0.186354860663414, |
| "learning_rate": 0.0001, |
| "loss": 2.0833, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.5523747560182173, |
| "grad_norm": 0.1862732619047165, |
| "learning_rate": 0.0001, |
| "loss": 1.9441, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.5530253741054001, |
| "grad_norm": 0.24664181470870972, |
| "learning_rate": 0.0001, |
| "loss": 2.3277, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5536759921925829, |
| "grad_norm": 0.20182165503501892, |
| "learning_rate": 0.0001, |
| "loss": 2.1902, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.5543266102797658, |
| "grad_norm": 0.2108999788761139, |
| "learning_rate": 0.0001, |
| "loss": 2.0826, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.5549772283669486, |
| "grad_norm": 0.25388890504837036, |
| "learning_rate": 0.0001, |
| "loss": 2.5149, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.5556278464541314, |
| "grad_norm": 0.2074718177318573, |
| "learning_rate": 0.0001, |
| "loss": 1.9135, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.5562784645413142, |
| "grad_norm": 0.1992723047733307, |
| "learning_rate": 0.0001, |
| "loss": 2.186, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.556929082628497, |
| "grad_norm": 0.18721085786819458, |
| "learning_rate": 0.0001, |
| "loss": 1.9453, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.5575797007156799, |
| "grad_norm": 0.21606992185115814, |
| "learning_rate": 0.0001, |
| "loss": 2.1703, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.5582303188028627, |
| "grad_norm": 0.2854723334312439, |
| "learning_rate": 0.0001, |
| "loss": 2.9538, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.5588809368900456, |
| "grad_norm": 0.21503040194511414, |
| "learning_rate": 0.0001, |
| "loss": 2.0194, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.5595315549772284, |
| "grad_norm": 0.2690679430961609, |
| "learning_rate": 0.0001, |
| "loss": 2.1562, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5601821730644112, |
| "grad_norm": 0.2811613976955414, |
| "learning_rate": 0.0001, |
| "loss": 2.2475, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.560832791151594, |
| "grad_norm": 0.2551681697368622, |
| "learning_rate": 0.0001, |
| "loss": 2.5585, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.5614834092387768, |
| "grad_norm": 0.21423856914043427, |
| "learning_rate": 0.0001, |
| "loss": 2.1194, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.5621340273259596, |
| "grad_norm": 0.22121264040470123, |
| "learning_rate": 0.0001, |
| "loss": 1.9257, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.5627846454131424, |
| "grad_norm": 0.38684332370758057, |
| "learning_rate": 0.0001, |
| "loss": 2.5203, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.5634352635003254, |
| "grad_norm": 0.20299634337425232, |
| "learning_rate": 0.0001, |
| "loss": 2.0868, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.5640858815875082, |
| "grad_norm": 0.33485493063926697, |
| "learning_rate": 0.0001, |
| "loss": 2.457, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.564736499674691, |
| "grad_norm": 0.23778866231441498, |
| "learning_rate": 0.0001, |
| "loss": 1.9863, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.5653871177618738, |
| "grad_norm": 0.18562458455562592, |
| "learning_rate": 0.0001, |
| "loss": 1.915, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.5660377358490566, |
| "grad_norm": 0.3780176341533661, |
| "learning_rate": 0.0001, |
| "loss": 2.5518, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.5666883539362394, |
| "grad_norm": 0.1924014538526535, |
| "learning_rate": 0.0001, |
| "loss": 2.0665, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.5673389720234222, |
| "grad_norm": 0.19788160920143127, |
| "learning_rate": 0.0001, |
| "loss": 1.9408, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.5679895901106051, |
| "grad_norm": 0.2435147911310196, |
| "learning_rate": 0.0001, |
| "loss": 2.3716, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.5686402081977879, |
| "grad_norm": 0.2023211270570755, |
| "learning_rate": 0.0001, |
| "loss": 2.2786, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.5692908262849707, |
| "grad_norm": 0.29936715960502625, |
| "learning_rate": 0.0001, |
| "loss": 2.6689, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.5699414443721535, |
| "grad_norm": 0.18846483528614044, |
| "learning_rate": 0.0001, |
| "loss": 1.9436, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.5705920624593364, |
| "grad_norm": 0.44592785835266113, |
| "learning_rate": 0.0001, |
| "loss": 2.8648, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.5712426805465192, |
| "grad_norm": 0.221640944480896, |
| "learning_rate": 0.0001, |
| "loss": 2.1613, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.571893298633702, |
| "grad_norm": 0.22345726191997528, |
| "learning_rate": 0.0001, |
| "loss": 2.076, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.5725439167208849, |
| "grad_norm": 0.20094214379787445, |
| "learning_rate": 0.0001, |
| "loss": 2.0474, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5731945348080677, |
| "grad_norm": 0.1997043937444687, |
| "learning_rate": 0.0001, |
| "loss": 1.9812, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.5738451528952505, |
| "grad_norm": 0.3758605420589447, |
| "learning_rate": 0.0001, |
| "loss": 2.8357, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.5744957709824333, |
| "grad_norm": 0.2940578758716583, |
| "learning_rate": 0.0001, |
| "loss": 2.4955, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.5751463890696161, |
| "grad_norm": 0.2434762865304947, |
| "learning_rate": 0.0001, |
| "loss": 2.0011, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.5757970071567989, |
| "grad_norm": 0.24335308372974396, |
| "learning_rate": 0.0001, |
| "loss": 2.5458, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.5764476252439817, |
| "grad_norm": 0.2063351422548294, |
| "learning_rate": 0.0001, |
| "loss": 1.9801, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.5770982433311646, |
| "grad_norm": 0.35102301836013794, |
| "learning_rate": 0.0001, |
| "loss": 2.5647, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.5777488614183475, |
| "grad_norm": 0.22332875430583954, |
| "learning_rate": 0.0001, |
| "loss": 2.0542, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.5783994795055303, |
| "grad_norm": 0.2073124796152115, |
| "learning_rate": 0.0001, |
| "loss": 1.9348, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.5790500975927131, |
| "grad_norm": 0.21079733967781067, |
| "learning_rate": 0.0001, |
| "loss": 1.9829, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5797007156798959, |
| "grad_norm": 0.2842913866043091, |
| "learning_rate": 0.0001, |
| "loss": 2.7215, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.5803513337670787, |
| "grad_norm": 0.2807595133781433, |
| "learning_rate": 0.0001, |
| "loss": 2.1827, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.5810019518542615, |
| "grad_norm": 0.24955599009990692, |
| "learning_rate": 0.0001, |
| "loss": 2.6246, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.5816525699414443, |
| "grad_norm": 0.23281241953372955, |
| "learning_rate": 0.0001, |
| "loss": 2.3944, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.5823031880286272, |
| "grad_norm": 0.2617682218551636, |
| "learning_rate": 0.0001, |
| "loss": 2.6147, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.58295380611581, |
| "grad_norm": 0.1915360391139984, |
| "learning_rate": 0.0001, |
| "loss": 2.0095, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.5836044242029929, |
| "grad_norm": 0.20270249247550964, |
| "learning_rate": 0.0001, |
| "loss": 1.8983, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.5842550422901757, |
| "grad_norm": 0.21804624795913696, |
| "learning_rate": 0.0001, |
| "loss": 2.0425, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.5849056603773585, |
| "grad_norm": 0.25326576828956604, |
| "learning_rate": 0.0001, |
| "loss": 2.4875, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.5855562784645413, |
| "grad_norm": 0.21714434027671814, |
| "learning_rate": 0.0001, |
| "loss": 2.269, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5862068965517241, |
| "grad_norm": 0.22771766781806946, |
| "learning_rate": 0.0001, |
| "loss": 2.3039, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.586857514638907, |
| "grad_norm": 0.3638748824596405, |
| "learning_rate": 0.0001, |
| "loss": 2.7448, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.5875081327260898, |
| "grad_norm": 0.20194686949253082, |
| "learning_rate": 0.0001, |
| "loss": 2.0141, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.5881587508132726, |
| "grad_norm": 0.187494158744812, |
| "learning_rate": 0.0001, |
| "loss": 2.1188, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.5888093689004554, |
| "grad_norm": 0.23371635377407074, |
| "learning_rate": 0.0001, |
| "loss": 2.6014, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.5894599869876382, |
| "grad_norm": 0.2642146050930023, |
| "learning_rate": 0.0001, |
| "loss": 2.2053, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.590110605074821, |
| "grad_norm": 0.20045514404773712, |
| "learning_rate": 0.0001, |
| "loss": 2.1828, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.5907612231620039, |
| "grad_norm": 0.22904321551322937, |
| "learning_rate": 0.0001, |
| "loss": 2.3128, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.5914118412491868, |
| "grad_norm": 0.36857542395591736, |
| "learning_rate": 0.0001, |
| "loss": 3.3891, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.5920624593363696, |
| "grad_norm": 0.3417764902114868, |
| "learning_rate": 0.0001, |
| "loss": 2.6737, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.5927130774235524, |
| "grad_norm": 0.46861669421195984, |
| "learning_rate": 0.0001, |
| "loss": 2.5329, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.5933636955107352, |
| "grad_norm": 0.32909440994262695, |
| "learning_rate": 0.0001, |
| "loss": 2.4894, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.594014313597918, |
| "grad_norm": 0.2176060974597931, |
| "learning_rate": 0.0001, |
| "loss": 1.9696, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.5946649316851008, |
| "grad_norm": 0.27317941188812256, |
| "learning_rate": 0.0001, |
| "loss": 2.2179, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.5953155497722836, |
| "grad_norm": 0.267123281955719, |
| "learning_rate": 0.0001, |
| "loss": 2.5464, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.5959661678594665, |
| "grad_norm": 0.320402055978775, |
| "learning_rate": 0.0001, |
| "loss": 2.5021, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.5966167859466494, |
| "grad_norm": 0.20610998570919037, |
| "learning_rate": 0.0001, |
| "loss": 2.0586, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.5972674040338322, |
| "grad_norm": 0.2108345478773117, |
| "learning_rate": 0.0001, |
| "loss": 2.3278, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.597918022121015, |
| "grad_norm": 0.18368126451969147, |
| "learning_rate": 0.0001, |
| "loss": 2.1026, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.5985686402081978, |
| "grad_norm": 0.20730890333652496, |
| "learning_rate": 0.0001, |
| "loss": 2.1936, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5992192582953806, |
| "grad_norm": 0.2921161651611328, |
| "learning_rate": 0.0001, |
| "loss": 2.5618, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.5998698763825634, |
| "grad_norm": 0.23977220058441162, |
| "learning_rate": 0.0001, |
| "loss": 2.533, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.6005204944697463, |
| "grad_norm": 0.25839105248451233, |
| "learning_rate": 0.0001, |
| "loss": 2.7033, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.6011711125569291, |
| "grad_norm": 0.214335098862648, |
| "learning_rate": 0.0001, |
| "loss": 1.9153, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.6018217306441119, |
| "grad_norm": 0.19577006995677948, |
| "learning_rate": 0.0001, |
| "loss": 1.8612, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.6024723487312947, |
| "grad_norm": 0.22480078041553497, |
| "learning_rate": 0.0001, |
| "loss": 2.2383, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.6031229668184775, |
| "grad_norm": 0.2090427577495575, |
| "learning_rate": 0.0001, |
| "loss": 1.9532, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.6037735849056604, |
| "grad_norm": 0.21045666933059692, |
| "learning_rate": 0.0001, |
| "loss": 2.1285, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.6044242029928432, |
| "grad_norm": 0.2302238792181015, |
| "learning_rate": 0.0001, |
| "loss": 2.5368, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.605074821080026, |
| "grad_norm": 0.22230245172977448, |
| "learning_rate": 0.0001, |
| "loss": 2.0551, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6057254391672089, |
| "grad_norm": 0.2619292140007019, |
| "learning_rate": 0.0001, |
| "loss": 2.5149, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.6063760572543917, |
| "grad_norm": 0.20247308909893036, |
| "learning_rate": 0.0001, |
| "loss": 2.0032, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.6070266753415745, |
| "grad_norm": 0.19772449135780334, |
| "learning_rate": 0.0001, |
| "loss": 1.9627, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.6076772934287573, |
| "grad_norm": 0.1917680948972702, |
| "learning_rate": 0.0001, |
| "loss": 1.9659, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.6083279115159401, |
| "grad_norm": 0.3457018733024597, |
| "learning_rate": 0.0001, |
| "loss": 2.4537, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.6089785296031229, |
| "grad_norm": 0.2027028501033783, |
| "learning_rate": 0.0001, |
| "loss": 2.1681, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.6096291476903057, |
| "grad_norm": 0.24525637924671173, |
| "learning_rate": 0.0001, |
| "loss": 2.0816, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.6102797657774887, |
| "grad_norm": 0.2690584659576416, |
| "learning_rate": 0.0001, |
| "loss": 2.7011, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.6109303838646715, |
| "grad_norm": 0.20961976051330566, |
| "learning_rate": 0.0001, |
| "loss": 2.576, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.6115810019518543, |
| "grad_norm": 0.21827319264411926, |
| "learning_rate": 0.0001, |
| "loss": 2.2605, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6122316200390371, |
| "grad_norm": 0.20448362827301025, |
| "learning_rate": 0.0001, |
| "loss": 1.9963, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.6128822381262199, |
| "grad_norm": 0.2513864040374756, |
| "learning_rate": 0.0001, |
| "loss": 2.4111, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.6135328562134027, |
| "grad_norm": 0.28347763419151306, |
| "learning_rate": 0.0001, |
| "loss": 2.3459, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.6141834743005855, |
| "grad_norm": 0.20679716765880585, |
| "learning_rate": 0.0001, |
| "loss": 1.9423, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.6148340923877684, |
| "grad_norm": 0.20072445273399353, |
| "learning_rate": 0.0001, |
| "loss": 2.2, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.6154847104749512, |
| "grad_norm": 0.2190425843000412, |
| "learning_rate": 0.0001, |
| "loss": 2.358, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.616135328562134, |
| "grad_norm": 0.2672726511955261, |
| "learning_rate": 0.0001, |
| "loss": 2.5034, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.6167859466493169, |
| "grad_norm": 0.20329232513904572, |
| "learning_rate": 0.0001, |
| "loss": 2.2972, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.6174365647364997, |
| "grad_norm": 0.21593444049358368, |
| "learning_rate": 0.0001, |
| "loss": 2.8221, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.6180871828236825, |
| "grad_norm": 0.22062361240386963, |
| "learning_rate": 0.0001, |
| "loss": 2.2051, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6187378009108653, |
| "grad_norm": 0.20640413463115692, |
| "learning_rate": 0.0001, |
| "loss": 2.1973, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.6193884189980482, |
| "grad_norm": 0.18919388949871063, |
| "learning_rate": 0.0001, |
| "loss": 2.1166, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.620039037085231, |
| "grad_norm": 0.18566597998142242, |
| "learning_rate": 0.0001, |
| "loss": 1.9342, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.6206896551724138, |
| "grad_norm": 0.3724953234195709, |
| "learning_rate": 0.0001, |
| "loss": 3.0303, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.6213402732595966, |
| "grad_norm": 0.24559584259986877, |
| "learning_rate": 0.0001, |
| "loss": 2.387, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.6219908913467794, |
| "grad_norm": 0.20384235680103302, |
| "learning_rate": 0.0001, |
| "loss": 2.1224, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.6226415094339622, |
| "grad_norm": 0.3225831687450409, |
| "learning_rate": 0.0001, |
| "loss": 2.4856, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.623292127521145, |
| "grad_norm": 0.21676267683506012, |
| "learning_rate": 0.0001, |
| "loss": 2.3457, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.623942745608328, |
| "grad_norm": 0.21707187592983246, |
| "learning_rate": 0.0001, |
| "loss": 2.3985, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.6245933636955108, |
| "grad_norm": 0.311277836561203, |
| "learning_rate": 0.0001, |
| "loss": 2.3087, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.6252439817826936, |
| "grad_norm": 0.18904085457324982, |
| "learning_rate": 0.0001, |
| "loss": 1.9421, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.6258945998698764, |
| "grad_norm": 0.39046210050582886, |
| "learning_rate": 0.0001, |
| "loss": 2.7524, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.6265452179570592, |
| "grad_norm": 0.18455897271633148, |
| "learning_rate": 0.0001, |
| "loss": 1.7536, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.627195836044242, |
| "grad_norm": 0.1874053180217743, |
| "learning_rate": 0.0001, |
| "loss": 2.0853, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.6278464541314248, |
| "grad_norm": 0.24766068160533905, |
| "learning_rate": 0.0001, |
| "loss": 2.8099, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.6284970722186076, |
| "grad_norm": 0.20977729558944702, |
| "learning_rate": 0.0001, |
| "loss": 2.0339, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.6291476903057905, |
| "grad_norm": 0.2659202516078949, |
| "learning_rate": 0.0001, |
| "loss": 2.1282, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.6297983083929733, |
| "grad_norm": 0.23760046064853668, |
| "learning_rate": 0.0001, |
| "loss": 2.4225, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.6304489264801562, |
| "grad_norm": 0.1884511113166809, |
| "learning_rate": 0.0001, |
| "loss": 1.972, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.631099544567339, |
| "grad_norm": 0.2816404402256012, |
| "learning_rate": 0.0001, |
| "loss": 2.6831, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.6317501626545218, |
| "grad_norm": 0.1874386966228485, |
| "learning_rate": 0.0001, |
| "loss": 2.0042, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.6324007807417046, |
| "grad_norm": 0.21592558920383453, |
| "learning_rate": 0.0001, |
| "loss": 2.338, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.6330513988288874, |
| "grad_norm": 0.22190915048122406, |
| "learning_rate": 0.0001, |
| "loss": 2.23, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.6337020169160703, |
| "grad_norm": 0.23270365595817566, |
| "learning_rate": 0.0001, |
| "loss": 2.1849, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.6343526350032531, |
| "grad_norm": 0.20524165034294128, |
| "learning_rate": 0.0001, |
| "loss": 1.8509, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.6350032530904359, |
| "grad_norm": 0.27826493978500366, |
| "learning_rate": 0.0001, |
| "loss": 2.6736, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.6356538711776187, |
| "grad_norm": 0.19887575507164001, |
| "learning_rate": 0.0001, |
| "loss": 2.1369, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.6363044892648015, |
| "grad_norm": 0.3760605752468109, |
| "learning_rate": 0.0001, |
| "loss": 2.7617, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.6369551073519844, |
| "grad_norm": 0.2116486132144928, |
| "learning_rate": 0.0001, |
| "loss": 2.1353, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.6376057254391672, |
| "grad_norm": 0.20685400068759918, |
| "learning_rate": 0.0001, |
| "loss": 2.2221, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6382563435263501, |
| "grad_norm": 0.25631460547447205, |
| "learning_rate": 0.0001, |
| "loss": 2.2755, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.6389069616135329, |
| "grad_norm": 0.2831932604312897, |
| "learning_rate": 0.0001, |
| "loss": 2.2544, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.6395575797007157, |
| "grad_norm": 0.19301310181617737, |
| "learning_rate": 0.0001, |
| "loss": 2.1736, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.6402081977878985, |
| "grad_norm": 0.18511143326759338, |
| "learning_rate": 0.0001, |
| "loss": 1.8847, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.6408588158750813, |
| "grad_norm": 0.23753167688846588, |
| "learning_rate": 0.0001, |
| "loss": 2.131, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.6415094339622641, |
| "grad_norm": 0.24566152691841125, |
| "learning_rate": 0.0001, |
| "loss": 2.2071, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.6421600520494469, |
| "grad_norm": 0.21481812000274658, |
| "learning_rate": 0.0001, |
| "loss": 2.0292, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.6428106701366298, |
| "grad_norm": 0.3042278587818146, |
| "learning_rate": 0.0001, |
| "loss": 2.6444, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.6434612882238127, |
| "grad_norm": 0.30741778016090393, |
| "learning_rate": 0.0001, |
| "loss": 2.5146, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.6441119063109955, |
| "grad_norm": 0.40835896134376526, |
| "learning_rate": 0.0001, |
| "loss": 2.9053, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6447625243981783, |
| "grad_norm": 0.21121574938297272, |
| "learning_rate": 0.0001, |
| "loss": 2.4513, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.6454131424853611, |
| "grad_norm": 0.2634606659412384, |
| "learning_rate": 0.0001, |
| "loss": 2.3141, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.6460637605725439, |
| "grad_norm": 0.2463708072900772, |
| "learning_rate": 0.0001, |
| "loss": 2.4421, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.6467143786597267, |
| "grad_norm": 0.25485244393348694, |
| "learning_rate": 0.0001, |
| "loss": 2.3788, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.6473649967469096, |
| "grad_norm": 0.20773370563983917, |
| "learning_rate": 0.0001, |
| "loss": 1.9861, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.6480156148340924, |
| "grad_norm": 0.20728078484535217, |
| "learning_rate": 0.0001, |
| "loss": 2.3341, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.6486662329212752, |
| "grad_norm": 0.26925981044769287, |
| "learning_rate": 0.0001, |
| "loss": 2.9172, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.649316851008458, |
| "grad_norm": 0.21403877437114716, |
| "learning_rate": 0.0001, |
| "loss": 2.1318, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.6499674690956408, |
| "grad_norm": 0.2597064673900604, |
| "learning_rate": 0.0001, |
| "loss": 2.4316, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.6506180871828237, |
| "grad_norm": 0.26858747005462646, |
| "learning_rate": 0.0001, |
| "loss": 2.2716, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6512687052700065, |
| "grad_norm": 0.5603036880493164, |
| "learning_rate": 0.0001, |
| "loss": 3.1137, |
| "step": 1001 |
| }, |
| { |
| "epoch": 0.6519193233571894, |
| "grad_norm": 0.2423018366098404, |
| "learning_rate": 0.0001, |
| "loss": 2.2346, |
| "step": 1002 |
| }, |
| { |
| "epoch": 0.6525699414443722, |
| "grad_norm": 0.22914621233940125, |
| "learning_rate": 0.0001, |
| "loss": 2.2852, |
| "step": 1003 |
| }, |
| { |
| "epoch": 0.653220559531555, |
| "grad_norm": 0.22781658172607422, |
| "learning_rate": 0.0001, |
| "loss": 2.1961, |
| "step": 1004 |
| }, |
| { |
| "epoch": 0.6538711776187378, |
| "grad_norm": 0.2614092528820038, |
| "learning_rate": 0.0001, |
| "loss": 2.0631, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.6545217957059206, |
| "grad_norm": 0.23658867180347443, |
| "learning_rate": 0.0001, |
| "loss": 2.0379, |
| "step": 1006 |
| }, |
| { |
| "epoch": 0.6551724137931034, |
| "grad_norm": 0.20862211287021637, |
| "learning_rate": 0.0001, |
| "loss": 2.2786, |
| "step": 1007 |
| }, |
| { |
| "epoch": 0.6558230318802862, |
| "grad_norm": 0.2251960188150406, |
| "learning_rate": 0.0001, |
| "loss": 2.06, |
| "step": 1008 |
| }, |
| { |
| "epoch": 0.656473649967469, |
| "grad_norm": 0.2885074317455292, |
| "learning_rate": 0.0001, |
| "loss": 2.2583, |
| "step": 1009 |
| }, |
| { |
| "epoch": 0.657124268054652, |
| "grad_norm": 0.20309656858444214, |
| "learning_rate": 0.0001, |
| "loss": 2.1557, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6577748861418348, |
| "grad_norm": 0.20139531791210175, |
| "learning_rate": 0.0001, |
| "loss": 2.3419, |
| "step": 1011 |
| }, |
| { |
| "epoch": 0.6584255042290176, |
| "grad_norm": 0.2853332757949829, |
| "learning_rate": 0.0001, |
| "loss": 2.1415, |
| "step": 1012 |
| }, |
| { |
| "epoch": 0.6590761223162004, |
| "grad_norm": 0.2907620966434479, |
| "learning_rate": 0.0001, |
| "loss": 2.4452, |
| "step": 1013 |
| }, |
| { |
| "epoch": 0.6597267404033832, |
| "grad_norm": 0.18982461094856262, |
| "learning_rate": 0.0001, |
| "loss": 2.0215, |
| "step": 1014 |
| }, |
| { |
| "epoch": 0.660377358490566, |
| "grad_norm": 0.20890061557292938, |
| "learning_rate": 0.0001, |
| "loss": 2.0383, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.6610279765777488, |
| "grad_norm": 0.21294118463993073, |
| "learning_rate": 0.0001, |
| "loss": 1.7722, |
| "step": 1016 |
| }, |
| { |
| "epoch": 0.6616785946649317, |
| "grad_norm": 0.22494040429592133, |
| "learning_rate": 0.0001, |
| "loss": 2.034, |
| "step": 1017 |
| }, |
| { |
| "epoch": 0.6623292127521145, |
| "grad_norm": 0.25089555978775024, |
| "learning_rate": 0.0001, |
| "loss": 2.3322, |
| "step": 1018 |
| }, |
| { |
| "epoch": 0.6629798308392973, |
| "grad_norm": 0.18898023664951324, |
| "learning_rate": 0.0001, |
| "loss": 1.9914, |
| "step": 1019 |
| }, |
| { |
| "epoch": 0.6636304489264802, |
| "grad_norm": 0.221091166138649, |
| "learning_rate": 0.0001, |
| "loss": 2.1613, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.664281067013663, |
| "grad_norm": 0.22317297756671906, |
| "learning_rate": 0.0001, |
| "loss": 2.3438, |
| "step": 1021 |
| }, |
| { |
| "epoch": 0.6649316851008458, |
| "grad_norm": 0.18826670944690704, |
| "learning_rate": 0.0001, |
| "loss": 2.0218, |
| "step": 1022 |
| }, |
| { |
| "epoch": 0.6655823031880286, |
| "grad_norm": 0.22612391412258148, |
| "learning_rate": 0.0001, |
| "loss": 2.2931, |
| "step": 1023 |
| }, |
| { |
| "epoch": 0.6662329212752115, |
| "grad_norm": 0.3006114959716797, |
| "learning_rate": 0.0001, |
| "loss": 2.4949, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.6668835393623943, |
| "grad_norm": 0.1835569143295288, |
| "learning_rate": 0.0001, |
| "loss": 1.9396, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.6675341574495771, |
| "grad_norm": 0.19352416694164276, |
| "learning_rate": 0.0001, |
| "loss": 2.0038, |
| "step": 1026 |
| }, |
| { |
| "epoch": 0.6681847755367599, |
| "grad_norm": 0.2259102463722229, |
| "learning_rate": 0.0001, |
| "loss": 2.1818, |
| "step": 1027 |
| }, |
| { |
| "epoch": 0.6688353936239427, |
| "grad_norm": 0.20237034559249878, |
| "learning_rate": 0.0001, |
| "loss": 2.3196, |
| "step": 1028 |
| }, |
| { |
| "epoch": 0.6694860117111255, |
| "grad_norm": 0.1844060719013214, |
| "learning_rate": 0.0001, |
| "loss": 2.1389, |
| "step": 1029 |
| }, |
| { |
| "epoch": 0.6701366297983083, |
| "grad_norm": 0.21057841181755066, |
| "learning_rate": 0.0001, |
| "loss": 2.0058, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.6707872478854913, |
| "grad_norm": 0.20054426789283752, |
| "learning_rate": 0.0001, |
| "loss": 2.2874, |
| "step": 1031 |
| }, |
| { |
| "epoch": 0.6714378659726741, |
| "grad_norm": 0.2507307529449463, |
| "learning_rate": 0.0001, |
| "loss": 2.4245, |
| "step": 1032 |
| }, |
| { |
| "epoch": 0.6720884840598569, |
| "grad_norm": 0.21066251397132874, |
| "learning_rate": 0.0001, |
| "loss": 2.1688, |
| "step": 1033 |
| }, |
| { |
| "epoch": 0.6727391021470397, |
| "grad_norm": 0.22210632264614105, |
| "learning_rate": 0.0001, |
| "loss": 2.1985, |
| "step": 1034 |
| }, |
| { |
| "epoch": 0.6733897202342225, |
| "grad_norm": 0.21617744863033295, |
| "learning_rate": 0.0001, |
| "loss": 2.5918, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.6740403383214053, |
| "grad_norm": 0.46473971009254456, |
| "learning_rate": 0.0001, |
| "loss": 2.9341, |
| "step": 1036 |
| }, |
| { |
| "epoch": 0.6746909564085881, |
| "grad_norm": 0.20464558899402618, |
| "learning_rate": 0.0001, |
| "loss": 2.1654, |
| "step": 1037 |
| }, |
| { |
| "epoch": 0.675341574495771, |
| "grad_norm": 0.212956503033638, |
| "learning_rate": 0.0001, |
| "loss": 2.1959, |
| "step": 1038 |
| }, |
| { |
| "epoch": 0.6759921925829538, |
| "grad_norm": 0.2572340667247772, |
| "learning_rate": 0.0001, |
| "loss": 2.4918, |
| "step": 1039 |
| }, |
| { |
| "epoch": 0.6766428106701367, |
| "grad_norm": 0.3264685273170471, |
| "learning_rate": 0.0001, |
| "loss": 2.8708, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.6772934287573195, |
| "grad_norm": 0.22119931876659393, |
| "learning_rate": 0.0001, |
| "loss": 2.2222, |
| "step": 1041 |
| }, |
| { |
| "epoch": 0.6779440468445023, |
| "grad_norm": 0.24374569952487946, |
| "learning_rate": 0.0001, |
| "loss": 2.2457, |
| "step": 1042 |
| }, |
| { |
| "epoch": 0.6785946649316851, |
| "grad_norm": 0.2548108696937561, |
| "learning_rate": 0.0001, |
| "loss": 2.485, |
| "step": 1043 |
| }, |
| { |
| "epoch": 0.6792452830188679, |
| "grad_norm": 0.20976418256759644, |
| "learning_rate": 0.0001, |
| "loss": 2.3068, |
| "step": 1044 |
| }, |
| { |
| "epoch": 0.6798959011060507, |
| "grad_norm": 0.25135618448257446, |
| "learning_rate": 0.0001, |
| "loss": 2.1083, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.6805465191932336, |
| "grad_norm": 0.2677728831768036, |
| "learning_rate": 0.0001, |
| "loss": 2.4257, |
| "step": 1046 |
| }, |
| { |
| "epoch": 0.6811971372804164, |
| "grad_norm": 0.20250125229358673, |
| "learning_rate": 0.0001, |
| "loss": 2.0643, |
| "step": 1047 |
| }, |
| { |
| "epoch": 0.6818477553675992, |
| "grad_norm": 0.20850299298763275, |
| "learning_rate": 0.0001, |
| "loss": 2.0383, |
| "step": 1048 |
| }, |
| { |
| "epoch": 0.682498373454782, |
| "grad_norm": 0.21116970479488373, |
| "learning_rate": 0.0001, |
| "loss": 2.0259, |
| "step": 1049 |
| }, |
| { |
| "epoch": 0.6831489915419648, |
| "grad_norm": 0.2572707235813141, |
| "learning_rate": 0.0001, |
| "loss": 2.1982, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6837996096291477, |
| "grad_norm": 0.2010831981897354, |
| "learning_rate": 0.0001, |
| "loss": 2.0687, |
| "step": 1051 |
| }, |
| { |
| "epoch": 0.6844502277163305, |
| "grad_norm": 0.23995356261730194, |
| "learning_rate": 0.0001, |
| "loss": 2.1938, |
| "step": 1052 |
| }, |
| { |
| "epoch": 0.6851008458035134, |
| "grad_norm": 0.21428103744983673, |
| "learning_rate": 0.0001, |
| "loss": 2.2514, |
| "step": 1053 |
| }, |
| { |
| "epoch": 0.6857514638906962, |
| "grad_norm": 0.21370433270931244, |
| "learning_rate": 0.0001, |
| "loss": 2.2523, |
| "step": 1054 |
| }, |
| { |
| "epoch": 0.686402081977879, |
| "grad_norm": 0.2131800800561905, |
| "learning_rate": 0.0001, |
| "loss": 2.2413, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.6870527000650618, |
| "grad_norm": 0.20007681846618652, |
| "learning_rate": 0.0001, |
| "loss": 2.176, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.6877033181522446, |
| "grad_norm": 0.2108153998851776, |
| "learning_rate": 0.0001, |
| "loss": 2.1081, |
| "step": 1057 |
| }, |
| { |
| "epoch": 0.6883539362394274, |
| "grad_norm": 0.19952858984470367, |
| "learning_rate": 0.0001, |
| "loss": 2.0249, |
| "step": 1058 |
| }, |
| { |
| "epoch": 0.6890045543266102, |
| "grad_norm": 0.20590882003307343, |
| "learning_rate": 0.0001, |
| "loss": 2.1949, |
| "step": 1059 |
| }, |
| { |
| "epoch": 0.6896551724137931, |
| "grad_norm": 0.2126530408859253, |
| "learning_rate": 0.0001, |
| "loss": 2.2726, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.690305790500976, |
| "grad_norm": 0.30162468552589417, |
| "learning_rate": 0.0001, |
| "loss": 2.5032, |
| "step": 1061 |
| }, |
| { |
| "epoch": 0.6909564085881588, |
| "grad_norm": 0.24452462792396545, |
| "learning_rate": 0.0001, |
| "loss": 2.3021, |
| "step": 1062 |
| }, |
| { |
| "epoch": 0.6916070266753416, |
| "grad_norm": 0.17819760739803314, |
| "learning_rate": 0.0001, |
| "loss": 1.9628, |
| "step": 1063 |
| }, |
| { |
| "epoch": 0.6922576447625244, |
| "grad_norm": 0.17437471449375153, |
| "learning_rate": 0.0001, |
| "loss": 1.879, |
| "step": 1064 |
| }, |
| { |
| "epoch": 0.6929082628497072, |
| "grad_norm": 0.3003963232040405, |
| "learning_rate": 0.0001, |
| "loss": 2.4695, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.69355888093689, |
| "grad_norm": 0.2007562667131424, |
| "learning_rate": 0.0001, |
| "loss": 1.9754, |
| "step": 1066 |
| }, |
| { |
| "epoch": 0.6942094990240729, |
| "grad_norm": 0.21425336599349976, |
| "learning_rate": 0.0001, |
| "loss": 2.1767, |
| "step": 1067 |
| }, |
| { |
| "epoch": 0.6948601171112557, |
| "grad_norm": 0.20287302136421204, |
| "learning_rate": 0.0001, |
| "loss": 1.9933, |
| "step": 1068 |
| }, |
| { |
| "epoch": 0.6955107351984385, |
| "grad_norm": 0.2762700021266937, |
| "learning_rate": 0.0001, |
| "loss": 2.1079, |
| "step": 1069 |
| }, |
| { |
| "epoch": 0.6961613532856213, |
| "grad_norm": 0.18358288705348969, |
| "learning_rate": 0.0001, |
| "loss": 1.9445, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.6968119713728042, |
| "grad_norm": 0.21157526969909668, |
| "learning_rate": 0.0001, |
| "loss": 2.169, |
| "step": 1071 |
| }, |
| { |
| "epoch": 0.697462589459987, |
| "grad_norm": 0.1847715675830841, |
| "learning_rate": 0.0001, |
| "loss": 2.0757, |
| "step": 1072 |
| }, |
| { |
| "epoch": 0.6981132075471698, |
| "grad_norm": 0.1923181712627411, |
| "learning_rate": 0.0001, |
| "loss": 2.2365, |
| "step": 1073 |
| }, |
| { |
| "epoch": 0.6987638256343527, |
| "grad_norm": 0.26491835713386536, |
| "learning_rate": 0.0001, |
| "loss": 2.4613, |
| "step": 1074 |
| }, |
| { |
| "epoch": 0.6994144437215355, |
| "grad_norm": 0.17674419283866882, |
| "learning_rate": 0.0001, |
| "loss": 1.9706, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.7000650618087183, |
| "grad_norm": 0.19894379377365112, |
| "learning_rate": 0.0001, |
| "loss": 1.9227, |
| "step": 1076 |
| }, |
| { |
| "epoch": 0.7007156798959011, |
| "grad_norm": 0.19496971368789673, |
| "learning_rate": 0.0001, |
| "loss": 2.1783, |
| "step": 1077 |
| }, |
| { |
| "epoch": 0.7013662979830839, |
| "grad_norm": 0.20685461163520813, |
| "learning_rate": 0.0001, |
| "loss": 2.1542, |
| "step": 1078 |
| }, |
| { |
| "epoch": 0.7020169160702667, |
| "grad_norm": 0.23061524331569672, |
| "learning_rate": 0.0001, |
| "loss": 2.3346, |
| "step": 1079 |
| }, |
| { |
| "epoch": 0.7026675341574495, |
| "grad_norm": 0.2044321447610855, |
| "learning_rate": 0.0001, |
| "loss": 2.0157, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.7033181522446325, |
| "grad_norm": 0.18851466476917267, |
| "learning_rate": 0.0001, |
| "loss": 2.2045, |
| "step": 1081 |
| }, |
| { |
| "epoch": 0.7039687703318153, |
| "grad_norm": 0.18530018627643585, |
| "learning_rate": 0.0001, |
| "loss": 2.0695, |
| "step": 1082 |
| }, |
| { |
| "epoch": 0.7046193884189981, |
| "grad_norm": 0.23562023043632507, |
| "learning_rate": 0.0001, |
| "loss": 2.3919, |
| "step": 1083 |
| }, |
| { |
| "epoch": 0.7052700065061809, |
| "grad_norm": 0.22246116399765015, |
| "learning_rate": 0.0001, |
| "loss": 2.5821, |
| "step": 1084 |
| }, |
| { |
| "epoch": 0.7059206245933637, |
| "grad_norm": 0.2134729027748108, |
| "learning_rate": 0.0001, |
| "loss": 2.2181, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.7065712426805465, |
| "grad_norm": 0.29674917459487915, |
| "learning_rate": 0.0001, |
| "loss": 2.5069, |
| "step": 1086 |
| }, |
| { |
| "epoch": 0.7072218607677293, |
| "grad_norm": 0.2098974883556366, |
| "learning_rate": 0.0001, |
| "loss": 2.3307, |
| "step": 1087 |
| }, |
| { |
| "epoch": 0.7078724788549121, |
| "grad_norm": 0.27041876316070557, |
| "learning_rate": 0.0001, |
| "loss": 2.8081, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.708523096942095, |
| "grad_norm": 0.19734299182891846, |
| "learning_rate": 0.0001, |
| "loss": 2.0588, |
| "step": 1089 |
| }, |
| { |
| "epoch": 0.7091737150292778, |
| "grad_norm": 0.22952257096767426, |
| "learning_rate": 0.0001, |
| "loss": 2.2607, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.7098243331164606, |
| "grad_norm": 0.20846691727638245, |
| "learning_rate": 0.0001, |
| "loss": 2.1657, |
| "step": 1091 |
| }, |
| { |
| "epoch": 0.7104749512036435, |
| "grad_norm": 0.19664259254932404, |
| "learning_rate": 0.0001, |
| "loss": 2.1256, |
| "step": 1092 |
| }, |
| { |
| "epoch": 0.7111255692908263, |
| "grad_norm": 0.23994791507720947, |
| "learning_rate": 0.0001, |
| "loss": 2.5377, |
| "step": 1093 |
| }, |
| { |
| "epoch": 0.7117761873780091, |
| "grad_norm": 0.22439789772033691, |
| "learning_rate": 0.0001, |
| "loss": 2.6225, |
| "step": 1094 |
| }, |
| { |
| "epoch": 0.7124268054651919, |
| "grad_norm": 0.20211316645145416, |
| "learning_rate": 0.0001, |
| "loss": 2.0582, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.7130774235523748, |
| "grad_norm": 0.23308198153972626, |
| "learning_rate": 0.0001, |
| "loss": 2.4341, |
| "step": 1096 |
| }, |
| { |
| "epoch": 0.7137280416395576, |
| "grad_norm": 0.17806245386600494, |
| "learning_rate": 0.0001, |
| "loss": 2.0211, |
| "step": 1097 |
| }, |
| { |
| "epoch": 0.7143786597267404, |
| "grad_norm": 0.20525243878364563, |
| "learning_rate": 0.0001, |
| "loss": 2.1248, |
| "step": 1098 |
| }, |
| { |
| "epoch": 0.7150292778139232, |
| "grad_norm": 0.22835716605186462, |
| "learning_rate": 0.0001, |
| "loss": 2.2993, |
| "step": 1099 |
| }, |
| { |
| "epoch": 0.715679895901106, |
| "grad_norm": 0.37078213691711426, |
| "learning_rate": 0.0001, |
| "loss": 3.1289, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7163305139882888, |
| "grad_norm": 0.22253082692623138, |
| "learning_rate": 0.0001, |
| "loss": 2.2304, |
| "step": 1101 |
| }, |
| { |
| "epoch": 0.7169811320754716, |
| "grad_norm": 0.20494401454925537, |
| "learning_rate": 0.0001, |
| "loss": 1.9473, |
| "step": 1102 |
| }, |
| { |
| "epoch": 0.7176317501626546, |
| "grad_norm": 0.22128112614154816, |
| "learning_rate": 0.0001, |
| "loss": 1.993, |
| "step": 1103 |
| }, |
| { |
| "epoch": 0.7182823682498374, |
| "grad_norm": 0.20786182582378387, |
| "learning_rate": 0.0001, |
| "loss": 2.0048, |
| "step": 1104 |
| }, |
| { |
| "epoch": 0.7189329863370202, |
| "grad_norm": 0.27697819471359253, |
| "learning_rate": 0.0001, |
| "loss": 2.372, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.719583604424203, |
| "grad_norm": 0.26237788796424866, |
| "learning_rate": 0.0001, |
| "loss": 1.9573, |
| "step": 1106 |
| }, |
| { |
| "epoch": 0.7202342225113858, |
| "grad_norm": 0.2544906437397003, |
| "learning_rate": 0.0001, |
| "loss": 2.2805, |
| "step": 1107 |
| }, |
| { |
| "epoch": 0.7208848405985686, |
| "grad_norm": 0.2175043374300003, |
| "learning_rate": 0.0001, |
| "loss": 2.3201, |
| "step": 1108 |
| }, |
| { |
| "epoch": 0.7215354586857514, |
| "grad_norm": 0.19637277722358704, |
| "learning_rate": 0.0001, |
| "loss": 1.8868, |
| "step": 1109 |
| }, |
| { |
| "epoch": 0.7221860767729343, |
| "grad_norm": 0.19888024032115936, |
| "learning_rate": 0.0001, |
| "loss": 2.0324, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.7228366948601171, |
| "grad_norm": 0.20008981227874756, |
| "learning_rate": 0.0001, |
| "loss": 2.2898, |
| "step": 1111 |
| }, |
| { |
| "epoch": 0.7234873129473, |
| "grad_norm": 0.25185343623161316, |
| "learning_rate": 0.0001, |
| "loss": 2.2424, |
| "step": 1112 |
| }, |
| { |
| "epoch": 0.7241379310344828, |
| "grad_norm": 0.2434062957763672, |
| "learning_rate": 0.0001, |
| "loss": 2.2884, |
| "step": 1113 |
| }, |
| { |
| "epoch": 0.7247885491216656, |
| "grad_norm": 0.2278825044631958, |
| "learning_rate": 0.0001, |
| "loss": 2.1751, |
| "step": 1114 |
| }, |
| { |
| "epoch": 0.7254391672088484, |
| "grad_norm": 0.23180316388607025, |
| "learning_rate": 0.0001, |
| "loss": 2.6033, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.7260897852960312, |
| "grad_norm": 0.18574117124080658, |
| "learning_rate": 0.0001, |
| "loss": 2.3172, |
| "step": 1116 |
| }, |
| { |
| "epoch": 0.7267404033832141, |
| "grad_norm": 0.286155641078949, |
| "learning_rate": 0.0001, |
| "loss": 2.0482, |
| "step": 1117 |
| }, |
| { |
| "epoch": 0.7273910214703969, |
| "grad_norm": 0.1757357120513916, |
| "learning_rate": 0.0001, |
| "loss": 1.8881, |
| "step": 1118 |
| }, |
| { |
| "epoch": 0.7280416395575797, |
| "grad_norm": 0.25008201599121094, |
| "learning_rate": 0.0001, |
| "loss": 2.3797, |
| "step": 1119 |
| }, |
| { |
| "epoch": 0.7286922576447625, |
| "grad_norm": 0.29816892743110657, |
| "learning_rate": 0.0001, |
| "loss": 2.9163, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.7293428757319453, |
| "grad_norm": 0.1951293647289276, |
| "learning_rate": 0.0001, |
| "loss": 2.0613, |
| "step": 1121 |
| }, |
| { |
| "epoch": 0.7299934938191281, |
| "grad_norm": 0.23593062162399292, |
| "learning_rate": 0.0001, |
| "loss": 2.2103, |
| "step": 1122 |
| }, |
| { |
| "epoch": 0.730644111906311, |
| "grad_norm": 0.18619036674499512, |
| "learning_rate": 0.0001, |
| "loss": 1.9223, |
| "step": 1123 |
| }, |
| { |
| "epoch": 0.7312947299934938, |
| "grad_norm": 0.20853224396705627, |
| "learning_rate": 0.0001, |
| "loss": 2.2651, |
| "step": 1124 |
| }, |
| { |
| "epoch": 0.7319453480806767, |
| "grad_norm": 0.27427271008491516, |
| "learning_rate": 0.0001, |
| "loss": 2.3866, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.7325959661678595, |
| "grad_norm": 0.35531318187713623, |
| "learning_rate": 0.0001, |
| "loss": 2.8333, |
| "step": 1126 |
| }, |
| { |
| "epoch": 0.7332465842550423, |
| "grad_norm": 0.21375155448913574, |
| "learning_rate": 0.0001, |
| "loss": 2.0703, |
| "step": 1127 |
| }, |
| { |
| "epoch": 0.7338972023422251, |
| "grad_norm": 0.24240247905254364, |
| "learning_rate": 0.0001, |
| "loss": 2.3032, |
| "step": 1128 |
| }, |
| { |
| "epoch": 0.7345478204294079, |
| "grad_norm": 0.2277136594057083, |
| "learning_rate": 0.0001, |
| "loss": 2.585, |
| "step": 1129 |
| }, |
| { |
| "epoch": 0.7351984385165907, |
| "grad_norm": 0.20665140450000763, |
| "learning_rate": 0.0001, |
| "loss": 2.1351, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.7358490566037735, |
| "grad_norm": 0.2534540891647339, |
| "learning_rate": 0.0001, |
| "loss": 2.5023, |
| "step": 1131 |
| }, |
| { |
| "epoch": 0.7364996746909565, |
| "grad_norm": 0.19695554673671722, |
| "learning_rate": 0.0001, |
| "loss": 1.9286, |
| "step": 1132 |
| }, |
| { |
| "epoch": 0.7371502927781393, |
| "grad_norm": 0.18500645458698273, |
| "learning_rate": 0.0001, |
| "loss": 2.0609, |
| "step": 1133 |
| }, |
| { |
| "epoch": 0.7378009108653221, |
| "grad_norm": 0.2103162556886673, |
| "learning_rate": 0.0001, |
| "loss": 2.2247, |
| "step": 1134 |
| }, |
| { |
| "epoch": 0.7384515289525049, |
| "grad_norm": 0.20303300023078918, |
| "learning_rate": 0.0001, |
| "loss": 2.1164, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.7391021470396877, |
| "grad_norm": 0.23574739694595337, |
| "learning_rate": 0.0001, |
| "loss": 2.6325, |
| "step": 1136 |
| }, |
| { |
| "epoch": 0.7397527651268705, |
| "grad_norm": 0.2764929234981537, |
| "learning_rate": 0.0001, |
| "loss": 2.3049, |
| "step": 1137 |
| }, |
| { |
| "epoch": 0.7404033832140533, |
| "grad_norm": 0.23995018005371094, |
| "learning_rate": 0.0001, |
| "loss": 2.3196, |
| "step": 1138 |
| }, |
| { |
| "epoch": 0.7410540013012362, |
| "grad_norm": 0.19074063003063202, |
| "learning_rate": 0.0001, |
| "loss": 2.1566, |
| "step": 1139 |
| }, |
| { |
| "epoch": 0.741704619388419, |
| "grad_norm": 0.18186306953430176, |
| "learning_rate": 0.0001, |
| "loss": 1.9629, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7423552374756018, |
| "grad_norm": 0.23841345310211182, |
| "learning_rate": 0.0001, |
| "loss": 2.1942, |
| "step": 1141 |
| }, |
| { |
| "epoch": 0.7430058555627846, |
| "grad_norm": 0.19697019457817078, |
| "learning_rate": 0.0001, |
| "loss": 2.0186, |
| "step": 1142 |
| }, |
| { |
| "epoch": 0.7436564736499675, |
| "grad_norm": 0.2117876410484314, |
| "learning_rate": 0.0001, |
| "loss": 2.4395, |
| "step": 1143 |
| }, |
| { |
| "epoch": 0.7443070917371503, |
| "grad_norm": 0.26921918988227844, |
| "learning_rate": 0.0001, |
| "loss": 2.4332, |
| "step": 1144 |
| }, |
| { |
| "epoch": 0.7449577098243331, |
| "grad_norm": 0.18999671936035156, |
| "learning_rate": 0.0001, |
| "loss": 2.0209, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.745608327911516, |
| "grad_norm": 0.22686484456062317, |
| "learning_rate": 0.0001, |
| "loss": 2.4369, |
| "step": 1146 |
| }, |
| { |
| "epoch": 0.7462589459986988, |
| "grad_norm": 0.22974656522274017, |
| "learning_rate": 0.0001, |
| "loss": 2.3737, |
| "step": 1147 |
| }, |
| { |
| "epoch": 0.7469095640858816, |
| "grad_norm": 0.19007977843284607, |
| "learning_rate": 0.0001, |
| "loss": 2.145, |
| "step": 1148 |
| }, |
| { |
| "epoch": 0.7475601821730644, |
| "grad_norm": 0.23000845313072205, |
| "learning_rate": 0.0001, |
| "loss": 2.0555, |
| "step": 1149 |
| }, |
| { |
| "epoch": 0.7482108002602472, |
| "grad_norm": 0.33339783549308777, |
| "learning_rate": 0.0001, |
| "loss": 2.7318, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.74886141834743, |
| "grad_norm": 0.18458595871925354, |
| "learning_rate": 0.0001, |
| "loss": 1.7868, |
| "step": 1151 |
| }, |
| { |
| "epoch": 0.7495120364346128, |
| "grad_norm": 0.2283509373664856, |
| "learning_rate": 0.0001, |
| "loss": 2.2609, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.7501626545217958, |
| "grad_norm": 0.31175729632377625, |
| "learning_rate": 0.0001, |
| "loss": 2.5524, |
| "step": 1153 |
| }, |
| { |
| "epoch": 0.7508132726089786, |
| "grad_norm": 0.18617112934589386, |
| "learning_rate": 0.0001, |
| "loss": 2.2029, |
| "step": 1154 |
| }, |
| { |
| "epoch": 0.7514638906961614, |
| "grad_norm": 0.28690317273139954, |
| "learning_rate": 0.0001, |
| "loss": 2.4705, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.7521145087833442, |
| "grad_norm": 0.2267671674489975, |
| "learning_rate": 0.0001, |
| "loss": 2.1093, |
| "step": 1156 |
| }, |
| { |
| "epoch": 0.752765126870527, |
| "grad_norm": 0.21956512331962585, |
| "learning_rate": 0.0001, |
| "loss": 2.0962, |
| "step": 1157 |
| }, |
| { |
| "epoch": 0.7534157449577098, |
| "grad_norm": 0.2681393027305603, |
| "learning_rate": 0.0001, |
| "loss": 2.35, |
| "step": 1158 |
| }, |
| { |
| "epoch": 0.7540663630448926, |
| "grad_norm": 0.23306699097156525, |
| "learning_rate": 0.0001, |
| "loss": 2.4911, |
| "step": 1159 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.3148876428604126, |
| "learning_rate": 0.0001, |
| "loss": 2.8802, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7553675992192583, |
| "grad_norm": 0.2260347157716751, |
| "learning_rate": 0.0001, |
| "loss": 1.9286, |
| "step": 1161 |
| }, |
| { |
| "epoch": 0.7560182173064411, |
| "grad_norm": 0.24939195811748505, |
| "learning_rate": 0.0001, |
| "loss": 2.3544, |
| "step": 1162 |
| }, |
| { |
| "epoch": 0.756668835393624, |
| "grad_norm": 0.21007601916790009, |
| "learning_rate": 0.0001, |
| "loss": 2.0132, |
| "step": 1163 |
| }, |
| { |
| "epoch": 0.7573194534808068, |
| "grad_norm": 0.2570975720882416, |
| "learning_rate": 0.0001, |
| "loss": 1.9665, |
| "step": 1164 |
| }, |
| { |
| "epoch": 0.7579700715679896, |
| "grad_norm": 0.2818357050418854, |
| "learning_rate": 0.0001, |
| "loss": 2.2252, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.7586206896551724, |
| "grad_norm": 0.22388941049575806, |
| "learning_rate": 0.0001, |
| "loss": 2.4553, |
| "step": 1166 |
| }, |
| { |
| "epoch": 0.7592713077423552, |
| "grad_norm": 0.22799374163150787, |
| "learning_rate": 0.0001, |
| "loss": 2.4447, |
| "step": 1167 |
| }, |
| { |
| "epoch": 0.7599219258295381, |
| "grad_norm": 0.2610357105731964, |
| "learning_rate": 0.0001, |
| "loss": 2.4024, |
| "step": 1168 |
| }, |
| { |
| "epoch": 0.7605725439167209, |
| "grad_norm": 0.39793217182159424, |
| "learning_rate": 0.0001, |
| "loss": 3.1529, |
| "step": 1169 |
| }, |
| { |
| "epoch": 0.7612231620039037, |
| "grad_norm": 0.19805116951465607, |
| "learning_rate": 0.0001, |
| "loss": 1.9483, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.7618737800910865, |
| "grad_norm": 0.208368182182312, |
| "learning_rate": 0.0001, |
| "loss": 2.1785, |
| "step": 1171 |
| }, |
| { |
| "epoch": 0.7625243981782693, |
| "grad_norm": 0.25101637840270996, |
| "learning_rate": 0.0001, |
| "loss": 2.2517, |
| "step": 1172 |
| }, |
| { |
| "epoch": 0.7631750162654521, |
| "grad_norm": 0.27432793378829956, |
| "learning_rate": 0.0001, |
| "loss": 2.4759, |
| "step": 1173 |
| }, |
| { |
| "epoch": 0.763825634352635, |
| "grad_norm": 0.18746371567249298, |
| "learning_rate": 0.0001, |
| "loss": 2.0188, |
| "step": 1174 |
| }, |
| { |
| "epoch": 0.7644762524398179, |
| "grad_norm": 0.2882263958454132, |
| "learning_rate": 0.0001, |
| "loss": 2.2948, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.7651268705270007, |
| "grad_norm": 0.22075092792510986, |
| "learning_rate": 0.0001, |
| "loss": 2.4894, |
| "step": 1176 |
| }, |
| { |
| "epoch": 0.7657774886141835, |
| "grad_norm": 0.20792776346206665, |
| "learning_rate": 0.0001, |
| "loss": 1.8502, |
| "step": 1177 |
| }, |
| { |
| "epoch": 0.7664281067013663, |
| "grad_norm": 0.2436477392911911, |
| "learning_rate": 0.0001, |
| "loss": 2.1296, |
| "step": 1178 |
| }, |
| { |
| "epoch": 0.7670787247885491, |
| "grad_norm": 0.2839182913303375, |
| "learning_rate": 0.0001, |
| "loss": 2.8409, |
| "step": 1179 |
| }, |
| { |
| "epoch": 0.7677293428757319, |
| "grad_norm": 0.1826743334531784, |
| "learning_rate": 0.0001, |
| "loss": 1.941, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7683799609629147, |
| "grad_norm": 0.2757255434989929, |
| "learning_rate": 0.0001, |
| "loss": 2.7297, |
| "step": 1181 |
| }, |
| { |
| "epoch": 0.7690305790500976, |
| "grad_norm": 0.23313826322555542, |
| "learning_rate": 0.0001, |
| "loss": 2.8796, |
| "step": 1182 |
| }, |
| { |
| "epoch": 0.7696811971372804, |
| "grad_norm": 0.28900882601737976, |
| "learning_rate": 0.0001, |
| "loss": 2.313, |
| "step": 1183 |
| }, |
| { |
| "epoch": 0.7703318152244633, |
| "grad_norm": 0.32883039116859436, |
| "learning_rate": 0.0001, |
| "loss": 3.041, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.7709824333116461, |
| "grad_norm": 0.2116912454366684, |
| "learning_rate": 0.0001, |
| "loss": 1.9891, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.7716330513988289, |
| "grad_norm": 0.2055017203092575, |
| "learning_rate": 0.0001, |
| "loss": 1.9567, |
| "step": 1186 |
| }, |
| { |
| "epoch": 0.7722836694860117, |
| "grad_norm": 0.2978801131248474, |
| "learning_rate": 0.0001, |
| "loss": 2.3322, |
| "step": 1187 |
| }, |
| { |
| "epoch": 0.7729342875731945, |
| "grad_norm": 0.21910034120082855, |
| "learning_rate": 0.0001, |
| "loss": 2.0262, |
| "step": 1188 |
| }, |
| { |
| "epoch": 0.7735849056603774, |
| "grad_norm": 0.19952894747257233, |
| "learning_rate": 0.0001, |
| "loss": 2.0621, |
| "step": 1189 |
| }, |
| { |
| "epoch": 0.7742355237475602, |
| "grad_norm": 0.20744554698467255, |
| "learning_rate": 0.0001, |
| "loss": 2.1154, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.774886141834743, |
| "grad_norm": 0.23886847496032715, |
| "learning_rate": 0.0001, |
| "loss": 2.3023, |
| "step": 1191 |
| }, |
| { |
| "epoch": 0.7755367599219258, |
| "grad_norm": 0.20722374320030212, |
| "learning_rate": 0.0001, |
| "loss": 2.2384, |
| "step": 1192 |
| }, |
| { |
| "epoch": 0.7761873780091086, |
| "grad_norm": 0.23317816853523254, |
| "learning_rate": 0.0001, |
| "loss": 2.6381, |
| "step": 1193 |
| }, |
| { |
| "epoch": 0.7768379960962914, |
| "grad_norm": 0.2527480125427246, |
| "learning_rate": 0.0001, |
| "loss": 2.1711, |
| "step": 1194 |
| }, |
| { |
| "epoch": 0.7774886141834743, |
| "grad_norm": 0.23817451298236847, |
| "learning_rate": 0.0001, |
| "loss": 2.6561, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.7781392322706572, |
| "grad_norm": 0.2609005570411682, |
| "learning_rate": 0.0001, |
| "loss": 2.5488, |
| "step": 1196 |
| }, |
| { |
| "epoch": 0.77878985035784, |
| "grad_norm": 0.19870908558368683, |
| "learning_rate": 0.0001, |
| "loss": 2.0435, |
| "step": 1197 |
| }, |
| { |
| "epoch": 0.7794404684450228, |
| "grad_norm": 0.20385386049747467, |
| "learning_rate": 0.0001, |
| "loss": 1.9711, |
| "step": 1198 |
| }, |
| { |
| "epoch": 0.7800910865322056, |
| "grad_norm": 0.20179738104343414, |
| "learning_rate": 0.0001, |
| "loss": 2.0247, |
| "step": 1199 |
| }, |
| { |
| "epoch": 0.7807417046193884, |
| "grad_norm": 0.40090981125831604, |
| "learning_rate": 0.0001, |
| "loss": 2.795, |
| "step": 1200 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1537, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8102162944950272e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|