{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997883760749533,
  "eval_steps": 100,
  "global_step": 1624,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006156332364993556,
      "grad_norm": 26.714405059814453,
      "learning_rate": 2.0408163265306121e-07,
      "loss": 0.9855,
      "step": 1
    },
    {
      "epoch": 0.0012312664729987111,
      "grad_norm": 25.45075225830078,
      "learning_rate": 4.0816326530612243e-07,
      "loss": 0.926,
      "step": 2
    },
    {
      "epoch": 0.0018468997094980666,
      "grad_norm": 24.12227439880371,
      "learning_rate": 6.122448979591837e-07,
      "loss": 0.8903,
      "step": 3
    },
    {
      "epoch": 0.0024625329459974222,
      "grad_norm": 26.573762893676758,
      "learning_rate": 8.163265306122449e-07,
      "loss": 0.9779,
      "step": 4
    },
    {
      "epoch": 0.0030781661824967775,
      "grad_norm": 26.632810592651367,
      "learning_rate": 1.0204081632653063e-06,
      "loss": 0.9337,
      "step": 5
    },
    {
      "epoch": 0.003693799418996133,
      "grad_norm": 24.73032569885254,
      "learning_rate": 1.2244897959183673e-06,
      "loss": 0.8677,
      "step": 6
    },
    {
      "epoch": 0.004309432655495489,
      "grad_norm": 22.517139434814453,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 0.813,
      "step": 7
    },
    {
      "epoch": 0.0049250658919948445,
      "grad_norm": 20.523988723754883,
      "learning_rate": 1.6326530612244897e-06,
      "loss": 0.7831,
      "step": 8
    },
    {
      "epoch": 0.005540699128494199,
      "grad_norm": 20.76930046081543,
      "learning_rate": 1.8367346938775512e-06,
      "loss": 0.7559,
      "step": 9
    },
    {
      "epoch": 0.006156332364993555,
      "grad_norm": 16.376604080200195,
      "learning_rate": 2.0408163265306125e-06,
      "loss": 0.6649,
      "step": 10
    },
    {
      "epoch": 0.006771965601492911,
      "grad_norm": 19.152969360351562,
      "learning_rate": 2.244897959183674e-06,
      "loss": 0.5937,
      "step": 11
    },
    {
      "epoch": 0.007387598837992266,
      "grad_norm": 14.190017700195312,
      "learning_rate": 2.4489795918367347e-06,
      "loss": 0.5062,
      "step": 12
    },
    {
      "epoch": 0.008003232074491622,
      "grad_norm": 7.9161272048950195,
      "learning_rate": 2.6530612244897964e-06,
      "loss": 0.5011,
      "step": 13
    },
    {
      "epoch": 0.008618865310990978,
      "grad_norm": 21.18466567993164,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 0.4779,
      "step": 14
    },
    {
      "epoch": 0.009234498547490333,
      "grad_norm": 21.010019302368164,
      "learning_rate": 3.0612244897959185e-06,
      "loss": 0.582,
      "step": 15
    },
    {
      "epoch": 0.009850131783989689,
      "grad_norm": 11.696112632751465,
      "learning_rate": 3.2653061224489794e-06,
      "loss": 0.5208,
      "step": 16
    },
    {
      "epoch": 0.010465765020489043,
      "grad_norm": 6.880239486694336,
      "learning_rate": 3.469387755102041e-06,
      "loss": 0.4374,
      "step": 17
    },
    {
      "epoch": 0.011081398256988399,
      "grad_norm": 29.14151382446289,
      "learning_rate": 3.6734693877551024e-06,
      "loss": 0.5039,
      "step": 18
    },
    {
      "epoch": 0.011697031493487754,
      "grad_norm": 7.683447360992432,
      "learning_rate": 3.877551020408164e-06,
      "loss": 0.472,
      "step": 19
    },
    {
      "epoch": 0.01231266472998711,
      "grad_norm": 5.148148536682129,
      "learning_rate": 4.081632653061225e-06,
      "loss": 0.48,
      "step": 20
    },
    {
      "epoch": 0.012928297966486466,
      "grad_norm": 5.763864994049072,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 0.5208,
      "step": 21
    },
    {
      "epoch": 0.013543931202985821,
      "grad_norm": 5.345851421356201,
      "learning_rate": 4.489795918367348e-06,
      "loss": 0.4837,
      "step": 22
    },
    {
      "epoch": 0.014159564439485177,
      "grad_norm": 4.355923652648926,
      "learning_rate": 4.693877551020409e-06,
      "loss": 0.4873,
      "step": 23
    },
    {
      "epoch": 0.014775197675984533,
      "grad_norm": 6.132438659667969,
      "learning_rate": 4.897959183673469e-06,
      "loss": 0.4696,
      "step": 24
    },
    {
      "epoch": 0.015390830912483888,
      "grad_norm": 5.307392597198486,
      "learning_rate": 5.1020408163265315e-06,
      "loss": 0.4756,
      "step": 25
    },
    {
      "epoch": 0.016006464148983244,
      "grad_norm": 4.319431781768799,
      "learning_rate": 5.306122448979593e-06,
      "loss": 0.4765,
      "step": 26
    },
    {
      "epoch": 0.0166220973854826,
      "grad_norm": 4.057931900024414,
      "learning_rate": 5.510204081632653e-06,
      "loss": 0.5113,
      "step": 27
    },
    {
      "epoch": 0.017237730621981955,
      "grad_norm": 4.102950572967529,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.4438,
      "step": 28
    },
    {
      "epoch": 0.01785336385848131,
      "grad_norm": 4.861684322357178,
      "learning_rate": 5.918367346938776e-06,
      "loss": 0.4829,
      "step": 29
    },
    {
      "epoch": 0.018468997094980667,
      "grad_norm": 3.862741231918335,
      "learning_rate": 6.122448979591837e-06,
      "loss": 0.5129,
      "step": 30
    },
    {
      "epoch": 0.019084630331480022,
      "grad_norm": 3.8774240016937256,
      "learning_rate": 6.326530612244899e-06,
      "loss": 0.4498,
      "step": 31
    },
    {
      "epoch": 0.019700263567979378,
      "grad_norm": 3.369861602783203,
      "learning_rate": 6.530612244897959e-06,
      "loss": 0.4595,
      "step": 32
    },
    {
      "epoch": 0.02031589680447873,
      "grad_norm": 4.286397457122803,
      "learning_rate": 6.734693877551021e-06,
      "loss": 0.4822,
      "step": 33
    },
    {
      "epoch": 0.020931530040978086,
      "grad_norm": 4.149627685546875,
      "learning_rate": 6.938775510204082e-06,
      "loss": 0.4825,
      "step": 34
    },
    {
      "epoch": 0.02154716327747744,
      "grad_norm": 3.7248666286468506,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.483,
      "step": 35
    },
    {
      "epoch": 0.022162796513976797,
      "grad_norm": 3.714015007019043,
      "learning_rate": 7.346938775510205e-06,
      "loss": 0.4759,
      "step": 36
    },
    {
      "epoch": 0.022778429750476153,
      "grad_norm": 4.151897430419922,
      "learning_rate": 7.551020408163265e-06,
      "loss": 0.5299,
      "step": 37
    },
    {
      "epoch": 0.02339406298697551,
      "grad_norm": 3.7130208015441895,
      "learning_rate": 7.755102040816327e-06,
      "loss": 0.4994,
      "step": 38
    },
    {
      "epoch": 0.024009696223474864,
      "grad_norm": 3.6021628379821777,
      "learning_rate": 7.959183673469388e-06,
      "loss": 0.494,
      "step": 39
    },
    {
      "epoch": 0.02462532945997422,
      "grad_norm": 4.136284828186035,
      "learning_rate": 8.16326530612245e-06,
      "loss": 0.4955,
      "step": 40
    },
    {
      "epoch": 0.025240962696473575,
      "grad_norm": 3.669343948364258,
      "learning_rate": 8.36734693877551e-06,
      "loss": 0.4999,
      "step": 41
    },
    {
      "epoch": 0.02585659593297293,
      "grad_norm": 3.592020273208618,
      "learning_rate": 8.571428571428571e-06,
      "loss": 0.505,
      "step": 42
    },
    {
      "epoch": 0.026472229169472287,
      "grad_norm": 3.9431192874908447,
      "learning_rate": 8.775510204081633e-06,
      "loss": 0.4714,
      "step": 43
    },
    {
      "epoch": 0.027087862405971642,
      "grad_norm": 3.835538864135742,
      "learning_rate": 8.979591836734695e-06,
      "loss": 0.4833,
      "step": 44
    },
    {
      "epoch": 0.027703495642470998,
      "grad_norm": 3.6205527782440186,
      "learning_rate": 9.183673469387756e-06,
      "loss": 0.4946,
      "step": 45
    },
    {
      "epoch": 0.028319128878970354,
      "grad_norm": 3.6335999965667725,
      "learning_rate": 9.387755102040818e-06,
      "loss": 0.5319,
      "step": 46
    },
    {
      "epoch": 0.02893476211546971,
      "grad_norm": 3.832490921020508,
      "learning_rate": 9.591836734693878e-06,
      "loss": 0.4922,
      "step": 47
    },
    {
      "epoch": 0.029550395351969065,
      "grad_norm": 3.39542555809021,
      "learning_rate": 9.795918367346939e-06,
      "loss": 0.5099,
      "step": 48
    },
    {
      "epoch": 0.03016602858846842,
      "grad_norm": 3.5109446048736572,
      "learning_rate": 1e-05,
      "loss": 0.5137,
      "step": 49
    },
    {
      "epoch": 0.030781661824967776,
      "grad_norm": 3.368596076965332,
      "learning_rate": 9.99999005331204e-06,
      "loss": 0.5375,
      "step": 50
    },
    {
      "epoch": 0.03139729506146713,
      "grad_norm": 4.006367206573486,
      "learning_rate": 9.999960213287734e-06,
      "loss": 0.5605,
      "step": 51
    },
    {
      "epoch": 0.03201292829796649,
      "grad_norm": 3.265024185180664,
      "learning_rate": 9.999910480045805e-06,
      "loss": 0.5158,
      "step": 52
    },
    {
      "epoch": 0.03262856153446584,
      "grad_norm": 3.560621976852417,
      "learning_rate": 9.999840853784125e-06,
      "loss": 0.5671,
      "step": 53
    },
    {
      "epoch": 0.0332441947709652,
      "grad_norm": 3.2886385917663574,
      "learning_rate": 9.999751334779716e-06,
      "loss": 0.5654,
      "step": 54
    },
    {
      "epoch": 0.03385982800746455,
      "grad_norm": 3.0971665382385254,
      "learning_rate": 9.999641923388745e-06,
      "loss": 0.5049,
      "step": 55
    },
    {
      "epoch": 0.03447546124396391,
      "grad_norm": 3.312525510787964,
      "learning_rate": 9.999512620046523e-06,
      "loss": 0.5291,
      "step": 56
    },
    {
      "epoch": 0.03509109448046326,
      "grad_norm": 3.5942792892456055,
      "learning_rate": 9.999363425267506e-06,
      "loss": 0.5531,
      "step": 57
    },
    {
      "epoch": 0.03570672771696262,
      "grad_norm": 2.7662947177886963,
      "learning_rate": 9.999194339645292e-06,
      "loss": 0.5195,
      "step": 58
    },
    {
      "epoch": 0.036322360953461974,
      "grad_norm": 3.3403029441833496,
      "learning_rate": 9.999005363852619e-06,
      "loss": 0.5265,
      "step": 59
    },
    {
      "epoch": 0.03693799418996133,
      "grad_norm": 3.2366700172424316,
      "learning_rate": 9.99879649864136e-06,
      "loss": 0.5558,
      "step": 60
    },
    {
      "epoch": 0.037553627426460685,
      "grad_norm": 2.8445067405700684,
      "learning_rate": 9.998567744842518e-06,
      "loss": 0.5047,
      "step": 61
    },
    {
      "epoch": 0.038169260662960044,
      "grad_norm": 3.0779359340667725,
      "learning_rate": 9.998319103366233e-06,
      "loss": 0.5314,
      "step": 62
    },
    {
      "epoch": 0.0387848938994594,
      "grad_norm": 3.4407589435577393,
      "learning_rate": 9.998050575201772e-06,
      "loss": 0.5547,
      "step": 63
    },
    {
      "epoch": 0.039400527135958756,
      "grad_norm": 3.0764389038085938,
      "learning_rate": 9.997762161417517e-06,
      "loss": 0.5269,
      "step": 64
    },
    {
      "epoch": 0.04001616037245811,
      "grad_norm": 2.964404344558716,
      "learning_rate": 9.997453863160975e-06,
      "loss": 0.5248,
      "step": 65
    },
    {
      "epoch": 0.04063179360895746,
      "grad_norm": 3.130155086517334,
      "learning_rate": 9.997125681658761e-06,
      "loss": 0.5261,
      "step": 66
    },
    {
      "epoch": 0.04124742684545682,
      "grad_norm": 3.1478676795959473,
      "learning_rate": 9.996777618216608e-06,
      "loss": 0.5227,
      "step": 67
    },
    {
      "epoch": 0.04186306008195617,
      "grad_norm": 3.343843936920166,
      "learning_rate": 9.996409674219343e-06,
      "loss": 0.5443,
      "step": 68
    },
    {
      "epoch": 0.04247869331845553,
      "grad_norm": 3.1439449787139893,
      "learning_rate": 9.996021851130897e-06,
      "loss": 0.5445,
      "step": 69
    },
    {
      "epoch": 0.04309432655495488,
      "grad_norm": 3.548717498779297,
      "learning_rate": 9.995614150494293e-06,
      "loss": 0.5376,
      "step": 70
    },
    {
      "epoch": 0.04370995979145424,
      "grad_norm": 3.1395504474639893,
      "learning_rate": 9.995186573931638e-06,
      "loss": 0.5118,
      "step": 71
    },
    {
      "epoch": 0.044325593027953594,
      "grad_norm": 2.945014476776123,
      "learning_rate": 9.994739123144121e-06,
      "loss": 0.5225,
      "step": 72
    },
    {
      "epoch": 0.04494122626445295,
      "grad_norm": 2.8614890575408936,
      "learning_rate": 9.994271799912004e-06,
      "loss": 0.5346,
      "step": 73
    },
    {
      "epoch": 0.045556859500952306,
      "grad_norm": 3.3048510551452637,
      "learning_rate": 9.993784606094612e-06,
      "loss": 0.5635,
      "step": 74
    },
    {
      "epoch": 0.046172492737451665,
      "grad_norm": 3.2629144191741943,
      "learning_rate": 9.993277543630335e-06,
      "loss": 0.5126,
      "step": 75
    },
    {
      "epoch": 0.04678812597395102,
      "grad_norm": 2.8238108158111572,
      "learning_rate": 9.992750614536606e-06,
      "loss": 0.5215,
      "step": 76
    },
    {
      "epoch": 0.047403759210450376,
      "grad_norm": 3.0240776538848877,
      "learning_rate": 9.992203820909906e-06,
      "loss": 0.5342,
      "step": 77
    },
    {
      "epoch": 0.04801939244694973,
      "grad_norm": 2.6868398189544678,
      "learning_rate": 9.99163716492575e-06,
      "loss": 0.5389,
      "step": 78
    },
    {
      "epoch": 0.04863502568344909,
      "grad_norm": 3.591717004776001,
      "learning_rate": 9.991050648838676e-06,
      "loss": 0.5321,
      "step": 79
    },
    {
      "epoch": 0.04925065891994844,
      "grad_norm": 3.2408864498138428,
      "learning_rate": 9.990444274982245e-06,
      "loss": 0.5375,
      "step": 80
    },
    {
      "epoch": 0.0498662921564478,
      "grad_norm": 3.125800371170044,
      "learning_rate": 9.989818045769017e-06,
      "loss": 0.5244,
      "step": 81
    },
    {
      "epoch": 0.05048192539294715,
      "grad_norm": 3.0238773822784424,
      "learning_rate": 9.989171963690556e-06,
      "loss": 0.5372,
      "step": 82
    },
    {
      "epoch": 0.05109755862944651,
      "grad_norm": 2.623504877090454,
      "learning_rate": 9.988506031317416e-06,
      "loss": 0.5165,
      "step": 83
    },
    {
      "epoch": 0.05171319186594586,
      "grad_norm": 2.9732768535614014,
      "learning_rate": 9.987820251299121e-06,
      "loss": 0.5287,
      "step": 84
    },
    {
      "epoch": 0.05232882510244522,
      "grad_norm": 2.9855895042419434,
      "learning_rate": 9.987114626364172e-06,
      "loss": 0.5379,
      "step": 85
    },
    {
      "epoch": 0.052944458338944574,
      "grad_norm": 2.6988890171051025,
      "learning_rate": 9.986389159320016e-06,
      "loss": 0.53,
      "step": 86
    },
    {
      "epoch": 0.05356009157544393,
      "grad_norm": 3.033294677734375,
      "learning_rate": 9.985643853053053e-06,
      "loss": 0.5345,
      "step": 87
    },
    {
      "epoch": 0.054175724811943285,
      "grad_norm": 2.823150873184204,
      "learning_rate": 9.984878710528615e-06,
      "loss": 0.5482,
      "step": 88
    },
    {
      "epoch": 0.05479135804844264,
      "grad_norm": 3.044827699661255,
      "learning_rate": 9.984093734790955e-06,
      "loss": 0.4908,
      "step": 89
    },
    {
      "epoch": 0.055406991284941996,
      "grad_norm": 2.8079962730407715,
      "learning_rate": 9.983288928963238e-06,
      "loss": 0.5431,
      "step": 90
    },
    {
      "epoch": 0.05602262452144135,
      "grad_norm": 2.7283718585968018,
      "learning_rate": 9.982464296247523e-06,
      "loss": 0.5427,
      "step": 91
    },
    {
      "epoch": 0.05663825775794071,
      "grad_norm": 3.2571794986724854,
      "learning_rate": 9.981619839924757e-06,
      "loss": 0.558,
      "step": 92
    },
    {
      "epoch": 0.05725389099444006,
      "grad_norm": 2.8086328506469727,
      "learning_rate": 9.980755563354755e-06,
      "loss": 0.5257,
      "step": 93
    },
    {
      "epoch": 0.05786952423093942,
      "grad_norm": 3.075275421142578,
      "learning_rate": 9.979871469976197e-06,
      "loss": 0.5065,
      "step": 94
    },
    {
      "epoch": 0.05848515746743877,
      "grad_norm": 2.7873611450195312,
      "learning_rate": 9.978967563306599e-06,
      "loss": 0.522,
      "step": 95
    },
    {
      "epoch": 0.05910079070393813,
      "grad_norm": 2.693174123764038,
      "learning_rate": 9.978043846942314e-06,
      "loss": 0.5522,
      "step": 96
    },
    {
      "epoch": 0.05971642394043748,
      "grad_norm": 3.019831657409668,
      "learning_rate": 9.97710032455851e-06,
      "loss": 0.5408,
      "step": 97
    },
    {
      "epoch": 0.06033205717693684,
      "grad_norm": 2.5392229557037354,
      "learning_rate": 9.976136999909156e-06,
      "loss": 0.5677,
      "step": 98
    },
    {
      "epoch": 0.060947690413436194,
      "grad_norm": 2.809018135070801,
      "learning_rate": 9.975153876827008e-06,
      "loss": 0.5415,
      "step": 99
    },
    {
      "epoch": 0.06156332364993555,
      "grad_norm": 3.3943169116973877,
      "learning_rate": 9.974150959223591e-06,
      "loss": 0.5673,
      "step": 100
    },
    {
      "epoch": 0.06156332364993555,
      "eval_loss": 0.5280219316482544,
      "eval_runtime": 119.7149,
      "eval_samples_per_second": 35.092,
      "eval_steps_per_second": 4.394,
      "step": 100
    },
    {
      "epoch": 0.062178956886434905,
      "grad_norm": 3.059950351715088,
      "learning_rate": 9.973128251089193e-06,
      "loss": 0.577,
      "step": 101
    },
    {
      "epoch": 0.06279459012293426,
      "grad_norm": 3.0409281253814697,
      "learning_rate": 9.972085756492831e-06,
      "loss": 0.5375,
      "step": 102
    },
    {
      "epoch": 0.06341022335943362,
      "grad_norm": 2.890801429748535,
      "learning_rate": 9.971023479582258e-06,
      "loss": 0.5278,
      "step": 103
    },
    {
      "epoch": 0.06402585659593298,
      "grad_norm": 2.8441672325134277,
      "learning_rate": 9.969941424583926e-06,
      "loss": 0.573,
      "step": 104
    },
    {
      "epoch": 0.06464148983243233,
      "grad_norm": 2.7627878189086914,
      "learning_rate": 9.968839595802982e-06,
      "loss": 0.5409,
      "step": 105
    },
    {
      "epoch": 0.06525712306893168,
      "grad_norm": 3.9377424716949463,
      "learning_rate": 9.967717997623245e-06,
      "loss": 0.5905,
      "step": 106
    },
    {
      "epoch": 0.06587275630543105,
      "grad_norm": 2.8966352939605713,
      "learning_rate": 9.966576634507187e-06,
      "loss": 0.5224,
      "step": 107
    },
    {
      "epoch": 0.0664883895419304,
      "grad_norm": 3.656322956085205,
      "learning_rate": 9.965415510995924e-06,
      "loss": 0.54,
      "step": 108
    },
    {
      "epoch": 0.06710402277842975,
      "grad_norm": 2.6917004585266113,
      "learning_rate": 9.964234631709188e-06,
      "loss": 0.5301,
      "step": 109
    },
    {
      "epoch": 0.0677196560149291,
      "grad_norm": 2.6347432136535645,
      "learning_rate": 9.963034001345313e-06,
      "loss": 0.5244,
      "step": 110
    },
    {
      "epoch": 0.06833528925142845,
      "grad_norm": 2.6941580772399902,
      "learning_rate": 9.96181362468122e-06,
      "loss": 0.5582,
      "step": 111
    },
    {
      "epoch": 0.06895092248792782,
      "grad_norm": 2.9986321926116943,
      "learning_rate": 9.960573506572391e-06,
      "loss": 0.5007,
      "step": 112
    },
    {
      "epoch": 0.06956655572442717,
      "grad_norm": 2.8413689136505127,
      "learning_rate": 9.95931365195285e-06,
      "loss": 0.5437,
      "step": 113
    },
    {
      "epoch": 0.07018218896092653,
      "grad_norm": 2.505411148071289,
      "learning_rate": 9.958034065835151e-06,
      "loss": 0.4966,
      "step": 114
    },
    {
      "epoch": 0.07079782219742588,
      "grad_norm": 2.35355544090271,
      "learning_rate": 9.956734753310355e-06,
      "loss": 0.511,
      "step": 115
    },
    {
      "epoch": 0.07141345543392524,
      "grad_norm": 2.7929799556732178,
      "learning_rate": 9.955415719547998e-06,
      "loss": 0.5106,
      "step": 116
    },
    {
      "epoch": 0.0720290886704246,
      "grad_norm": 2.843961715698242,
      "learning_rate": 9.954076969796093e-06,
      "loss": 0.559,
      "step": 117
    },
    {
      "epoch": 0.07264472190692395,
      "grad_norm": 2.5479166507720947,
      "learning_rate": 9.952718509381086e-06,
      "loss": 0.5593,
      "step": 118
    },
    {
      "epoch": 0.0732603551434233,
      "grad_norm": 2.5203702449798584,
      "learning_rate": 9.951340343707852e-06,
      "loss": 0.5323,
      "step": 119
    },
    {
      "epoch": 0.07387598837992267,
      "grad_norm": 2.7408761978149414,
      "learning_rate": 9.949942478259665e-06,
      "loss": 0.5451,
      "step": 120
    },
    {
      "epoch": 0.07449162161642202,
      "grad_norm": 2.840665817260742,
      "learning_rate": 9.948524918598175e-06,
      "loss": 0.5662,
      "step": 121
    },
    {
      "epoch": 0.07510725485292137,
      "grad_norm": 2.8258652687072754,
      "learning_rate": 9.947087670363395e-06,
      "loss": 0.5246,
      "step": 122
    },
    {
      "epoch": 0.07572288808942072,
      "grad_norm": 2.577761650085449,
      "learning_rate": 9.945630739273665e-06,
      "loss": 0.5415,
      "step": 123
    },
    {
      "epoch": 0.07633852132592009,
      "grad_norm": 2.4587228298187256,
      "learning_rate": 9.944154131125643e-06,
      "loss": 0.5058,
      "step": 124
    },
    {
      "epoch": 0.07695415456241944,
      "grad_norm": 2.7867796421051025,
      "learning_rate": 9.942657851794273e-06,
      "loss": 0.5608,
      "step": 125
    },
    {
      "epoch": 0.0775697877989188,
      "grad_norm": 2.8648955821990967,
      "learning_rate": 9.941141907232766e-06,
      "loss": 0.5288,
      "step": 126
    },
    {
      "epoch": 0.07818542103541815,
      "grad_norm": 2.883604049682617,
      "learning_rate": 9.93960630347257e-06,
      "loss": 0.5214,
      "step": 127
    },
    {
      "epoch": 0.07880105427191751,
      "grad_norm": 3.0440468788146973,
      "learning_rate": 9.938051046623353e-06,
      "loss": 0.4979,
      "step": 128
    },
    {
      "epoch": 0.07941668750841686,
      "grad_norm": 2.5932624340057373,
      "learning_rate": 9.936476142872979e-06,
      "loss": 0.5342,
      "step": 129
    },
    {
      "epoch": 0.08003232074491622,
      "grad_norm": 2.6591358184814453,
      "learning_rate": 9.934881598487478e-06,
      "loss": 0.4935,
      "step": 130
    },
    {
      "epoch": 0.08064795398141557,
      "grad_norm": 2.621525526046753,
      "learning_rate": 9.933267419811026e-06,
      "loss": 0.5193,
      "step": 131
    },
    {
      "epoch": 0.08126358721791492,
      "grad_norm": 2.65376877784729,
      "learning_rate": 9.931633613265913e-06,
      "loss": 0.5228,
      "step": 132
    },
    {
      "epoch": 0.08187922045441429,
      "grad_norm": 2.4640400409698486,
      "learning_rate": 9.929980185352525e-06,
      "loss": 0.5097,
      "step": 133
    },
    {
      "epoch": 0.08249485369091364,
      "grad_norm": 2.425771951675415,
      "learning_rate": 9.928307142649315e-06,
      "loss": 0.4832,
      "step": 134
    },
    {
      "epoch": 0.08311048692741299,
      "grad_norm": 2.5605103969573975,
      "learning_rate": 9.926614491812778e-06,
      "loss": 0.5164,
      "step": 135
    },
    {
      "epoch": 0.08372612016391234,
      "grad_norm": 2.524656295776367,
      "learning_rate": 9.924902239577419e-06,
      "loss": 0.5565,
      "step": 136
    },
    {
      "epoch": 0.08434175340041171,
      "grad_norm": 2.78824782371521,
      "learning_rate": 9.923170392755735e-06,
      "loss": 0.5721,
      "step": 137
    },
    {
      "epoch": 0.08495738663691106,
      "grad_norm": 2.4345874786376953,
      "learning_rate": 9.921418958238182e-06,
      "loss": 0.5229,
      "step": 138
    },
    {
      "epoch": 0.08557301987341041,
      "grad_norm": 2.9033138751983643,
      "learning_rate": 9.91964794299315e-06,
      "loss": 0.537,
      "step": 139
    },
    {
      "epoch": 0.08618865310990977,
      "grad_norm": 2.644517660140991,
      "learning_rate": 9.91785735406693e-06,
      "loss": 0.5303,
      "step": 140
    },
    {
      "epoch": 0.08680428634640913,
      "grad_norm": 2.703230381011963,
      "learning_rate": 9.916047198583698e-06,
      "loss": 0.5568,
      "step": 141
    },
    {
      "epoch": 0.08741991958290848,
      "grad_norm": 2.5611250400543213,
      "learning_rate": 9.914217483745472e-06,
      "loss": 0.525,
      "step": 142
    },
    {
      "epoch": 0.08803555281940784,
      "grad_norm": 2.7442543506622314,
      "learning_rate": 9.912368216832094e-06,
      "loss": 0.5422,
      "step": 143
    },
    {
      "epoch": 0.08865118605590719,
      "grad_norm": 2.5576305389404297,
      "learning_rate": 9.910499405201195e-06,
      "loss": 0.5488,
      "step": 144
    },
    {
      "epoch": 0.08926681929240655,
      "grad_norm": 3.037740707397461,
      "learning_rate": 9.90861105628817e-06,
      "loss": 0.5239,
      "step": 145
    },
    {
      "epoch": 0.0898824525289059,
      "grad_norm": 2.7151741981506348,
      "learning_rate": 9.906703177606149e-06,
      "loss": 0.5273,
      "step": 146
    },
    {
      "epoch": 0.09049808576540526,
      "grad_norm": 2.5526156425476074,
      "learning_rate": 9.904775776745959e-06,
      "loss": 0.509,
      "step": 147
    },
    {
      "epoch": 0.09111371900190461,
      "grad_norm": 2.4451942443847656,
      "learning_rate": 9.902828861376101e-06,
      "loss": 0.5146,
      "step": 148
    },
    {
      "epoch": 0.09172935223840398,
      "grad_norm": 2.4070897102355957,
      "learning_rate": 9.900862439242719e-06,
      "loss": 0.5187,
      "step": 149
    },
    {
      "epoch": 0.09234498547490333,
      "grad_norm": 2.5667154788970947,
      "learning_rate": 9.898876518169572e-06,
      "loss": 0.5231,
      "step": 150
    },
    {
      "epoch": 0.09296061871140268,
      "grad_norm": 2.5709125995635986,
      "learning_rate": 9.896871106057989e-06,
      "loss": 0.5416,
      "step": 151
    },
    {
      "epoch": 0.09357625194790203,
      "grad_norm": 2.5680558681488037,
      "learning_rate": 9.894846210886856e-06,
      "loss": 0.5211,
      "step": 152
    },
    {
      "epoch": 0.09419188518440139,
      "grad_norm": 2.598843574523926,
      "learning_rate": 9.892801840712576e-06,
      "loss": 0.5311,
      "step": 153
    },
    {
      "epoch": 0.09480751842090075,
      "grad_norm": 2.8312110900878906,
      "learning_rate": 9.890738003669029e-06,
      "loss": 0.5565,
      "step": 154
    },
    {
      "epoch": 0.0954231516574001,
      "grad_norm": 2.64947247505188,
      "learning_rate": 9.888654707967556e-06,
      "loss": 0.5304,
      "step": 155
    },
    {
      "epoch": 0.09603878489389946,
      "grad_norm": 2.66739559173584,
      "learning_rate": 9.88655196189691e-06,
      "loss": 0.5197,
      "step": 156
    },
    {
      "epoch": 0.09665441813039881,
      "grad_norm": 2.781301259994507,
      "learning_rate": 9.884429773823238e-06,
      "loss": 0.5929,
      "step": 157
    },
    {
      "epoch": 0.09727005136689817,
      "grad_norm": 2.493687152862549,
      "learning_rate": 9.882288152190039e-06,
      "loss": 0.4982,
      "step": 158
    },
    {
      "epoch": 0.09788568460339753,
      "grad_norm": 2.649986982345581,
      "learning_rate": 9.880127105518122e-06,
      "loss": 0.5235,
      "step": 159
    },
    {
      "epoch": 0.09850131783989688,
      "grad_norm": 2.764665365219116,
      "learning_rate": 9.877946642405598e-06,
      "loss": 0.5313,
      "step": 160
    },
    {
      "epoch": 0.09911695107639623,
      "grad_norm": 2.6386075019836426,
      "learning_rate": 9.875746771527817e-06,
      "loss": 0.5328,
      "step": 161
    },
    {
      "epoch": 0.0997325843128956,
      "grad_norm": 2.810497999191284,
      "learning_rate": 9.873527501637352e-06,
      "loss": 0.5359,
      "step": 162
    },
    {
      "epoch": 0.10034821754939495,
      "grad_norm": 2.4889254570007324,
      "learning_rate": 9.871288841563956e-06,
      "loss": 0.5219,
      "step": 163
    },
    {
      "epoch": 0.1009638507858943,
      "grad_norm": 2.7173171043395996,
      "learning_rate": 9.869030800214531e-06,
      "loss": 0.5609,
      "step": 164
    },
    {
      "epoch": 0.10157948402239365,
      "grad_norm": 2.800018548965454,
      "learning_rate": 9.866753386573091e-06,
      "loss": 0.5286,
      "step": 165
    },
    {
      "epoch": 0.10219511725889302,
      "grad_norm": 2.4725608825683594,
      "learning_rate": 9.864456609700726e-06,
      "loss": 0.5079,
      "step": 166
    },
    {
      "epoch": 0.10281075049539237,
      "grad_norm": 2.834932327270508,
      "learning_rate": 9.86214047873556e-06,
      "loss": 0.5567,
      "step": 167
    },
    {
      "epoch": 0.10342638373189172,
      "grad_norm": 2.691657066345215,
      "learning_rate": 9.859805002892733e-06,
      "loss": 0.5305,
      "step": 168
    },
    {
      "epoch": 0.10404201696839108,
      "grad_norm": 2.373129367828369,
      "learning_rate": 9.857450191464337e-06,
      "loss": 0.5059,
      "step": 169
    },
    {
      "epoch": 0.10465765020489044,
      "grad_norm": 2.8620853424072266,
      "learning_rate": 9.855076053819409e-06,
      "loss": 0.5431,
      "step": 170
    },
    {
      "epoch": 0.1052732834413898,
      "grad_norm": 2.4922218322753906,
      "learning_rate": 9.852682599403867e-06,
      "loss": 0.5163,
      "step": 171
    },
    {
      "epoch": 0.10588891667788915,
      "grad_norm": 2.6138806343078613,
      "learning_rate": 9.85026983774049e-06,
      "loss": 0.5466,
      "step": 172
    },
    {
      "epoch": 0.1065045499143885,
      "grad_norm": 2.4458065032958984,
      "learning_rate": 9.847837778428873e-06,
      "loss": 0.5397,
      "step": 173
    },
    {
      "epoch": 0.10712018315088787,
      "grad_norm": 2.5585734844207764,
      "learning_rate": 9.84538643114539e-06,
      "loss": 0.5771,
      "step": 174
    },
    {
      "epoch": 0.10773581638738722,
      "grad_norm": 2.4674112796783447,
      "learning_rate": 9.842915805643156e-06,
      "loss": 0.5292,
      "step": 175
    },
    {
      "epoch": 0.10835144962388657,
      "grad_norm": 2.4433512687683105,
      "learning_rate": 9.840425911751987e-06,
      "loss": 0.5505,
      "step": 176
    },
    {
      "epoch": 0.10896708286038592,
      "grad_norm": 2.528373956680298,
      "learning_rate": 9.837916759378363e-06,
      "loss": 0.5325,
      "step": 177
    },
    {
      "epoch": 0.10958271609688527,
      "grad_norm": 2.3507394790649414,
      "learning_rate": 9.835388358505383e-06,
      "loss": 0.5224,
      "step": 178
    },
    {
      "epoch": 0.11019834933338464,
      "grad_norm": 2.5113093852996826,
      "learning_rate": 9.832840719192737e-06,
      "loss": 0.5798,
      "step": 179
    },
    {
      "epoch": 0.11081398256988399,
      "grad_norm": 2.4653539657592773,
      "learning_rate": 9.830273851576651e-06,
      "loss": 0.5304,
      "step": 180
    },
    {
      "epoch": 0.11142961580638334,
      "grad_norm": 2.6374294757843018,
      "learning_rate": 9.827687765869859e-06,
      "loss": 0.524,
      "step": 181
    },
    {
      "epoch": 0.1120452490428827,
      "grad_norm": 2.344590425491333,
      "learning_rate": 9.825082472361558e-06,
      "loss": 0.5064,
      "step": 182
    },
    {
      "epoch": 0.11266088227938206,
      "grad_norm": 2.315798759460449,
      "learning_rate": 9.822457981417362e-06,
      "loss": 0.5046,
      "step": 183
    },
    {
      "epoch": 0.11327651551588142,
      "grad_norm": 2.5081753730773926,
      "learning_rate": 9.819814303479268e-06,
      "loss": 0.4975,
      "step": 184
    },
    {
      "epoch": 0.11389214875238077,
      "grad_norm": 2.3528170585632324,
      "learning_rate": 9.817151449065612e-06,
      "loss": 0.5297,
      "step": 185
    },
    {
      "epoch": 0.11450778198888012,
      "grad_norm": 2.668283462524414,
      "learning_rate": 9.814469428771028e-06,
      "loss": 0.5598,
      "step": 186
    },
    {
      "epoch": 0.11512341522537949,
      "grad_norm": 2.670201539993286,
      "learning_rate": 9.811768253266401e-06,
      "loss": 0.4961,
      "step": 187
    },
    {
      "epoch": 0.11573904846187884,
      "grad_norm": 2.6510801315307617,
      "learning_rate": 9.809047933298834e-06,
      "loss": 0.5335,
      "step": 188
    },
    {
      "epoch": 0.11635468169837819,
      "grad_norm": 2.9253435134887695,
      "learning_rate": 9.806308479691595e-06,
      "loss": 0.5289,
      "step": 189
    },
    {
      "epoch": 0.11697031493487754,
      "grad_norm": 2.3529560565948486,
      "learning_rate": 9.803549903344081e-06,
      "loss": 0.5525,
      "step": 190
    },
    {
      "epoch": 0.11758594817137691,
      "grad_norm": 2.510202407836914,
      "learning_rate": 9.80077221523177e-06,
      "loss": 0.4958,
      "step": 191
    },
    {
      "epoch": 0.11820158140787626,
      "grad_norm": 2.914436101913452,
      "learning_rate": 9.79797542640618e-06,
      "loss": 0.5495,
      "step": 192
    },
    {
      "epoch": 0.11881721464437561,
      "grad_norm": 2.32578444480896,
      "learning_rate": 9.79515954799483e-06,
      "loss": 0.5378,
      "step": 193
    },
    {
      "epoch": 0.11943284788087496,
      "grad_norm": 2.4786176681518555,
      "learning_rate": 9.792324591201179e-06,
      "loss": 0.5331,
      "step": 194
    },
    {
      "epoch": 0.12004848111737433,
      "grad_norm": 2.7119698524475098,
      "learning_rate": 9.789470567304604e-06,
      "loss": 0.5563,
      "step": 195
    },
    {
      "epoch": 0.12066411435387368,
      "grad_norm": 2.453625202178955,
      "learning_rate": 9.786597487660336e-06,
      "loss": 0.53,
      "step": 196
    },
    {
      "epoch": 0.12127974759037304,
      "grad_norm": 2.4620349407196045,
      "learning_rate": 9.78370536369943e-06,
      "loss": 0.543,
      "step": 197
    },
    {
      "epoch": 0.12189538082687239,
      "grad_norm": 2.302342414855957,
      "learning_rate": 9.780794206928704e-06,
      "loss": 0.5526,
      "step": 198
    },
    {
      "epoch": 0.12251101406337174,
      "grad_norm": 2.308497905731201,
      "learning_rate": 9.777864028930705e-06,
      "loss": 0.4909,
      "step": 199
    },
    {
      "epoch": 0.1231266472998711,
      "grad_norm": 2.459630250930786,
      "learning_rate": 9.774914841363661e-06,
      "loss": 0.495,
      "step": 200
    },
    {
      "epoch": 0.1231266472998711,
      "eval_loss": 0.51694256067276,
      "eval_runtime": 119.7286,
      "eval_samples_per_second": 35.088,
      "eval_steps_per_second": 4.393,
      "step": 200
    },
    {
      "epoch": 0.12374228053637046,
      "grad_norm": 2.3562207221984863,
      "learning_rate": 9.771946655961431e-06,
      "loss": 0.5513,
      "step": 201
    },
    {
      "epoch": 0.12435791377286981,
      "grad_norm": 2.693406343460083,
      "learning_rate": 9.768959484533461e-06,
      "loss": 0.5348,
      "step": 202
    },
    {
      "epoch": 0.12497354700936916,
      "grad_norm": 2.5745203495025635,
      "learning_rate": 9.765953338964736e-06,
      "loss": 0.5027,
      "step": 203
    },
    {
      "epoch": 0.12558918024586851,
      "grad_norm": 2.345874786376953,
      "learning_rate": 9.762928231215731e-06,
      "loss": 0.5171,
      "step": 204
    },
    {
      "epoch": 0.12620481348236787,
      "grad_norm": 2.9996278285980225,
      "learning_rate": 9.75988417332237e-06,
      "loss": 0.54,
      "step": 205
    },
    {
      "epoch": 0.12682044671886725,
      "grad_norm": 2.7544898986816406,
      "learning_rate": 9.756821177395969e-06,
      "loss": 0.5506,
      "step": 206
    },
    {
      "epoch": 0.1274360799553666,
      "grad_norm": 2.560554265975952,
      "learning_rate": 9.753739255623193e-06,
      "loss": 0.537,
      "step": 207
    },
    {
      "epoch": 0.12805171319186595,
      "grad_norm": 2.4070980548858643,
      "learning_rate": 9.750638420266008e-06,
      "loss": 0.5257,
      "step": 208
    },
    {
      "epoch": 0.1286673464283653,
      "grad_norm": 2.6485419273376465,
      "learning_rate": 9.747518683661632e-06,
      "loss": 0.5227,
      "step": 209
    },
    {
      "epoch": 0.12928297966486466,
      "grad_norm": 2.5590012073516846,
      "learning_rate": 9.744380058222483e-06,
      "loss": 0.5281,
      "step": 210
    },
    {
      "epoch": 0.129898612901364,
      "grad_norm": 2.250981330871582,
      "learning_rate": 9.741222556436132e-06,
      "loss": 0.5149,
      "step": 211
    },
    {
      "epoch": 0.13051424613786336,
      "grad_norm": 2.4137957096099854,
      "learning_rate": 9.738046190865254e-06,
      "loss": 0.5331,
      "step": 212
    },
    {
      "epoch": 0.1311298793743627,
      "grad_norm": 2.0661706924438477,
      "learning_rate": 9.734850974147573e-06,
      "loss": 0.497,
      "step": 213
    },
    {
      "epoch": 0.1317455126108621,
      "grad_norm": 2.2877936363220215,
      "learning_rate": 9.731636918995821e-06,
      "loss": 0.4819,
      "step": 214
    },
    {
      "epoch": 0.13236114584736144,
      "grad_norm": 2.485813856124878,
      "learning_rate": 9.72840403819768e-06,
      "loss": 0.4859,
      "step": 215
    },
    {
      "epoch": 0.1329767790838608,
      "grad_norm": 2.2839794158935547,
      "learning_rate": 9.72515234461573e-06,
      "loss": 0.4996,
      "step": 216
    },
    {
      "epoch": 0.13359241232036015,
      "grad_norm": 2.558027744293213,
      "learning_rate": 9.721881851187406e-06,
      "loss": 0.5171,
      "step": 217
    },
    {
      "epoch": 0.1342080455568595,
      "grad_norm": 2.5948326587677,
      "learning_rate": 9.718592570924938e-06,
      "loss": 0.5091,
      "step": 218
    },
    {
      "epoch": 0.13482367879335885,
      "grad_norm": 2.0874948501586914,
      "learning_rate": 9.715284516915303e-06,
      "loss": 0.5083,
      "step": 219
    },
    {
      "epoch": 0.1354393120298582,
      "grad_norm": 2.195895195007324,
      "learning_rate": 9.711957702320176e-06,
      "loss": 0.5147,
      "step": 220
    },
    {
      "epoch": 0.13605494526635756,
      "grad_norm": 2.4264023303985596,
      "learning_rate": 9.708612140375867e-06,
      "loss": 0.5375,
      "step": 221
    },
    {
      "epoch": 0.1366705785028569,
      "grad_norm": 2.349743604660034,
      "learning_rate": 9.705247844393284e-06,
      "loss": 0.5058,
      "step": 222
    },
    {
      "epoch": 0.1372862117393563,
      "grad_norm": 2.254120111465454,
      "learning_rate": 9.701864827757868e-06,
      "loss": 0.52,
      "step": 223
    },
    {
      "epoch": 0.13790184497585564,
      "grad_norm": 2.2042782306671143,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.5309,
      "step": 224
    },
    {
      "epoch": 0.138517478212355,
      "grad_norm": 2.114847183227539,
      "learning_rate": 9.695042686442662e-06,
      "loss": 0.4971,
      "step": 225
    },
    {
      "epoch": 0.13913311144885435,
      "grad_norm": 2.7704129219055176,
      "learning_rate": 9.691603588905956e-06,
      "loss": 0.4993,
      "step": 226
    },
    {
      "epoch": 0.1397487446853537,
      "grad_norm": 2.32524037361145,
      "learning_rate": 9.688145825002475e-06,
      "loss": 0.532,
      "step": 227
    },
    {
      "epoch": 0.14036437792185305,
      "grad_norm": 2.225130319595337,
      "learning_rate": 9.684669408489542e-06,
      "loss": 0.5176,
      "step": 228
    },
    {
      "epoch": 0.1409800111583524,
      "grad_norm": 2.4317879676818848,
      "learning_rate": 9.681174353198687e-06,
      "loss": 0.5321,
      "step": 229
    },
    {
      "epoch": 0.14159564439485176,
      "grad_norm": 2.2843475341796875,
      "learning_rate": 9.6776606730356e-06,
      "loss": 0.5326,
      "step": 230
    },
    {
      "epoch": 0.14221127763135114,
      "grad_norm": 2.269463062286377,
      "learning_rate": 9.674128381980073e-06,
      "loss": 0.5145,
      "step": 231
    },
    {
      "epoch": 0.1428269108678505,
      "grad_norm": 2.3606982231140137,
      "learning_rate": 9.670577494085945e-06,
      "loss": 0.5358,
      "step": 232
    },
    {
      "epoch": 0.14344254410434984,
      "grad_norm": 2.1334755420684814,
      "learning_rate": 9.667008023481045e-06,
      "loss": 0.5177,
      "step": 233
    },
    {
      "epoch": 0.1440581773408492,
      "grad_norm": 2.2030303478240967,
      "learning_rate": 9.663419984367139e-06,
      "loss": 0.5027,
      "step": 234
    },
    {
      "epoch": 0.14467381057734854,
      "grad_norm": 2.3622584342956543,
      "learning_rate": 9.659813391019867e-06,
      "loss": 0.5297,
      "step": 235
    },
    {
      "epoch": 0.1452894438138479,
      "grad_norm": 2.399989128112793,
      "learning_rate": 9.656188257788694e-06,
      "loss": 0.5509,
      "step": 236
    },
    {
      "epoch": 0.14590507705034725,
      "grad_norm": 2.143125057220459,
      "learning_rate": 9.652544599096846e-06,
      "loss": 0.5156,
      "step": 237
    },
    {
      "epoch": 0.1465207102868466,
      "grad_norm": 1.8969368934631348,
      "learning_rate": 9.648882429441258e-06,
      "loss": 0.4808,
      "step": 238
    },
    {
      "epoch": 0.14713634352334598,
      "grad_norm": 2.2974019050598145,
      "learning_rate": 9.645201763392513e-06,
      "loss": 0.5358,
      "step": 239
    },
    {
      "epoch": 0.14775197675984533,
      "grad_norm": 2.315643787384033,
      "learning_rate": 9.641502615594789e-06,
      "loss": 0.5157,
      "step": 240
    },
    {
      "epoch": 0.14836760999634468,
      "grad_norm": 2.4144253730773926,
      "learning_rate": 9.637785000765789e-06,
      "loss": 0.517,
      "step": 241
    },
    {
      "epoch": 0.14898324323284404,
      "grad_norm": 2.0618128776550293,
      "learning_rate": 9.634048933696697e-06,
      "loss": 0.4799,
      "step": 242
    },
    {
      "epoch": 0.1495988764693434,
      "grad_norm": 2.4816224575042725,
      "learning_rate": 9.630294429252112e-06,
      "loss": 0.5059,
      "step": 243
    },
    {
      "epoch": 0.15021450970584274,
      "grad_norm": 2.3560738563537598,
      "learning_rate": 9.626521502369984e-06,
      "loss": 0.5393,
      "step": 244
    },
    {
      "epoch": 0.1508301429423421,
      "grad_norm": 2.447284698486328,
      "learning_rate": 9.622730168061568e-06,
      "loss": 0.5179,
      "step": 245
    },
    {
      "epoch": 0.15144577617884145,
      "grad_norm": 2.3853726387023926,
      "learning_rate": 9.618920441411346e-06,
      "loss": 0.4957,
      "step": 246
    },
    {
      "epoch": 0.1520614094153408,
      "grad_norm": 2.0928761959075928,
      "learning_rate": 9.615092337576987e-06,
      "loss": 0.4782,
      "step": 247
    },
    {
      "epoch": 0.15267704265184018,
      "grad_norm": 2.4348087310791016,
      "learning_rate": 9.611245871789273e-06,
      "loss": 0.5522,
      "step": 248
    },
    {
      "epoch": 0.15329267588833953,
      "grad_norm": 2.5106360912323,
      "learning_rate": 9.60738105935204e-06,
      "loss": 0.5157,
      "step": 249
    },
    {
      "epoch": 0.15390830912483888,
      "grad_norm": 2.3466601371765137,
      "learning_rate": 9.603497915642122e-06,
      "loss": 0.5197,
      "step": 250
    },
    {
      "epoch": 0.15452394236133823,
      "grad_norm": 2.5545997619628906,
      "learning_rate": 9.599596456109286e-06,
      "loss": 0.4927,
      "step": 251
    },
    {
      "epoch": 0.1551395755978376,
      "grad_norm": 2.521439790725708,
      "learning_rate": 9.595676696276173e-06,
      "loss": 0.5541,
      "step": 252
    },
    {
      "epoch": 0.15575520883433694,
      "grad_norm": 2.3179068565368652,
      "learning_rate": 9.591738651738235e-06,
      "loss": 0.4994,
      "step": 253
    },
    {
      "epoch": 0.1563708420708363,
      "grad_norm": 2.2857041358947754,
      "learning_rate": 9.58778233816367e-06,
      "loss": 0.537,
      "step": 254
    },
    {
      "epoch": 0.15698647530733564,
      "grad_norm": 2.6174025535583496,
      "learning_rate": 9.583807771293366e-06,
      "loss": 0.5298,
      "step": 255
    },
    {
      "epoch": 0.15760210854383502,
      "grad_norm": 2.400820016860962,
      "learning_rate": 9.579814966940833e-06,
      "loss": 0.4969,
      "step": 256
    },
    {
      "epoch": 0.15821774178033438,
      "grad_norm": 2.4716885089874268,
      "learning_rate": 9.575803940992143e-06,
      "loss": 0.5227,
      "step": 257
    },
    {
      "epoch": 0.15883337501683373,
      "grad_norm": 2.1826908588409424,
      "learning_rate": 9.571774709405866e-06,
      "loss": 0.4987,
      "step": 258
    },
    {
      "epoch": 0.15944900825333308,
      "grad_norm": 2.163158893585205,
      "learning_rate": 9.567727288213005e-06,
      "loss": 0.5295,
      "step": 259
    },
    {
      "epoch": 0.16006464148983243,
      "grad_norm": 2.319798469543457,
      "learning_rate": 9.563661693516934e-06,
      "loss": 0.5388,
      "step": 260
    },
    {
      "epoch": 0.16068027472633178,
      "grad_norm": 2.828787088394165,
      "learning_rate": 9.559577941493334e-06,
      "loss": 0.5277,
      "step": 261
    },
    {
      "epoch": 0.16129590796283114,
      "grad_norm": 2.2098703384399414,
      "learning_rate": 9.55547604839013e-06,
      "loss": 0.5134,
      "step": 262
    },
    {
      "epoch": 0.1619115411993305,
      "grad_norm": 2.636500120162964,
      "learning_rate": 9.551356030527417e-06,
      "loss": 0.5242,
      "step": 263
    },
    {
      "epoch": 0.16252717443582984,
      "grad_norm": 2.5841751098632812,
      "learning_rate": 9.547217904297411e-06,
      "loss": 0.4996,
      "step": 264
    },
    {
      "epoch": 0.16314280767232922,
      "grad_norm": 2.4081389904022217,
      "learning_rate": 9.543061686164374e-06,
      "loss": 0.5207,
      "step": 265
    },
    {
      "epoch": 0.16375844090882857,
      "grad_norm": 2.21783447265625,
      "learning_rate": 9.538887392664544e-06,
      "loss": 0.5077,
      "step": 266
    },
    {
      "epoch": 0.16437407414532793,
      "grad_norm": 2.636134386062622,
      "learning_rate": 9.534695040406082e-06,
      "loss": 0.4935,
      "step": 267
    },
    {
      "epoch": 0.16498970738182728,
      "grad_norm": 2.134760856628418,
      "learning_rate": 9.530484646068996e-06,
      "loss": 0.5007,
      "step": 268
    },
    {
      "epoch": 0.16560534061832663,
      "grad_norm": 1.9881904125213623,
      "learning_rate": 9.526256226405075e-06,
      "loss": 0.5029,
      "step": 269
    },
    {
      "epoch": 0.16622097385482598,
      "grad_norm": 2.2205843925476074,
      "learning_rate": 9.52200979823783e-06,
      "loss": 0.4938,
      "step": 270
    },
    {
      "epoch": 0.16683660709132533,
      "grad_norm": 2.17769455909729,
      "learning_rate": 9.517745378462417e-06,
      "loss": 0.5203,
      "step": 271
    },
    {
      "epoch": 0.1674522403278247,
      "grad_norm": 2.097182273864746,
      "learning_rate": 9.513462984045577e-06,
      "loss": 0.5235,
      "step": 272
    },
    {
      "epoch": 0.16806787356432407,
      "grad_norm": 2.164889097213745,
      "learning_rate": 9.50916263202557e-06,
      "loss": 0.5092,
      "step": 273
    },
    {
      "epoch": 0.16868350680082342,
      "grad_norm": 2.035274028778076,
      "learning_rate": 9.504844339512096e-06,
      "loss": 0.4962,
      "step": 274
    },
    {
      "epoch": 0.16929914003732277,
      "grad_norm": 2.267477512359619,
      "learning_rate": 9.500508123686241e-06,
      "loss": 0.5486,
      "step": 275
    },
    {
      "epoch": 0.16991477327382212,
      "grad_norm": 2.092531204223633,
      "learning_rate": 9.496154001800397e-06,
      "loss": 0.487,
      "step": 276
    },
    {
      "epoch": 0.17053040651032148,
      "grad_norm": 2.2290382385253906,
      "learning_rate": 9.491781991178203e-06,
      "loss": 0.4949,
      "step": 277
    },
    {
      "epoch": 0.17114603974682083,
      "grad_norm": 2.1672849655151367,
      "learning_rate": 9.487392109214468e-06,
      "loss": 0.4772,
      "step": 278
    },
    {
      "epoch": 0.17176167298332018,
      "grad_norm": 2.4247779846191406,
      "learning_rate": 9.482984373375105e-06,
      "loss": 0.5293,
      "step": 279
    },
    {
      "epoch": 0.17237730621981953,
      "grad_norm": 2.375716209411621,
      "learning_rate": 9.478558801197065e-06,
      "loss": 0.5297,
      "step": 280
    },
    {
      "epoch": 0.1729929394563189,
      "grad_norm": 2.107602834701538,
      "learning_rate": 9.474115410288263e-06,
      "loss": 0.464,
      "step": 281
    },
    {
      "epoch": 0.17360857269281826,
      "grad_norm": 2.328535556793213,
      "learning_rate": 9.469654218327503e-06,
      "loss": 0.5001,
      "step": 282
    },
    {
      "epoch": 0.17422420592931762,
      "grad_norm": 2.2215569019317627,
      "learning_rate": 9.465175243064428e-06,
      "loss": 0.4842,
      "step": 283
    },
    {
      "epoch": 0.17483983916581697,
      "grad_norm": 2.3649046421051025,
      "learning_rate": 9.460678502319419e-06,
      "loss": 0.5121,
      "step": 284
    },
    {
      "epoch": 0.17545547240231632,
      "grad_norm": 2.451415538787842,
      "learning_rate": 9.456164013983546e-06,
      "loss": 0.5342,
      "step": 285
    },
    {
      "epoch": 0.17607110563881567,
      "grad_norm": 2.2832226753234863,
      "learning_rate": 9.451631796018495e-06,
      "loss": 0.5329,
      "step": 286
    },
    {
      "epoch": 0.17668673887531502,
      "grad_norm": 2.2211861610412598,
      "learning_rate": 9.44708186645649e-06,
      "loss": 0.54,
      "step": 287
    },
    {
      "epoch": 0.17730237211181438,
      "grad_norm": 2.234879493713379,
      "learning_rate": 9.442514243400218e-06,
      "loss": 0.5407,
      "step": 288
    },
    {
      "epoch": 0.17791800534831373,
      "grad_norm": 2.016080617904663,
      "learning_rate": 9.437928945022772e-06,
      "loss": 0.5114,
      "step": 289
    },
    {
      "epoch": 0.1785336385848131,
      "grad_norm": 2.1441290378570557,
      "learning_rate": 9.433325989567562e-06,
      "loss": 0.512,
      "step": 290
    },
    {
      "epoch": 0.17914927182131246,
      "grad_norm": 2.112748384475708,
      "learning_rate": 9.428705395348254e-06,
      "loss": 0.5141,
      "step": 291
    },
    {
      "epoch": 0.1797649050578118,
      "grad_norm": 1.9823687076568604,
      "learning_rate": 9.424067180748692e-06,
      "loss": 0.5006,
      "step": 292
    },
    {
      "epoch": 0.18038053829431117,
      "grad_norm": 2.2916078567504883,
      "learning_rate": 9.419411364222826e-06,
      "loss": 0.5134,
      "step": 293
    },
    {
      "epoch": 0.18099617153081052,
      "grad_norm": 2.619990110397339,
      "learning_rate": 9.414737964294636e-06,
      "loss": 0.4893,
      "step": 294
    },
    {
      "epoch": 0.18161180476730987,
      "grad_norm": 2.1955621242523193,
      "learning_rate": 9.410046999558062e-06,
      "loss": 0.4994,
      "step": 295
    },
    {
      "epoch": 0.18222743800380922,
      "grad_norm": 2.555013656616211,
      "learning_rate": 9.40533848867693e-06,
      "loss": 0.5372,
      "step": 296
    },
    {
      "epoch": 0.18284307124030857,
      "grad_norm": 2.11368727684021,
      "learning_rate": 9.400612450384874e-06,
      "loss": 0.4894,
      "step": 297
    },
    {
      "epoch": 0.18345870447680795,
      "grad_norm": 2.3000552654266357,
      "learning_rate": 9.395868903485269e-06,
      "loss": 0.5129,
      "step": 298
    },
    {
      "epoch": 0.1840743377133073,
      "grad_norm": 2.0731847286224365,
      "learning_rate": 9.391107866851143e-06,
      "loss": 0.4828,
      "step": 299
    },
    {
      "epoch": 0.18468997094980666,
      "grad_norm": 2.25274395942688,
      "learning_rate": 9.386329359425117e-06,
      "loss": 0.5173,
      "step": 300
    },
    {
      "epoch": 0.18468997094980666,
      "eval_loss": 0.5033955574035645,
      "eval_runtime": 119.4386,
      "eval_samples_per_second": 35.173,
      "eval_steps_per_second": 4.404,
      "step": 300
    },
    {
      "epoch": 0.185305604186306,
      "grad_norm": 2.0389397144317627,
      "learning_rate": 9.381533400219319e-06,
      "loss": 0.4855,
      "step": 301
    },
    {
      "epoch": 0.18592123742280536,
      "grad_norm": 2.3065874576568604,
      "learning_rate": 9.376720008315312e-06,
      "loss": 0.4963,
      "step": 302
    },
    {
      "epoch": 0.18653687065930472,
      "grad_norm": 2.1684486865997314,
      "learning_rate": 9.37188920286402e-06,
      "loss": 0.5139,
      "step": 303
    },
    {
      "epoch": 0.18715250389580407,
      "grad_norm": 2.176117181777954,
      "learning_rate": 9.36704100308565e-06,
      "loss": 0.4582,
      "step": 304
    },
    {
      "epoch": 0.18776813713230342,
      "grad_norm": 2.326688766479492,
      "learning_rate": 9.36217542826961e-06,
      "loss": 0.5286,
      "step": 305
    },
    {
      "epoch": 0.18838377036880277,
      "grad_norm": 2.441455125808716,
      "learning_rate": 9.357292497774447e-06,
      "loss": 0.5235,
      "step": 306
    },
    {
      "epoch": 0.18899940360530215,
      "grad_norm": 2.0674080848693848,
      "learning_rate": 9.352392231027752e-06,
      "loss": 0.4739,
      "step": 307
    },
    {
      "epoch": 0.1896150368418015,
      "grad_norm": 2.186354637145996,
      "learning_rate": 9.347474647526095e-06,
      "loss": 0.5179,
      "step": 308
    },
    {
      "epoch": 0.19023067007830086,
      "grad_norm": 2.2529890537261963,
      "learning_rate": 9.342539766834945e-06,
      "loss": 0.4914,
      "step": 309
    },
    {
      "epoch": 0.1908463033148002,
      "grad_norm": 2.005094051361084,
      "learning_rate": 9.337587608588588e-06,
      "loss": 0.4972,
      "step": 310
    },
    {
      "epoch": 0.19146193655129956,
      "grad_norm": 2.2308027744293213,
      "learning_rate": 9.332618192490054e-06,
      "loss": 0.4884,
      "step": 311
    },
    {
      "epoch": 0.1920775697877989,
      "grad_norm": 2.4357378482818604,
      "learning_rate": 9.327631538311036e-06,
      "loss": 0.5289,
      "step": 312
    },
    {
      "epoch": 0.19269320302429827,
      "grad_norm": 2.0533759593963623,
      "learning_rate": 9.322627665891807e-06,
      "loss": 0.4723,
      "step": 313
    },
    {
      "epoch": 0.19330883626079762,
      "grad_norm": 2.2002153396606445,
      "learning_rate": 9.317606595141156e-06,
      "loss": 0.5384,
      "step": 314
    },
    {
      "epoch": 0.193924469497297,
      "grad_norm": 2.239171266555786,
      "learning_rate": 9.312568346036288e-06,
      "loss": 0.524,
      "step": 315
    },
    {
      "epoch": 0.19454010273379635,
      "grad_norm": 2.389228343963623,
      "learning_rate": 9.307512938622762e-06,
      "loss": 0.5396,
      "step": 316
    },
    {
      "epoch": 0.1951557359702957,
      "grad_norm": 1.962758183479309,
      "learning_rate": 9.302440393014402e-06,
      "loss": 0.4919,
      "step": 317
    },
    {
      "epoch": 0.19577136920679505,
      "grad_norm": 2.2056610584259033,
      "learning_rate": 9.29735072939322e-06,
      "loss": 0.5253,
      "step": 318
    },
    {
      "epoch": 0.1963870024432944,
      "grad_norm": 2.3771414756774902,
      "learning_rate": 9.292243968009332e-06,
      "loss": 0.5048,
      "step": 319
    },
    {
      "epoch": 0.19700263567979376,
      "grad_norm": 2.1659281253814697,
      "learning_rate": 9.287120129180884e-06,
      "loss": 0.4925,
      "step": 320
    },
    {
      "epoch": 0.1976182689162931,
      "grad_norm": 2.415493965148926,
      "learning_rate": 9.281979233293966e-06,
      "loss": 0.5006,
      "step": 321
    },
    {
      "epoch": 0.19823390215279246,
      "grad_norm": 2.220350980758667,
      "learning_rate": 9.276821300802535e-06,
      "loss": 0.5117,
      "step": 322
    },
    {
      "epoch": 0.19884953538929184,
      "grad_norm": 2.1746761798858643,
      "learning_rate": 9.271646352228324e-06,
      "loss": 0.5036,
      "step": 323
    },
    {
      "epoch": 0.1994651686257912,
      "grad_norm": 2.0392343997955322,
      "learning_rate": 9.266454408160779e-06,
      "loss": 0.4946,
      "step": 324
    },
    {
      "epoch": 0.20008080186229055,
      "grad_norm": 2.0878195762634277,
      "learning_rate": 9.261245489256956e-06,
      "loss": 0.5064,
      "step": 325
    },
    {
      "epoch": 0.2006964350987899,
      "grad_norm": 2.1423075199127197,
      "learning_rate": 9.25601961624145e-06,
      "loss": 0.5088,
      "step": 326
    },
    {
      "epoch": 0.20131206833528925,
      "grad_norm": 2.3595194816589355,
      "learning_rate": 9.250776809906313e-06,
      "loss": 0.5523,
      "step": 327
    },
    {
      "epoch": 0.2019277015717886,
      "grad_norm": 2.1990396976470947,
      "learning_rate": 9.24551709111097e-06,
      "loss": 0.499,
      "step": 328
    },
    {
      "epoch": 0.20254333480828796,
      "grad_norm": 2.1120452880859375,
      "learning_rate": 9.24024048078213e-06,
      "loss": 0.5174,
      "step": 329
    },
    {
      "epoch": 0.2031589680447873,
      "grad_norm": 2.780289649963379,
      "learning_rate": 9.234946999913717e-06,
      "loss": 0.5507,
      "step": 330
    },
    {
      "epoch": 0.20377460128128666,
      "grad_norm": 2.629866123199463,
      "learning_rate": 9.229636669566769e-06,
      "loss": 0.5358,
      "step": 331
    },
    {
      "epoch": 0.20439023451778604,
      "grad_norm": 2.216796636581421,
      "learning_rate": 9.224309510869364e-06,
      "loss": 0.4949,
      "step": 332
    },
    {
      "epoch": 0.2050058677542854,
      "grad_norm": 2.279585838317871,
      "learning_rate": 9.218965545016538e-06,
      "loss": 0.4953,
      "step": 333
    },
    {
      "epoch": 0.20562150099078474,
      "grad_norm": 2.3182451725006104,
      "learning_rate": 9.213604793270196e-06,
      "loss": 0.5338,
      "step": 334
    },
    {
      "epoch": 0.2062371342272841,
      "grad_norm": 2.1451542377471924,
      "learning_rate": 9.208227276959028e-06,
      "loss": 0.5004,
      "step": 335
    },
    {
      "epoch": 0.20685276746378345,
      "grad_norm": 2.218402147293091,
      "learning_rate": 9.202833017478421e-06,
      "loss": 0.5257,
      "step": 336
    },
    {
      "epoch": 0.2074684007002828,
      "grad_norm": 2.235485076904297,
      "learning_rate": 9.197422036290386e-06,
      "loss": 0.5282,
      "step": 337
    },
    {
      "epoch": 0.20808403393678215,
      "grad_norm": 2.1318018436431885,
      "learning_rate": 9.191994354923459e-06,
      "loss": 0.4672,
      "step": 338
    },
    {
      "epoch": 0.2086996671732815,
      "grad_norm": 2.1314377784729004,
      "learning_rate": 9.186549994972618e-06,
      "loss": 0.5151,
      "step": 339
    },
    {
      "epoch": 0.20931530040978089,
      "grad_norm": 2.10489821434021,
      "learning_rate": 9.181088978099203e-06,
      "loss": 0.4928,
      "step": 340
    },
    {
      "epoch": 0.20993093364628024,
      "grad_norm": 2.1959035396575928,
      "learning_rate": 9.17561132603083e-06,
      "loss": 0.4913,
      "step": 341
    },
    {
      "epoch": 0.2105465668827796,
      "grad_norm": 2.1556055545806885,
      "learning_rate": 9.170117060561296e-06,
      "loss": 0.4652,
      "step": 342
    },
    {
      "epoch": 0.21116220011927894,
      "grad_norm": 2.0064280033111572,
      "learning_rate": 9.164606203550498e-06,
      "loss": 0.4924,
      "step": 343
    },
    {
      "epoch": 0.2117778333557783,
      "grad_norm": 2.0715909004211426,
      "learning_rate": 9.159078776924347e-06,
      "loss": 0.4939,
      "step": 344
    },
    {
      "epoch": 0.21239346659227765,
      "grad_norm": 1.9962373971939087,
      "learning_rate": 9.153534802674675e-06,
      "loss": 0.5172,
      "step": 345
    },
    {
      "epoch": 0.213009099828777,
      "grad_norm": 2.0290207862854004,
      "learning_rate": 9.147974302859158e-06,
      "loss": 0.4953,
      "step": 346
    },
    {
      "epoch": 0.21362473306527635,
      "grad_norm": 2.3874406814575195,
      "learning_rate": 9.142397299601216e-06,
      "loss": 0.4644,
      "step": 347
    },
    {
      "epoch": 0.21424036630177573,
      "grad_norm": 2.1734139919281006,
      "learning_rate": 9.136803815089936e-06,
      "loss": 0.4563,
      "step": 348
    },
    {
      "epoch": 0.21485599953827508,
      "grad_norm": 2.2753751277923584,
      "learning_rate": 9.131193871579975e-06,
      "loss": 0.4988,
      "step": 349
    },
    {
      "epoch": 0.21547163277477444,
      "grad_norm": 2.179553508758545,
      "learning_rate": 9.125567491391476e-06,
      "loss": 0.5437,
      "step": 350
    },
    {
      "epoch": 0.2160872660112738,
      "grad_norm": 1.9942052364349365,
      "learning_rate": 9.119924696909979e-06,
      "loss": 0.4784,
      "step": 351
    },
    {
      "epoch": 0.21670289924777314,
      "grad_norm": 2.176542043685913,
      "learning_rate": 9.114265510586329e-06,
      "loss": 0.491,
      "step": 352
    },
    {
      "epoch": 0.2173185324842725,
      "grad_norm": 2.19527268409729,
      "learning_rate": 9.108589954936592e-06,
      "loss": 0.5015,
      "step": 353
    },
    {
      "epoch": 0.21793416572077184,
      "grad_norm": 2.237499475479126,
      "learning_rate": 9.102898052541959e-06,
      "loss": 0.5393,
      "step": 354
    },
    {
      "epoch": 0.2185497989572712,
      "grad_norm": 2.2937676906585693,
      "learning_rate": 9.09718982604866e-06,
      "loss": 0.5236,
      "step": 355
    },
    {
      "epoch": 0.21916543219377055,
      "grad_norm": 2.004369020462036,
      "learning_rate": 9.091465298167876e-06,
      "loss": 0.4828,
      "step": 356
    },
    {
      "epoch": 0.21978106543026993,
      "grad_norm": 2.072390556335449,
      "learning_rate": 9.085724491675642e-06,
      "loss": 0.5532,
      "step": 357
    },
    {
      "epoch": 0.22039669866676928,
      "grad_norm": 2.0293619632720947,
      "learning_rate": 9.079967429412766e-06,
      "loss": 0.4947,
      "step": 358
    },
    {
      "epoch": 0.22101233190326863,
      "grad_norm": 2.168522357940674,
      "learning_rate": 9.074194134284726e-06,
      "loss": 0.5111,
      "step": 359
    },
    {
      "epoch": 0.22162796513976799,
      "grad_norm": 1.933297872543335,
      "learning_rate": 9.068404629261587e-06,
      "loss": 0.466,
      "step": 360
    },
    {
      "epoch": 0.22224359837626734,
      "grad_norm": 1.9404878616333008,
      "learning_rate": 9.062598937377911e-06,
      "loss": 0.4857,
      "step": 361
    },
    {
      "epoch": 0.2228592316127667,
      "grad_norm": 2.2384283542633057,
      "learning_rate": 9.05677708173266e-06,
      "loss": 0.5159,
      "step": 362
    },
    {
      "epoch": 0.22347486484926604,
      "grad_norm": 2.2054219245910645,
      "learning_rate": 9.050939085489104e-06,
      "loss": 0.5122,
      "step": 363
    },
    {
      "epoch": 0.2240904980857654,
      "grad_norm": 2.036163330078125,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.4962,
      "step": 364
    },
    {
      "epoch": 0.22470613132226477,
      "grad_norm": 2.03003191947937,
      "learning_rate": 9.039214764181175e-06,
      "loss": 0.4877,
      "step": 365
    },
    {
      "epoch": 0.22532176455876413,
      "grad_norm": 2.1945431232452393,
      "learning_rate": 9.033328485764068e-06,
| "loss": 0.4999, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.22593739779526348, |
| "grad_norm": 2.0336525440216064, |
| "learning_rate": 9.027426160043005e-06, |
| "loss": 0.4955, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.22655303103176283, |
| "grad_norm": 2.09950590133667, |
| "learning_rate": 9.021507810501422e-06, |
| "loss": 0.5286, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.22716866426826218, |
| "grad_norm": 2.2560412883758545, |
| "learning_rate": 9.01557346068651e-06, |
| "loss": 0.5208, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.22778429750476153, |
| "grad_norm": 2.1105082035064697, |
| "learning_rate": 9.00962313420912e-06, |
| "loss": 0.5187, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.2283999307412609, |
| "grad_norm": 2.1909470558166504, |
| "learning_rate": 9.003656854743667e-06, |
| "loss": 0.5056, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.22901556397776024, |
| "grad_norm": 2.144836187362671, |
| "learning_rate": 8.997674646028044e-06, |
| "loss": 0.5014, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.2296311972142596, |
| "grad_norm": 2.117741823196411, |
| "learning_rate": 8.991676531863507e-06, |
| "loss": 0.489, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.23024683045075897, |
| "grad_norm": 2.0648508071899414, |
| "learning_rate": 8.985662536114614e-06, |
| "loss": 0.5018, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.23086246368725832, |
| "grad_norm": 2.022077798843384, |
| "learning_rate": 8.979632682709093e-06, |
| "loss": 0.5197, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.23147809692375768, |
| "grad_norm": 2.2208433151245117, |
| "learning_rate": 8.973586995637778e-06, |
| "loss": 0.5082, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.23209373016025703, |
| "grad_norm": 2.2770118713378906, |
| "learning_rate": 8.967525498954488e-06, |
| "loss": 0.5106, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.23270936339675638, |
| "grad_norm": 2.2035584449768066, |
| "learning_rate": 8.961448216775955e-06, |
| "loss": 0.5177, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.23332499663325573, |
| "grad_norm": 2.2252767086029053, |
| "learning_rate": 8.955355173281709e-06, |
| "loss": 0.4947, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.23394062986975508, |
| "grad_norm": 2.2952847480773926, |
| "learning_rate": 8.949246392713986e-06, |
| "loss": 0.4586, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.23455626310625444, |
| "grad_norm": 2.498413324356079, |
| "learning_rate": 8.943121899377649e-06, |
| "loss": 0.5331, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.23517189634275382, |
| "grad_norm": 2.176753520965576, |
| "learning_rate": 8.936981717640061e-06, |
| "loss": 0.4954, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.23578752957925317, |
| "grad_norm": 2.187340259552002, |
| "learning_rate": 8.930825871931012e-06, |
| "loss": 0.52, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.23640316281575252, |
| "grad_norm": 2.0732364654541016, |
| "learning_rate": 8.924654386742613e-06, |
| "loss": 0.5076, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.23701879605225187, |
| "grad_norm": 2.4784514904022217, |
| "learning_rate": 8.9184672866292e-06, |
| "loss": 0.4996, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.23763442928875123, |
| "grad_norm": 2.0487194061279297, |
| "learning_rate": 8.912264596207233e-06, |
| "loss": 0.4942, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.23825006252525058, |
| "grad_norm": 2.120363473892212, |
| "learning_rate": 8.906046340155203e-06, |
| "loss": 0.5164, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.23886569576174993, |
| "grad_norm": 2.0596227645874023, |
| "learning_rate": 8.899812543213532e-06, |
| "loss": 0.5168, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.23948132899824928, |
| "grad_norm": 2.095977306365967, |
| "learning_rate": 8.89356323018447e-06, |
| "loss": 0.4889, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.24009696223474866, |
| "grad_norm": 2.0541493892669678, |
| "learning_rate": 8.88729842593201e-06, |
| "loss": 0.5029, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.24071259547124801, |
| "grad_norm": 1.9384243488311768, |
| "learning_rate": 8.881018155381766e-06, |
| "loss": 0.5098, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.24132822870774737, |
| "grad_norm": 2.151761054992676, |
| "learning_rate": 8.874722443520898e-06, |
| "loss": 0.4938, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.24194386194424672, |
| "grad_norm": 1.8888019323349, |
| "learning_rate": 8.868411315398e-06, |
| "loss": 0.4587, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.24255949518074607, |
| "grad_norm": 1.9799542427062988, |
| "learning_rate": 8.862084796122998e-06, |
| "loss": 0.4672, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.24317512841724542, |
| "grad_norm": 2.1893832683563232, |
| "learning_rate": 8.85574291086706e-06, |
| "loss": 0.5069, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.24379076165374478, |
| "grad_norm": 2.151860237121582, |
| "learning_rate": 8.849385684862483e-06, |
| "loss": 0.4905, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.24440639489024413, |
| "grad_norm": 1.8696808815002441, |
| "learning_rate": 8.84301314340261e-06, |
| "loss": 0.4912, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.24502202812674348, |
| "grad_norm": 2.1755058765411377, |
| "learning_rate": 8.836625311841711e-06, |
| "loss": 0.5006, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.24563766136324286, |
| "grad_norm": 2.073040008544922, |
| "learning_rate": 8.83022221559489e-06, |
| "loss": 0.4933, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.2462532945997422, |
| "grad_norm": 2.1384034156799316, |
| "learning_rate": 8.823803880137993e-06, |
| "loss": 0.4981, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2462532945997422, |
| "eval_loss": 0.49532872438430786, |
| "eval_runtime": 119.469, |
| "eval_samples_per_second": 35.164, |
| "eval_steps_per_second": 4.403, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.24686892783624156, |
| "grad_norm": 2.1016058921813965, |
| "learning_rate": 8.817370331007488e-06, |
| "loss": 0.4962, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.24748456107274092, |
| "grad_norm": 2.3278987407684326, |
| "learning_rate": 8.810921593800377e-06, |
| "loss": 0.5203, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.24810019430924027, |
| "grad_norm": 1.9771771430969238, |
| "learning_rate": 8.804457694174093e-06, |
| "loss": 0.4822, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.24871582754573962, |
| "grad_norm": 2.107381582260132, |
| "learning_rate": 8.797978657846391e-06, |
| "loss": 0.4842, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.24933146078223897, |
| "grad_norm": 2.2924184799194336, |
| "learning_rate": 8.791484510595254e-06, |
| "loss": 0.506, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.24994709401873832, |
| "grad_norm": 2.1443545818328857, |
| "learning_rate": 8.784975278258783e-06, |
| "loss": 0.4708, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.2505627272552377, |
| "grad_norm": 2.0638580322265625, |
| "learning_rate": 8.7784509867351e-06, |
| "loss": 0.4986, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.25117836049173703, |
| "grad_norm": 2.3423922061920166, |
| "learning_rate": 8.77191166198224e-06, |
| "loss": 0.4741, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.2517939937282364, |
| "grad_norm": 2.2074921131134033, |
| "learning_rate": 8.765357330018056e-06, |
| "loss": 0.5047, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.25240962696473573, |
| "grad_norm": 1.9105783700942993, |
| "learning_rate": 8.758788016920102e-06, |
| "loss": 0.4464, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2530252602012351, |
| "grad_norm": 2.2252490520477295, |
| "learning_rate": 8.752203748825542e-06, |
| "loss": 0.4925, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.2536408934377345, |
| "grad_norm": 2.2548561096191406, |
| "learning_rate": 8.745604551931042e-06, |
| "loss": 0.5135, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.2542565266742338, |
| "grad_norm": 1.8392614126205444, |
| "learning_rate": 8.73899045249266e-06, |
| "loss": 0.5312, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.2548721599107332, |
| "grad_norm": 2.1313741207122803, |
| "learning_rate": 8.732361476825752e-06, |
| "loss": 0.5049, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.2554877931472325, |
| "grad_norm": 2.245140790939331, |
| "learning_rate": 8.725717651304856e-06, |
| "loss": 0.5204, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.2561034263837319, |
| "grad_norm": 2.030898332595825, |
| "learning_rate": 8.719059002363598e-06, |
| "loss": 0.4646, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.2567190596202312, |
| "grad_norm": 1.904626727104187, |
| "learning_rate": 8.71238555649458e-06, |
| "loss": 0.4824, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.2573346928567306, |
| "grad_norm": 2.06482195854187, |
| "learning_rate": 8.705697340249275e-06, |
| "loss": 0.4946, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.25795032609322993, |
| "grad_norm": 1.913489818572998, |
| "learning_rate": 8.698994380237921e-06, |
| "loss": 0.46, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.2585659593297293, |
| "grad_norm": 2.09264874458313, |
| "learning_rate": 8.692276703129421e-06, |
| "loss": 0.4871, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2591815925662287, |
| "grad_norm": 2.0330116748809814, |
| "learning_rate": 8.685544335651226e-06, |
| "loss": 0.4607, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.259797225802728, |
| "grad_norm": 2.0528652667999268, |
| "learning_rate": 8.678797304589245e-06, |
| "loss": 0.457, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.2604128590392274, |
| "grad_norm": 2.1522958278656006, |
| "learning_rate": 8.672035636787721e-06, |
| "loss": 0.4952, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.2610284922757267, |
| "grad_norm": 1.9825266599655151, |
| "learning_rate": 8.665259359149132e-06, |
| "loss": 0.483, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.2616441255122261, |
| "grad_norm": 2.00447678565979, |
| "learning_rate": 8.658468498634089e-06, |
| "loss": 0.4992, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.2622597587487254, |
| "grad_norm": 2.057898759841919, |
| "learning_rate": 8.651663082261217e-06, |
| "loss": 0.477, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.2628753919852248, |
| "grad_norm": 2.0360608100891113, |
| "learning_rate": 8.644843137107058e-06, |
| "loss": 0.4907, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.2634910252217242, |
| "grad_norm": 2.054442882537842, |
| "learning_rate": 8.638008690305961e-06, |
| "loss": 0.4841, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.2641066584582235, |
| "grad_norm": 2.14125919342041, |
| "learning_rate": 8.631159769049965e-06, |
| "loss": 0.4952, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.2647222916947229, |
| "grad_norm": 1.8915045261383057, |
| "learning_rate": 8.62429640058871e-06, |
| "loss": 0.4932, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.2653379249312222, |
| "grad_norm": 2.1041178703308105, |
| "learning_rate": 8.617418612229303e-06, |
| "loss": 0.5157, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.2659535581677216, |
| "grad_norm": 2.169093132019043, |
| "learning_rate": 8.610526431336235e-06, |
| "loss": 0.4761, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.2665691914042209, |
| "grad_norm": 2.22904896736145, |
| "learning_rate": 8.603619885331251e-06, |
| "loss": 0.4891, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.2671848246407203, |
| "grad_norm": 2.2368030548095703, |
| "learning_rate": 8.596699001693257e-06, |
| "loss": 0.4992, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.2678004578772196, |
| "grad_norm": 2.204688310623169, |
| "learning_rate": 8.589763807958198e-06, |
| "loss": 0.5025, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.268416091113719, |
| "grad_norm": 2.0749876499176025, |
| "learning_rate": 8.582814331718961e-06, |
| "loss": 0.4672, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.2690317243502184, |
| "grad_norm": 2.1938445568084717, |
| "learning_rate": 8.575850600625252e-06, |
| "loss": 0.4928, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.2696473575867177, |
| "grad_norm": 2.2102932929992676, |
| "learning_rate": 8.568872642383497e-06, |
| "loss": 0.5046, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.2702629908232171, |
| "grad_norm": 2.0184459686279297, |
| "learning_rate": 8.561880484756726e-06, |
| "loss": 0.488, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.2708786240597164, |
| "grad_norm": 2.0338387489318848, |
| "learning_rate": 8.554874155564459e-06, |
| "loss": 0.5106, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.2714942572962158, |
| "grad_norm": 1.887638807296753, |
| "learning_rate": 8.547853682682605e-06, |
| "loss": 0.4739, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.2721098905327151, |
| "grad_norm": 1.763267993927002, |
| "learning_rate": 8.540819094043349e-06, |
| "loss": 0.4643, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.2727255237692145, |
| "grad_norm": 1.9619516134262085, |
| "learning_rate": 8.53377041763503e-06, |
| "loss": 0.4846, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.2733411570057138, |
| "grad_norm": 2.0067808628082275, |
| "learning_rate": 8.526707681502045e-06, |
| "loss": 0.4675, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.2739567902422132, |
| "grad_norm": 2.1719822883605957, |
| "learning_rate": 8.519630913744726e-06, |
| "loss": 0.5093, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.2745724234787126, |
| "grad_norm": 2.05495285987854, |
| "learning_rate": 8.512540142519232e-06, |
| "loss": 0.462, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.2751880567152119, |
| "grad_norm": 2.103501081466675, |
| "learning_rate": 8.50543539603744e-06, |
| "loss": 0.4913, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.2758036899517113, |
| "grad_norm": 2.015477418899536, |
| "learning_rate": 8.498316702566828e-06, |
| "loss": 0.5072, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.2764193231882106, |
| "grad_norm": 2.2199337482452393, |
| "learning_rate": 8.491184090430365e-06, |
| "loss": 0.4917, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.27703495642471, |
| "grad_norm": 2.044389009475708, |
| "learning_rate": 8.484037588006398e-06, |
| "loss": 0.4719, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2776505896612093, |
| "grad_norm": 2.093029260635376, |
| "learning_rate": 8.476877223728539e-06, |
| "loss": 0.4918, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.2782662228977087, |
| "grad_norm": 2.2878201007843018, |
| "learning_rate": 8.469703026085551e-06, |
| "loss": 0.481, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.2788818561342081, |
| "grad_norm": 2.4797415733337402, |
| "learning_rate": 8.462515023621237e-06, |
| "loss": 0.4938, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.2794974893707074, |
| "grad_norm": 2.1992409229278564, |
| "learning_rate": 8.455313244934324e-06, |
| "loss": 0.5019, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.2801131226072068, |
| "grad_norm": 2.093852996826172, |
| "learning_rate": 8.44809771867835e-06, |
| "loss": 0.4769, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.2807287558437061, |
| "grad_norm": 1.909757375717163, |
| "learning_rate": 8.44086847356155e-06, |
| "loss": 0.4817, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.2813443890802055, |
| "grad_norm": 2.1621382236480713, |
| "learning_rate": 8.433625538346742e-06, |
| "loss": 0.5072, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.2819600223167048, |
| "grad_norm": 2.0320560932159424, |
| "learning_rate": 8.426368941851212e-06, |
| "loss": 0.4586, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.2825756555532042, |
| "grad_norm": 2.267420768737793, |
| "learning_rate": 8.4190987129466e-06, |
| "loss": 0.4661, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.2831912887897035, |
| "grad_norm": 2.3038079738616943, |
| "learning_rate": 8.41181488055879e-06, |
| "loss": 0.5029, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.2838069220262029, |
| "grad_norm": 2.2598018646240234, |
| "learning_rate": 8.404517473667779e-06, |
| "loss": 0.4913, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.28442255526270227, |
| "grad_norm": 2.0566563606262207, |
| "learning_rate": 8.397206521307584e-06, |
| "loss": 0.4903, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.2850381884992016, |
| "grad_norm": 2.1668128967285156, |
| "learning_rate": 8.389882052566106e-06, |
| "loss": 0.5088, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.285653821735701, |
| "grad_norm": 2.189140796661377, |
| "learning_rate": 8.382544096585028e-06, |
| "loss": 0.4873, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.2862694549722003, |
| "grad_norm": 1.9882960319519043, |
| "learning_rate": 8.375192682559692e-06, |
| "loss": 0.4853, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.2868850882086997, |
| "grad_norm": 1.9169676303863525, |
| "learning_rate": 8.36782783973899e-06, |
| "loss": 0.4884, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.287500721445199, |
| "grad_norm": 1.9822300672531128, |
| "learning_rate": 8.360449597425236e-06, |
| "loss": 0.4691, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.2881163546816984, |
| "grad_norm": 2.0725347995758057, |
| "learning_rate": 8.353057984974062e-06, |
| "loss": 0.4669, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.2887319879181977, |
| "grad_norm": 2.0785980224609375, |
| "learning_rate": 8.345653031794292e-06, |
| "loss": 0.4813, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.2893476211546971, |
| "grad_norm": 2.356861114501953, |
| "learning_rate": 8.338234767347829e-06, |
| "loss": 0.5462, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.28996325439119647, |
| "grad_norm": 2.0012290477752686, |
| "learning_rate": 8.33080322114954e-06, |
| "loss": 0.4935, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.2905788876276958, |
| "grad_norm": 2.2095272541046143, |
| "learning_rate": 8.32335842276713e-06, |
| "loss": 0.4841, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.29119452086419517, |
| "grad_norm": 2.300325632095337, |
| "learning_rate": 8.315900401821034e-06, |
| "loss": 0.508, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.2918101541006945, |
| "grad_norm": 1.9145146608352661, |
| "learning_rate": 8.308429187984298e-06, |
| "loss": 0.4741, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.2924257873371939, |
| "grad_norm": 2.039344549179077, |
| "learning_rate": 8.300944810982452e-06, |
| "loss": 0.5042, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.2930414205736932, |
| "grad_norm": 2.0209226608276367, |
| "learning_rate": 8.293447300593402e-06, |
| "loss": 0.5022, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.2936570538101926, |
| "grad_norm": 2.110853910446167, |
| "learning_rate": 8.28593668664731e-06, |
| "loss": 0.5069, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.29427268704669196, |
| "grad_norm": 2.0190882682800293, |
| "learning_rate": 8.278412999026462e-06, |
| "loss": 0.4713, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.2948883202831913, |
| "grad_norm": 1.9714604616165161, |
| "learning_rate": 8.270876267665173e-06, |
| "loss": 0.4787, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.29550395351969067, |
| "grad_norm": 2.083486795425415, |
| "learning_rate": 8.263326522549647e-06, |
| "loss": 0.5078, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.29611958675619, |
| "grad_norm": 2.099017381668091, |
| "learning_rate": 8.255763793717868e-06, |
| "loss": 0.5128, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.29673521999268937, |
| "grad_norm": 1.9817054271697998, |
| "learning_rate": 8.248188111259479e-06, |
| "loss": 0.5162, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.2973508532291887, |
| "grad_norm": 1.9473239183425903, |
| "learning_rate": 8.240599505315656e-06, |
| "loss": 0.4715, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.2979664864656881, |
| "grad_norm": 1.8737359046936035, |
| "learning_rate": 8.232998006078998e-06, |
| "loss": 0.4758, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.2985821197021874, |
| "grad_norm": 2.0523834228515625, |
| "learning_rate": 8.225383643793405e-06, |
| "loss": 0.4855, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.2991977529386868, |
| "grad_norm": 2.096587657928467, |
| "learning_rate": 8.217756448753948e-06, |
| "loss": 0.4902, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.29981338617518616, |
| "grad_norm": 1.9058270454406738, |
| "learning_rate": 8.210116451306762e-06, |
| "loss": 0.4588, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.3004290194116855, |
| "grad_norm": 1.8534164428710938, |
| "learning_rate": 8.20246368184891e-06, |
| "loss": 0.463, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.30104465264818486, |
| "grad_norm": 2.1333110332489014, |
| "learning_rate": 8.19479817082828e-06, |
| "loss": 0.4698, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.3016602858846842, |
| "grad_norm": 1.8725422620773315, |
| "learning_rate": 8.18711994874345e-06, |
| "loss": 0.4917, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.30227591912118357, |
| "grad_norm": 1.8624604940414429, |
| "learning_rate": 8.17942904614357e-06, |
| "loss": 0.4325, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.3028915523576829, |
| "grad_norm": 1.9307676553726196, |
| "learning_rate": 8.171725493628244e-06, |
| "loss": 0.5097, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.30350718559418227, |
| "grad_norm": 1.9485713243484497, |
| "learning_rate": 8.164009321847405e-06, |
| "loss": 0.4537, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.3041228188306816, |
| "grad_norm": 1.9698349237442017, |
| "learning_rate": 8.156280561501196e-06, |
| "loss": 0.4916, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.304738452067181, |
| "grad_norm": 1.8933942317962646, |
| "learning_rate": 8.148539243339842e-06, |
| "loss": 0.4615, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.30535408530368036, |
| "grad_norm": 1.9967283010482788, |
| "learning_rate": 8.140785398163535e-06, |
| "loss": 0.5266, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.3059697185401797, |
| "grad_norm": 1.8503018617630005, |
| "learning_rate": 8.133019056822303e-06, |
| "loss": 0.4874, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.30658535177667906, |
| "grad_norm": 2.033705949783325, |
| "learning_rate": 8.1252402502159e-06, |
| "loss": 0.4927, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.3072009850131784, |
| "grad_norm": 1.9999321699142456, |
| "learning_rate": 8.117449009293668e-06, |
| "loss": 0.4997, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.30781661824967776, |
| "grad_norm": 2.0142297744750977, |
| "learning_rate": 8.109645365054426e-06, |
| "loss": 0.4879, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.30781661824967776, |
| "eval_loss": 0.4839506447315216, |
| "eval_runtime": 119.0711, |
| "eval_samples_per_second": 35.281, |
| "eval_steps_per_second": 4.418, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3084322514861771, |
| "grad_norm": 2.0126116275787354, |
| "learning_rate": 8.101829348546336e-06, |
| "loss": 0.4833, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.30904788472267647, |
| "grad_norm": 1.8968122005462646, |
| "learning_rate": 8.094000990866795e-06, |
| "loss": 0.497, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.3096635179591758, |
| "grad_norm": 2.234069585800171, |
| "learning_rate": 8.086160323162288e-06, |
| "loss": 0.5034, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.3102791511956752, |
| "grad_norm": 2.1748671531677246, |
| "learning_rate": 8.078307376628292e-06, |
| "loss": 0.5222, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.31089478443217455, |
| "grad_norm": 1.9957149028778076, |
| "learning_rate": 8.070442182509127e-06, |
| "loss": 0.4856, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.3115104176686739, |
| "grad_norm": 2.1460072994232178, |
| "learning_rate": 8.062564772097844e-06, |
| "loss": 0.4864, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.31212605090517326, |
| "grad_norm": 1.902100920677185, |
| "learning_rate": 8.054675176736104e-06, |
| "loss": 0.4761, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.3127416841416726, |
| "grad_norm": 2.132812023162842, |
| "learning_rate": 8.046773427814043e-06, |
| "loss": 0.4882, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.31335731737817196, |
| "grad_norm": 1.9943275451660156, |
| "learning_rate": 8.038859556770152e-06, |
| "loss": 0.4956, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.3139729506146713, |
| "grad_norm": 2.189013957977295, |
| "learning_rate": 8.030933595091152e-06, |
| "loss": 0.5044, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.31458858385117067, |
| "grad_norm": 2.0293703079223633, |
| "learning_rate": 8.022995574311876e-06, |
| "loss": 0.4922, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.31520421708767005, |
| "grad_norm": 1.961392879486084, |
| "learning_rate": 8.015045526015124e-06, |
| "loss": 0.4703, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.31581985032416937, |
| "grad_norm": 1.8382376432418823, |
| "learning_rate": 8.00708348183156e-06, |
| "loss": 0.4599, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.31643548356066875, |
| "grad_norm": 2.000192165374756, |
| "learning_rate": 7.99910947343957e-06, |
| "loss": 0.4659, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.3170511167971681, |
| "grad_norm": 2.1506423950195312, |
| "learning_rate": 7.991123532565142e-06, |
| "loss": 0.4923, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.31766675003366746, |
| "grad_norm": 2.114412307739258, |
| "learning_rate": 7.983125690981743e-06, |
| "loss": 0.4632, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.3182823832701668, |
| "grad_norm": 2.0266101360321045, |
| "learning_rate": 7.975115980510187e-06, |
| "loss": 0.4797, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.31889801650666616, |
| "grad_norm": 2.1428349018096924, |
| "learning_rate": 7.967094433018508e-06, |
| "loss": 0.5035, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.3195136497431655, |
| "grad_norm": 2.085111141204834, |
| "learning_rate": 7.95906108042184e-06, |
| "loss": 0.5129, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.32012928297966486, |
| "grad_norm": 2.149914026260376, |
| "learning_rate": 7.951015954682281e-06, |
| "loss": 0.4803, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.32074491621616424, |
| "grad_norm": 1.8125468492507935, |
| "learning_rate": 7.942959087808776e-06, |
| "loss": 0.4819, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.32136054945266357, |
| "grad_norm": 2.132568597793579, |
| "learning_rate": 7.934890511856982e-06, |
| "loss": 0.5151, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.32197618268916295, |
| "grad_norm": 1.9607751369476318, |
| "learning_rate": 7.926810258929138e-06, |
| "loss": 0.453, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.3225918159256623, |
| "grad_norm": 1.891742467880249, |
| "learning_rate": 7.918718361173951e-06, |
| "loss": 0.4499, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.32320744916216165, |
| "grad_norm": 2.0719101428985596, |
| "learning_rate": 7.910614850786448e-06, |
| "loss": 0.4726, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.323823082398661, |
| "grad_norm": 1.9925060272216797, |
| "learning_rate": 7.902499760007867e-06, |
| "loss": 0.4841, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.32443871563516036, |
| "grad_norm": 1.9080984592437744, |
| "learning_rate": 7.89437312112552e-06, |
| "loss": 0.4618, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.3250543488716597, |
| "grad_norm": 2.142263412475586, |
| "learning_rate": 7.886234966472664e-06, |
| "loss": 0.5141, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.32566998210815906, |
| "grad_norm": 2.0402634143829346, |
| "learning_rate": 7.87808532842837e-06, |
| "loss": 0.4661, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.32628561534465844, |
| "grad_norm": 1.8866894245147705, |
| "learning_rate": 7.8699242394174e-06, |
| "loss": 0.471, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.32690124858115777, |
| "grad_norm": 2.106036901473999, |
| "learning_rate": 7.86175173191008e-06, |
| "loss": 0.4684, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.32751688181765715, |
| "grad_norm": 1.9650591611862183, |
| "learning_rate": 7.85356783842216e-06, |
| "loss": 0.4984, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.32813251505415647, |
| "grad_norm": 1.8672831058502197, |
| "learning_rate": 7.845372591514694e-06, |
| "loss": 0.4811, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.32874814829065585, |
| "grad_norm": 1.993276834487915, |
| "learning_rate": 7.83716602379391e-06, |
| "loss": 0.4646, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.3293637815271552, |
| "grad_norm": 2.1027157306671143, |
| "learning_rate": 7.828948167911073e-06, |
| "loss": 0.5203, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.32997941476365455, |
| "grad_norm": 2.2700319290161133, |
| "learning_rate": 7.820719056562363e-06, |
| "loss": 0.5072, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.33059504800015393, |
| "grad_norm": 1.9552809000015259, |
| "learning_rate": 7.812478722488741e-06, |
| "loss": 0.4929, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.33121068123665326, |
| "grad_norm": 2.141432523727417, |
| "learning_rate": 7.804227198475823e-06, |
| "loss": 0.493, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.33182631447315264, |
| "grad_norm": 1.8569056987762451, |
| "learning_rate": 7.795964517353734e-06, |
| "loss": 0.4676, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.33244194770965196, |
| "grad_norm": 1.9998515844345093, |
| "learning_rate": 7.787690711997008e-06, |
| "loss": 0.4817, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.33305758094615134, |
| "grad_norm": 2.048243522644043, |
| "learning_rate": 7.779405815324424e-06, |
| "loss": 0.4783, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.33367321418265067, |
| "grad_norm": 2.198432683944702, |
| "learning_rate": 7.771109860298895e-06, |
| "loss": 0.4748, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.33428884741915005, |
| "grad_norm": 2.0038063526153564, |
| "learning_rate": 7.762802879927333e-06, |
| "loss": 0.4365, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.3349044806556494, |
| "grad_norm": 1.9695496559143066, |
| "learning_rate": 7.754484907260513e-06, |
| "loss": 0.4768, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.33552011389214875, |
| "grad_norm": 2.031867504119873, |
| "learning_rate": 7.746155975392948e-06, |
| "loss": 0.4615, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.33613574712864813, |
| "grad_norm": 2.1375014781951904, |
| "learning_rate": 7.737816117462752e-06, |
| "loss": 0.4713, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.33675138036514746, |
| "grad_norm": 1.867982268333435, |
| "learning_rate": 7.72946536665151e-06, |
| "loss": 0.4907, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.33736701360164684, |
| "grad_norm": 1.8443107604980469, |
| "learning_rate": 7.721103756184147e-06, |
| "loss": 0.4717, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.33798264683814616, |
| "grad_norm": 2.0463571548461914, |
| "learning_rate": 7.712731319328798e-06, |
| "loss": 0.4587, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.33859828007464554, |
| "grad_norm": 2.1210920810699463, |
| "learning_rate": 7.704348089396667e-06, |
| "loss": 0.4736, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.33921391331114487, |
| "grad_norm": 2.04471492767334, |
| "learning_rate": 7.695954099741902e-06, |
| "loss": 0.5262, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.33982954654764425, |
| "grad_norm": 1.95708429813385, |
| "learning_rate": 7.687549383761463e-06, |
| "loss": 0.4963, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.34044517978414357, |
| "grad_norm": 2.132002115249634, |
| "learning_rate": 7.679133974894984e-06, |
| "loss": 0.5178, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.34106081302064295, |
| "grad_norm": 2.2385482788085938, |
| "learning_rate": 7.670707906624644e-06, |
| "loss": 0.4837, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.34167644625714233, |
| "grad_norm": 2.0102343559265137, |
| "learning_rate": 7.662271212475034e-06, |
| "loss": 0.4738, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.34229207949364165, |
| "grad_norm": 1.831471562385559, |
| "learning_rate": 7.653823926013016e-06, |
| "loss": 0.4881, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.34290771273014103, |
| "grad_norm": 1.9880772829055786, |
| "learning_rate": 7.645366080847599e-06, |
| "loss": 0.4866, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.34352334596664036, |
| "grad_norm": 2.1533780097961426, |
| "learning_rate": 7.636897710629804e-06, |
| "loss": 0.4939, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.34413897920313974, |
| "grad_norm": 1.9375075101852417, |
| "learning_rate": 7.628418849052523e-06, |
| "loss": 0.485, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.34475461243963906, |
| "grad_norm": 2.021385431289673, |
| "learning_rate": 7.619929529850397e-06, |
| "loss": 0.4933, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.34537024567613844, |
| "grad_norm": 1.887455940246582, |
| "learning_rate": 7.611429786799664e-06, |
| "loss": 0.5031, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.3459858789126378, |
| "grad_norm": 1.961814284324646, |
| "learning_rate": 7.602919653718044e-06, |
| "loss": 0.4686, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.34660151214913715, |
| "grad_norm": 2.0938830375671387, |
| "learning_rate": 7.5943991644645895e-06, |
| "loss": 0.4776, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.3472171453856365, |
| "grad_norm": 1.9017668962478638, |
| "learning_rate": 7.585868352939564e-06, |
| "loss": 0.446, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.34783277862213585, |
| "grad_norm": 2.077495813369751, |
| "learning_rate": 7.577327253084292e-06, |
| "loss": 0.4819, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.34844841185863523, |
| "grad_norm": 1.9357627630233765, |
| "learning_rate": 7.568775898881038e-06, |
| "loss": 0.4896, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.34906404509513456, |
| "grad_norm": 2.004091501235962, |
| "learning_rate": 7.560214324352858e-06, |
| "loss": 0.5051, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.34967967833163394, |
| "grad_norm": 2.0148847103118896, |
| "learning_rate": 7.551642563563481e-06, |
| "loss": 0.5176, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.35029531156813326, |
| "grad_norm": 2.167863130569458, |
| "learning_rate": 7.543060650617159e-06, |
| "loss": 0.4722, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.35091094480463264, |
| "grad_norm": 2.0419163703918457, |
| "learning_rate": 7.534468619658534e-06, |
| "loss": 0.486, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.351526578041132, |
| "grad_norm": 2.1185479164123535, |
| "learning_rate": 7.5258665048725065e-06, |
| "loss": 0.497, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.35214221127763135, |
| "grad_norm": 1.860520601272583, |
| "learning_rate": 7.517254340484097e-06, |
| "loss": 0.4484, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.3527578445141307, |
| "grad_norm": 2.0788426399230957, |
| "learning_rate": 7.50863216075831e-06, |
| "loss": 0.4737, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.35337347775063005, |
| "grad_norm": 2.152902603149414, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.4998, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.35398911098712943, |
| "grad_norm": 1.9528014659881592, |
| "learning_rate": 7.49135789255373e-06, |
| "loss": 0.4687, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.35460474422362875, |
| "grad_norm": 1.8286877870559692, |
| "learning_rate": 7.482705872803637e-06, |
| "loss": 0.4693, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.35522037746012813, |
| "grad_norm": 2.0849242210388184, |
| "learning_rate": 7.4740439751732994e-06, |
| "loss": 0.4599, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.35583601069662746, |
| "grad_norm": 1.9514186382293701, |
| "learning_rate": 7.465372234125592e-06, |
| "loss": 0.4909, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.35645164393312684, |
| "grad_norm": 1.7071397304534912, |
| "learning_rate": 7.456690684162557e-06, |
| "loss": 0.4237, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.3570672771696262, |
| "grad_norm": 1.916062831878662, |
| "learning_rate": 7.447999359825263e-06, |
| "loss": 0.4683, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.35768291040612554, |
| "grad_norm": 1.9211770296096802, |
| "learning_rate": 7.4392982956936644e-06, |
| "loss": 0.4664, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.3582985436426249, |
| "grad_norm": 1.7628663778305054, |
| "learning_rate": 7.43058752638647e-06, |
| "loss": 0.483, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.35891417687912425, |
| "grad_norm": 1.9189079999923706, |
| "learning_rate": 7.421867086561001e-06, |
| "loss": 0.4593, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.3595298101156236, |
| "grad_norm": 1.860856294631958, |
| "learning_rate": 7.413137010913055e-06, |
| "loss": 0.4742, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.36014544335212295, |
| "grad_norm": 1.8644829988479614, |
| "learning_rate": 7.4043973341767695e-06, |
| "loss": 0.4894, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.36076107658862233, |
| "grad_norm": 1.9820992946624756, |
| "learning_rate": 7.395648091124476e-06, |
| "loss": 0.4886, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.3613767098251217, |
| "grad_norm": 1.976830244064331, |
| "learning_rate": 7.386889316566571e-06, |
| "loss": 0.4399, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.36199234306162104, |
| "grad_norm": 1.9326996803283691, |
| "learning_rate": 7.378121045351378e-06, |
| "loss": 0.5073, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.3626079762981204, |
| "grad_norm": 1.817628026008606, |
| "learning_rate": 7.369343312364994e-06, |
| "loss": 0.4573, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.36322360953461974, |
| "grad_norm": 2.304370880126953, |
| "learning_rate": 7.360556152531171e-06, |
| "loss": 0.4976, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3638392427711191, |
| "grad_norm": 1.789089322090149, |
| "learning_rate": 7.351759600811163e-06, |
| "loss": 0.4474, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.36445487600761844, |
| "grad_norm": 1.9560165405273438, |
| "learning_rate": 7.342953692203594e-06, |
| "loss": 0.5014, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.3650705092441178, |
| "grad_norm": 1.8216361999511719, |
| "learning_rate": 7.33413846174431e-06, |
| "loss": 0.4896, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.36568614248061715, |
| "grad_norm": 2.0918242931365967, |
| "learning_rate": 7.3253139445062535e-06, |
| "loss": 0.5184, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.36630177571711653, |
| "grad_norm": 2.250962018966675, |
| "learning_rate": 7.31648017559931e-06, |
| "loss": 0.4779, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.3669174089536159, |
| "grad_norm": 2.12443208694458, |
| "learning_rate": 7.307637190170176e-06, |
| "loss": 0.4761, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.36753304219011523, |
| "grad_norm": 2.055332660675049, |
| "learning_rate": 7.29878502340222e-06, |
| "loss": 0.4926, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.3681486754266146, |
| "grad_norm": 1.9562499523162842, |
| "learning_rate": 7.289923710515338e-06, |
| "loss": 0.4574, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.36876430866311394, |
| "grad_norm": 1.8528696298599243, |
| "learning_rate": 7.281053286765816e-06, |
| "loss": 0.4715, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.3693799418996133, |
| "grad_norm": 2.025754928588867, |
| "learning_rate": 7.272173787446188e-06, |
| "loss": 0.5057, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3693799418996133, |
| "eval_loss": 0.46705466508865356, |
| "eval_runtime": 119.3636, |
| "eval_samples_per_second": 35.195, |
| "eval_steps_per_second": 4.407, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.36999557513611264, |
| "grad_norm": 1.8789739608764648, |
| "learning_rate": 7.263285247885097e-06, |
| "loss": 0.4827, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.370611208372612, |
| "grad_norm": 1.8459640741348267, |
| "learning_rate": 7.254387703447154e-06, |
| "loss": 0.4929, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.37122684160911135, |
| "grad_norm": 1.799880027770996, |
| "learning_rate": 7.245481189532801e-06, |
| "loss": 0.4768, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.3718424748456107, |
| "grad_norm": 1.8112698793411255, |
| "learning_rate": 7.236565741578163e-06, |
| "loss": 0.4871, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.3724581080821101, |
| "grad_norm": 1.7186943292617798, |
| "learning_rate": 7.227641395054913e-06, |
| "loss": 0.4718, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.37307374131860943, |
| "grad_norm": 1.945148229598999, |
| "learning_rate": 7.218708185470122e-06, |
| "loss": 0.53, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.3736893745551088, |
| "grad_norm": 1.7540737390518188, |
| "learning_rate": 7.2097661483661355e-06, |
| "loss": 0.4647, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.37430500779160814, |
| "grad_norm": 1.795328974723816, |
| "learning_rate": 7.200815319320409e-06, |
| "loss": 0.4348, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.3749206410281075, |
| "grad_norm": 1.8775458335876465, |
| "learning_rate": 7.191855733945388e-06, |
| "loss": 0.4866, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.37553627426460684, |
| "grad_norm": 1.7888774871826172, |
| "learning_rate": 7.182887427888351e-06, |
| "loss": 0.4586, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.3761519075011062, |
| "grad_norm": 1.954525351524353, |
| "learning_rate": 7.173910436831274e-06, |
| "loss": 0.4297, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.37676754073760554, |
| "grad_norm": 1.9484002590179443, |
| "learning_rate": 7.164924796490689e-06, |
| "loss": 0.4793, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.3773831739741049, |
| "grad_norm": 1.8956586122512817, |
| "learning_rate": 7.155930542617543e-06, |
| "loss": 0.4552, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.3779988072106043, |
| "grad_norm": 1.8784023523330688, |
| "learning_rate": 7.146927710997047e-06, |
| "loss": 0.4494, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.37861444044710363, |
| "grad_norm": 2.010004997253418, |
| "learning_rate": 7.137916337448544e-06, |
| "loss": 0.4853, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.379230073683603, |
| "grad_norm": 1.7376317977905273, |
| "learning_rate": 7.128896457825364e-06, |
| "loss": 0.4687, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.37984570692010233, |
| "grad_norm": 1.83708918094635, |
| "learning_rate": 7.119868108014677e-06, |
| "loss": 0.4779, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.3804613401566017, |
| "grad_norm": 2.276491165161133, |
| "learning_rate": 7.110831323937356e-06, |
| "loss": 0.4661, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.38107697339310104, |
| "grad_norm": 2.030910015106201, |
| "learning_rate": 7.101786141547829e-06, |
| "loss": 0.4846, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.3816926066296004, |
| "grad_norm": 1.8752516508102417, |
| "learning_rate": 7.092732596833937e-06, |
| "loss": 0.4876, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.3823082398660998, |
| "grad_norm": 1.798912525177002, |
| "learning_rate": 7.083670725816795e-06, |
| "loss": 0.4161, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.3829238731025991, |
| "grad_norm": 1.8931162357330322, |
| "learning_rate": 7.074600564550643e-06, |
| "loss": 0.4608, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.3835395063390985, |
| "grad_norm": 2.037741184234619, |
| "learning_rate": 7.06552214912271e-06, |
| "loss": 0.4546, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.3841551395755978, |
| "grad_norm": 2.005077600479126, |
| "learning_rate": 7.056435515653059e-06, |
| "loss": 0.4921, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.3847707728120972, |
| "grad_norm": 1.9674670696258545, |
| "learning_rate": 7.047340700294454e-06, |
| "loss": 0.4609, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.38538640604859653, |
| "grad_norm": 1.9863380193710327, |
| "learning_rate": 7.03823773923221e-06, |
| "loss": 0.4902, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.3860020392850959, |
| "grad_norm": 2.1521503925323486, |
| "learning_rate": 7.029126668684055e-06, |
| "loss": 0.4818, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.38661767252159523, |
| "grad_norm": 2.2088866233825684, |
| "learning_rate": 7.020007524899976e-06, |
| "loss": 0.5094, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.3872333057580946, |
| "grad_norm": 1.8303382396697998, |
| "learning_rate": 7.010880344162087e-06, |
| "loss": 0.4662, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.387848938994594, |
| "grad_norm": 1.7346774339675903, |
| "learning_rate": 7.0017451627844765e-06, |
| "loss": 0.4348, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.3884645722310933, |
| "grad_norm": 2.2596514225006104, |
| "learning_rate": 6.992602017113058e-06, |
| "loss": 0.4936, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.3890802054675927, |
| "grad_norm": 2.0665206909179688, |
| "learning_rate": 6.983450943525445e-06, |
| "loss": 0.4967, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.389695838704092, |
| "grad_norm": 1.771286964416504, |
| "learning_rate": 6.974291978430783e-06, |
| "loss": 0.4906, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.3903114719405914, |
| "grad_norm": 1.8066425323486328, |
| "learning_rate": 6.965125158269619e-06, |
| "loss": 0.4635, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.39092710517709073, |
| "grad_norm": 1.8853797912597656, |
| "learning_rate": 6.955950519513754e-06, |
| "loss": 0.442, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.3915427384135901, |
| "grad_norm": 1.959373950958252, |
| "learning_rate": 6.946768098666097e-06, |
| "loss": 0.476, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.39215837165008943, |
| "grad_norm": 1.789989948272705, |
| "learning_rate": 6.9375779322605154e-06, |
| "loss": 0.4787, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.3927740048865888, |
| "grad_norm": 1.832987904548645, |
| "learning_rate": 6.9283800568616986e-06, |
| "loss": 0.4614, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.3933896381230882, |
| "grad_norm": 1.9762518405914307, |
| "learning_rate": 6.919174509065003e-06, |
| "loss": 0.4557, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.3940052713595875, |
| "grad_norm": 1.903957724571228, |
| "learning_rate": 6.909961325496312e-06, |
| "loss": 0.4806, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.3946209045960869, |
| "grad_norm": 2.0859627723693848, |
| "learning_rate": 6.900740542811896e-06, |
| "loss": 0.4778, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.3952365378325862, |
| "grad_norm": 2.363494873046875, |
| "learning_rate": 6.891512197698249e-06, |
| "loss": 0.4642, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.3958521710690856, |
| "grad_norm": 2.01015305519104, |
| "learning_rate": 6.88227632687196e-06, |
| "loss": 0.4997, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.3964678043055849, |
| "grad_norm": 1.8750239610671997, |
| "learning_rate": 6.873032967079562e-06, |
| "loss": 0.4757, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.3970834375420843, |
| "grad_norm": 1.866356372833252, |
| "learning_rate": 6.863782155097376e-06, |
| "loss": 0.4753, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.3976990707785837, |
| "grad_norm": 1.8274710178375244, |
| "learning_rate": 6.854523927731383e-06, |
| "loss": 0.474, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.398314704015083, |
| "grad_norm": 1.9891657829284668, |
| "learning_rate": 6.8452583218170575e-06, |
| "loss": 0.4676, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.3989303372515824, |
| "grad_norm": 1.8337827920913696, |
| "learning_rate": 6.835985374219241e-06, |
| "loss": 0.4589, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.3995459704880817, |
| "grad_norm": 1.8421549797058105, |
| "learning_rate": 6.8267051218319766e-06, |
| "loss": 0.4423, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.4001616037245811, |
| "grad_norm": 1.809378743171692, |
| "learning_rate": 6.817417601578375e-06, |
| "loss": 0.4619, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.4007772369610804, |
| "grad_norm": 1.912400722503662, |
| "learning_rate": 6.808122850410461e-06, |
| "loss": 0.4618, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.4013928701975798, |
| "grad_norm": 2.02554988861084, |
| "learning_rate": 6.798820905309036e-06, |
| "loss": 0.4733, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.4020085034340791, |
| "grad_norm": 1.967797040939331, |
| "learning_rate": 6.789511803283512e-06, |
| "loss": 0.4557, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.4026241366705785, |
| "grad_norm": 1.9477007389068604, |
| "learning_rate": 6.780195581371785e-06, |
| "loss": 0.5078, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.4032397699070779, |
| "grad_norm": 2.080044746398926, |
| "learning_rate": 6.7708722766400745e-06, |
| "loss": 0.4631, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.4038554031435772, |
| "grad_norm": 1.8657619953155518, |
| "learning_rate": 6.761541926182783e-06, |
| "loss": 0.4748, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.4044710363800766, |
| "grad_norm": 2.008187770843506, |
| "learning_rate": 6.752204567122343e-06, |
| "loss": 0.4681, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.4050866696165759, |
| "grad_norm": 1.8819204568862915, |
| "learning_rate": 6.7428602366090764e-06, |
| "loss": 0.47, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.4057023028530753, |
| "grad_norm": 2.0878162384033203, |
| "learning_rate": 6.733508971821037e-06, |
| "loss": 0.4637, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.4063179360895746, |
| "grad_norm": 2.172257900238037, |
| "learning_rate": 6.724150809963867e-06, |
| "loss": 0.4755, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.406933569326074, |
| "grad_norm": 1.7706599235534668, |
| "learning_rate": 6.714785788270658e-06, |
| "loss": 0.4294, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.4075492025625733, |
| "grad_norm": 1.698087215423584, |
| "learning_rate": 6.705413944001786e-06, |
| "loss": 0.405, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.4081648357990727, |
| "grad_norm": 2.301974296569824, |
| "learning_rate": 6.696035314444778e-06, |
| "loss": 0.4452, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.4087804690355721, |
| "grad_norm": 1.8717014789581299, |
| "learning_rate": 6.686649936914151e-06, |
| "loss": 0.44, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.4093961022720714, |
| "grad_norm": 1.8877750635147095, |
| "learning_rate": 6.677257848751276e-06, |
| "loss": 0.4753, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.4100117355085708, |
| "grad_norm": 1.8425832986831665, |
| "learning_rate": 6.667859087324221e-06, |
| "loss": 0.452, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.4106273687450701, |
| "grad_norm": 1.7998721599578857, |
| "learning_rate": 6.658453690027604e-06, |
| "loss": 0.4302, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.4112430019815695, |
| "grad_norm": 1.9948549270629883, |
| "learning_rate": 6.6490416942824466e-06, |
| "loss": 0.4542, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.4118586352180688, |
| "grad_norm": 1.861610770225525, |
| "learning_rate": 6.639623137536023e-06, |
| "loss": 0.4816, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.4124742684545682, |
| "grad_norm": 1.737973928451538, |
| "learning_rate": 6.63019805726171e-06, |
| "loss": 0.4309, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.4130899016910676, |
| "grad_norm": 1.927831768989563, |
| "learning_rate": 6.620766490958842e-06, |
| "loss": 0.4732, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.4137055349275669, |
| "grad_norm": 1.807926058769226, |
| "learning_rate": 6.611328476152557e-06, |
| "loss": 0.4507, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.4143211681640663, |
| "grad_norm": 1.7335807085037231, |
| "learning_rate": 6.601884050393649e-06, |
| "loss": 0.4366, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.4149368014005656, |
| "grad_norm": 1.676943302154541, |
| "learning_rate": 6.592433251258423e-06, |
| "loss": 0.4158, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.415552434637065, |
| "grad_norm": 2.2752737998962402, |
| "learning_rate": 6.582976116348538e-06, |
| "loss": 0.4686, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.4161680678735643, |
| "grad_norm": 2.1173386573791504, |
| "learning_rate": 6.57351268329086e-06, |
| "loss": 0.4662, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.4167837011100637, |
| "grad_norm": 1.8290038108825684, |
| "learning_rate": 6.5640429897373195e-06, |
| "loss": 0.4415, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.417399334346563, |
| "grad_norm": 1.850178599357605, |
| "learning_rate": 6.554567073364747e-06, |
| "loss": 0.4806, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.4180149675830624, |
| "grad_norm": 1.913724422454834, |
| "learning_rate": 6.545084971874738e-06, |
| "loss": 0.4714, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.41863060081956177, |
| "grad_norm": 2.002952814102173, |
| "learning_rate": 6.535596722993494e-06, |
| "loss": 0.4673, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.4192462340560611, |
| "grad_norm": 1.8530998229980469, |
| "learning_rate": 6.526102364471675e-06, |
| "loss": 0.4492, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.4198618672925605, |
| "grad_norm": 1.7990001440048218, |
| "learning_rate": 6.51660193408425e-06, |
| "loss": 0.46, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.4204775005290598, |
| "grad_norm": 1.7169803380966187, |
| "learning_rate": 6.507095469630347e-06, |
| "loss": 0.4629, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.4210931337655592, |
| "grad_norm": 2.5550501346588135, |
| "learning_rate": 6.497583008933097e-06, |
| "loss": 0.4674, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.4217087670020585, |
| "grad_norm": 1.9652258157730103, |
| "learning_rate": 6.4880645898394935e-06, |
| "loss": 0.4587, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.4223244002385579, |
| "grad_norm": 2.055323362350464, |
| "learning_rate": 6.4785402502202345e-06, |
| "loss": 0.472, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.4229400334750572, |
| "grad_norm": 1.860081434249878, |
| "learning_rate": 6.469010027969573e-06, |
| "loss": 0.4676, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.4235556667115566, |
| "grad_norm": 1.8490623235702515, |
| "learning_rate": 6.459473961005168e-06, |
| "loss": 0.4637, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.42417129994805597, |
| "grad_norm": 1.8875542879104614, |
| "learning_rate": 6.449932087267932e-06, |
| "loss": 0.5051, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.4247869331845553, |
| "grad_norm": 1.7553805112838745, |
| "learning_rate": 6.440384444721881e-06, |
| "loss": 0.4544, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.4254025664210547, |
| "grad_norm": 1.723633050918579, |
| "learning_rate": 6.4308310713539845e-06, |
| "loss": 0.469, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.426018199657554, |
| "grad_norm": 1.8384066820144653, |
| "learning_rate": 6.4212720051740126e-06, |
| "loss": 0.4348, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.4266338328940534, |
| "grad_norm": 1.7237180471420288, |
| "learning_rate": 6.411707284214384e-06, |
| "loss": 0.4188, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.4272494661305527, |
| "grad_norm": 1.7882165908813477, |
| "learning_rate": 6.402136946530014e-06, |
| "loss": 0.4481, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.4278650993670521, |
| "grad_norm": 1.8759676218032837, |
| "learning_rate": 6.3925610301981726e-06, |
| "loss": 0.4826, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.42848073260355146, |
| "grad_norm": 1.849109411239624, |
| "learning_rate": 6.382979573318317e-06, |
| "loss": 0.4745, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.4290963658400508, |
| "grad_norm": 1.6711199283599854, |
| "learning_rate": 6.373392614011952e-06, |
| "loss": 0.4435, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.42971199907655017, |
| "grad_norm": 2.019181251525879, |
| "learning_rate": 6.3638001904224755e-06, |
| "loss": 0.4867, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.4303276323130495, |
| "grad_norm": 1.7364513874053955, |
| "learning_rate": 6.354202340715027e-06, |
| "loss": 0.4524, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.43094326554954887, |
| "grad_norm": 1.7672978639602661, |
| "learning_rate": 6.344599103076329e-06, |
| "loss": 0.4707, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.43094326554954887, |
| "eval_loss": 0.45585593581199646, |
| "eval_runtime": 119.2986, |
| "eval_samples_per_second": 35.214, |
| "eval_steps_per_second": 4.409, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4315588987860482, |
| "grad_norm": 2.1754207611083984, |
| "learning_rate": 6.334990515714548e-06, |
| "loss": 0.4533, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.4321745320225476, |
| "grad_norm": 2.133251190185547, |
| "learning_rate": 6.3253766168591315e-06, |
| "loss": 0.4589, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.4327901652590469, |
| "grad_norm": 1.9236661195755005, |
| "learning_rate": 6.315757444760659e-06, |
| "loss": 0.4455, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.4334057984955463, |
| "grad_norm": 1.8091168403625488, |
| "learning_rate": 6.306133037690693e-06, |
| "loss": 0.4474, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.43402143173204566, |
| "grad_norm": 1.7465119361877441, |
| "learning_rate": 6.296503433941622e-06, |
| "loss": 0.4715, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.434637064968545, |
| "grad_norm": 2.0189712047576904, |
| "learning_rate": 6.286868671826513e-06, |
| "loss": 0.4935, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.43525269820504436, |
| "grad_norm": 1.897495985031128, |
| "learning_rate": 6.277228789678953e-06, |
| "loss": 0.4634, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.4358683314415437, |
| "grad_norm": 1.7837578058242798, |
| "learning_rate": 6.2675838258529054e-06, |
| "loss": 0.438, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.43648396467804307, |
| "grad_norm": 1.9282805919647217, |
| "learning_rate": 6.257933818722544e-06, |
| "loss": 0.4495, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.4370995979145424, |
| "grad_norm": 1.834639072418213, |
| "learning_rate": 6.248278806682114e-06, |
| "loss": 0.4832, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.4377152311510418, |
| "grad_norm": 1.8320832252502441, |
| "learning_rate": 6.238618828145775e-06, |
| "loss": 0.4619, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.4383308643875411, |
| "grad_norm": 1.6084938049316406, |
| "learning_rate": 6.228953921547441e-06, |
| "loss": 0.4203, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.4389464976240405, |
| "grad_norm": 1.7636457681655884, |
| "learning_rate": 6.219284125340637e-06, |
| "loss": 0.4569, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.43956213086053986, |
| "grad_norm": 1.8105461597442627, |
| "learning_rate": 6.209609477998339e-06, |
| "loss": 0.4535, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.4401777640970392, |
| "grad_norm": 1.876543641090393, |
| "learning_rate": 6.19993001801283e-06, |
| "loss": 0.4581, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.44079339733353856, |
| "grad_norm": 1.8101094961166382, |
| "learning_rate": 6.190245783895537e-06, |
| "loss": 0.4716, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.4414090305700379, |
| "grad_norm": 1.9081860780715942, |
| "learning_rate": 6.180556814176878e-06, |
| "loss": 0.4828, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.44202466380653727, |
| "grad_norm": 1.996167540550232, |
| "learning_rate": 6.17086314740612e-06, |
| "loss": 0.4573, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.4426402970430366, |
| "grad_norm": 2.116783618927002, |
| "learning_rate": 6.161164822151213e-06, |
| "loss": 0.4308, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.44325593027953597, |
| "grad_norm": 1.809017300605774, |
| "learning_rate": 6.151461876998643e-06, |
| "loss": 0.4475, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4438715635160353, |
| "grad_norm": 1.7881202697753906, |
| "learning_rate": 6.141754350553279e-06, |
| "loss": 0.4379, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.4444871967525347, |
| "grad_norm": 1.7520116567611694, |
| "learning_rate": 6.1320422814382145e-06, |
| "loss": 0.424, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.44510282998903405, |
| "grad_norm": 1.9185141324996948, |
| "learning_rate": 6.122325708294615e-06, |
| "loss": 0.4646, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.4457184632255334, |
| "grad_norm": 1.9544090032577515, |
| "learning_rate": 6.112604669781572e-06, |
| "loss": 0.4444, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.44633409646203276, |
| "grad_norm": 1.8842326402664185, |
| "learning_rate": 6.102879204575941e-06, |
| "loss": 0.4264, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.4469497296985321, |
| "grad_norm": 1.85416841506958, |
| "learning_rate": 6.093149351372186e-06, |
| "loss": 0.4688, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.44756536293503146, |
| "grad_norm": 1.716825008392334, |
| "learning_rate": 6.083415148882236e-06, |
| "loss": 0.4453, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.4481809961715308, |
| "grad_norm": 1.9674599170684814, |
| "learning_rate": 6.073676635835317e-06, |
| "loss": 0.4903, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.44879662940803017, |
| "grad_norm": 1.8208335638046265, |
| "learning_rate": 6.063933850977811e-06, |
| "loss": 0.4369, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.44941226264452955, |
| "grad_norm": 1.7994003295898438, |
| "learning_rate": 6.054186833073096e-06, |
| "loss": 0.4595, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.45002789588102887, |
| "grad_norm": 1.9228452444076538, |
| "learning_rate": 6.044435620901388e-06, |
| "loss": 0.4366, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.45064352911752825, |
| "grad_norm": 1.9610226154327393, |
| "learning_rate": 6.034680253259594e-06, |
| "loss": 0.4463, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.4512591623540276, |
| "grad_norm": 1.9020127058029175, |
| "learning_rate": 6.024920768961153e-06, |
| "loss": 0.4667, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.45187479559052696, |
| "grad_norm": 1.8535819053649902, |
| "learning_rate": 6.015157206835881e-06, |
| "loss": 0.4452, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.4524904288270263, |
| "grad_norm": 1.96487557888031, |
| "learning_rate": 6.005389605729824e-06, |
| "loss": 0.469, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.45310606206352566, |
| "grad_norm": 1.8634556531906128, |
| "learning_rate": 5.995618004505091e-06, |
| "loss": 0.4395, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.453721695300025, |
| "grad_norm": 1.9039191007614136, |
| "learning_rate": 5.985842442039712e-06, |
| "loss": 0.4955, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.45433732853652437, |
| "grad_norm": 1.8303287029266357, |
| "learning_rate": 5.976062957227472e-06, |
| "loss": 0.459, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.45495296177302375, |
| "grad_norm": 2.0153088569641113, |
| "learning_rate": 5.9662795889777666e-06, |
| "loss": 0.4635, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.45556859500952307, |
| "grad_norm": 1.7339246273040771, |
| "learning_rate": 5.956492376215439e-06, |
| "loss": 0.4648, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.45618422824602245, |
| "grad_norm": 1.8080629110336304, |
| "learning_rate": 5.946701357880632e-06, |
| "loss": 0.448, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.4567998614825218, |
| "grad_norm": 1.7795019149780273, |
| "learning_rate": 5.936906572928625e-06, |
| "loss": 0.4518, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.45741549471902115, |
| "grad_norm": 1.7698966264724731, |
| "learning_rate": 5.927108060329685e-06, |
| "loss": 0.4544, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.4580311279555205, |
| "grad_norm": 1.7339845895767212, |
| "learning_rate": 5.917305859068912e-06, |
| "loss": 0.4603, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.45864676119201986, |
| "grad_norm": 1.6085306406021118, |
| "learning_rate": 5.907500008146082e-06, |
| "loss": 0.4236, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.4592623944285192, |
| "grad_norm": 1.7907172441482544, |
| "learning_rate": 5.897690546575491e-06, |
| "loss": 0.4557, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.45987802766501856, |
| "grad_norm": 1.759875774383545, |
| "learning_rate": 5.887877513385799e-06, |
| "loss": 0.4632, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.46049366090151794, |
| "grad_norm": 2.0250210762023926, |
| "learning_rate": 5.878060947619877e-06, |
| "loss": 0.4826, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.46110929413801727, |
| "grad_norm": 1.8454856872558594, |
| "learning_rate": 5.8682408883346535e-06, |
| "loss": 0.4475, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.46172492737451665, |
| "grad_norm": 1.6851238012313843, |
| "learning_rate": 5.858417374600952e-06, |
| "loss": 0.4461, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.46234056061101597, |
| "grad_norm": 1.6859235763549805, |
| "learning_rate": 5.848590445503345e-06, |
| "loss": 0.4569, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.46295619384751535, |
| "grad_norm": 1.661718726158142, |
| "learning_rate": 5.838760140139993e-06, |
| "loss": 0.4588, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.4635718270840147, |
| "grad_norm": 1.8251081705093384, |
| "learning_rate": 5.828926497622484e-06, |
| "loss": 0.454, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.46418746032051406, |
| "grad_norm": 1.5905681848526, |
| "learning_rate": 5.819089557075689e-06, |
| "loss": 0.4269, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.46480309355701344, |
| "grad_norm": 1.7265760898590088, |
| "learning_rate": 5.809249357637601e-06, |
| "loss": 0.4491, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.46541872679351276, |
| "grad_norm": 1.8297325372695923, |
| "learning_rate": 5.799405938459175e-06, |
| "loss": 0.4729, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.46603436003001214, |
| "grad_norm": 1.8845032453536987, |
| "learning_rate": 5.78955933870418e-06, |
| "loss": 0.4238, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.46664999326651146, |
| "grad_norm": 2.0128591060638428, |
| "learning_rate": 5.779709597549037e-06, |
| "loss": 0.4643, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.46726562650301084, |
| "grad_norm": 1.8747127056121826, |
| "learning_rate": 5.769856754182668e-06, |
| "loss": 0.4703, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.46788125973951017, |
| "grad_norm": 1.7039685249328613, |
| "learning_rate": 5.760000847806337e-06, |
| "loss": 0.4492, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.46849689297600955, |
| "grad_norm": 1.7922077178955078, |
| "learning_rate": 5.750141917633491e-06, |
| "loss": 0.4566, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.4691125262125089, |
| "grad_norm": 1.7171673774719238, |
| "learning_rate": 5.740280002889613e-06, |
| "loss": 0.4379, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.46972815944900825, |
| "grad_norm": 1.6933897733688354, |
| "learning_rate": 5.730415142812059e-06, |
| "loss": 0.4393, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.47034379268550763, |
| "grad_norm": 1.9281322956085205, |
| "learning_rate": 5.720547376649901e-06, |
| "loss": 0.4508, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.47095942592200696, |
| "grad_norm": 1.9108821153640747, |
| "learning_rate": 5.710676743663777e-06, |
| "loss": 0.471, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.47157505915850634, |
| "grad_norm": 1.8311848640441895, |
| "learning_rate": 5.70080328312573e-06, |
| "loss": 0.4702, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.47219069239500566, |
| "grad_norm": 1.7278153896331787, |
| "learning_rate": 5.690927034319051e-06, |
| "loss": 0.4357, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.47280632563150504, |
| "grad_norm": 1.8962657451629639, |
| "learning_rate": 5.681048036538126e-06, |
| "loss": 0.4561, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.47342195886800437, |
| "grad_norm": 1.8439857959747314, |
| "learning_rate": 5.671166329088278e-06, |
| "loss": 0.4298, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.47403759210450375, |
| "grad_norm": 1.9815741777420044, |
| "learning_rate": 5.661281951285613e-06, |
| "loss": 0.4562, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.47465322534100307, |
| "grad_norm": 1.8740676641464233, |
| "learning_rate": 5.6513949424568585e-06, |
| "loss": 0.4719, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.47526885857750245, |
| "grad_norm": 1.8455761671066284, |
| "learning_rate": 5.641505341939212e-06, |
| "loss": 0.4527, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.47588449181400183, |
| "grad_norm": 1.7912147045135498, |
| "learning_rate": 5.631613189080178e-06, |
| "loss": 0.4582, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.47650012505050116, |
| "grad_norm": 1.8444029092788696, |
| "learning_rate": 5.621718523237427e-06, |
| "loss": 0.4389, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.47711575828700054, |
| "grad_norm": 1.8730937242507935, |
| "learning_rate": 5.611821383778614e-06, |
| "loss": 0.4286, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.47773139152349986, |
| "grad_norm": 1.6841018199920654, |
| "learning_rate": 5.601921810081243e-06, |
| "loss": 0.4222, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.47834702475999924, |
| "grad_norm": 1.817315936088562, |
| "learning_rate": 5.592019841532507e-06, |
| "loss": 0.4576, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.47896265799649856, |
| "grad_norm": 1.967796802520752, |
| "learning_rate": 5.582115517529114e-06, |
| "loss": 0.441, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.47957829123299794, |
| "grad_norm": 1.9327470064163208, |
| "learning_rate": 5.57220887747716e-06, |
| "loss": 0.4482, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.4801939244694973, |
| "grad_norm": 2.1096456050872803, |
| "learning_rate": 5.562299960791946e-06, |
| "loss": 0.4721, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.48080955770599665, |
| "grad_norm": 1.8861747980117798, |
| "learning_rate": 5.55238880689783e-06, |
| "loss": 0.4171, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.48142519094249603, |
| "grad_norm": 1.9005745649337769, |
| "learning_rate": 5.542475455228077e-06, |
| "loss": 0.4449, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.48204082417899535, |
| "grad_norm": 1.7999687194824219, |
| "learning_rate": 5.532559945224692e-06, |
| "loss": 0.429, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.48265645741549473, |
| "grad_norm": 2.0499937534332275, |
| "learning_rate": 5.522642316338268e-06, |
| "loss": 0.4536, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.48327209065199406, |
| "grad_norm": 1.913791537284851, |
| "learning_rate": 5.51272260802783e-06, |
| "loss": 0.4458, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.48388772388849344, |
| "grad_norm": 1.9510321617126465, |
| "learning_rate": 5.502800859760676e-06, |
| "loss": 0.4638, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.48450335712499276, |
| "grad_norm": 1.9328547716140747, |
| "learning_rate": 5.4928771110122185e-06, |
| "loss": 0.4792, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.48511899036149214, |
| "grad_norm": 1.892099380493164, |
| "learning_rate": 5.48295140126583e-06, |
| "loss": 0.4676, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.4857346235979915, |
| "grad_norm": 1.8072856664657593, |
| "learning_rate": 5.473023770012686e-06, |
| "loss": 0.4944, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.48635025683449085, |
| "grad_norm": 1.7051112651824951, |
| "learning_rate": 5.463094256751608e-06, |
| "loss": 0.4385, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.4869658900709902, |
| "grad_norm": 1.9088255167007446, |
| "learning_rate": 5.453162900988902e-06, |
| "loss": 0.4404, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.48758152330748955, |
| "grad_norm": 1.7755197286605835, |
| "learning_rate": 5.443229742238207e-06, |
| "loss": 0.4596, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.48819715654398893, |
| "grad_norm": 2.002506732940674, |
| "learning_rate": 5.433294820020335e-06, |
| "loss": 0.4381, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.48881278978048825, |
| "grad_norm": 1.7047615051269531, |
| "learning_rate": 5.423358173863117e-06, |
| "loss": 0.4504, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.48942842301698763, |
| "grad_norm": 1.7884583473205566, |
| "learning_rate": 5.413419843301238e-06, |
| "loss": 0.4247, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.49004405625348696, |
| "grad_norm": 1.8708890676498413, |
| "learning_rate": 5.403479867876087e-06, |
| "loss": 0.4512, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.49065968948998634, |
| "grad_norm": 1.8814146518707275, |
| "learning_rate": 5.3935382871356004e-06, |
| "loss": 0.4624, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.4912753227264857, |
| "grad_norm": 1.8110054731369019, |
| "learning_rate": 5.383595140634093e-06, |
| "loss": 0.428, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.49189095596298504, |
| "grad_norm": 1.8073616027832031, |
| "learning_rate": 5.373650467932122e-06, |
| "loss": 0.4319, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.4925065891994844, |
| "grad_norm": 2.0299525260925293, |
| "learning_rate": 5.363704308596306e-06, |
| "loss": 0.4161, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.4925065891994844, |
| "eval_loss": 0.4422464072704315, |
| "eval_runtime": 118.6207, |
| "eval_samples_per_second": 35.415, |
| "eval_steps_per_second": 4.434, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.49312222243598375, |
| "grad_norm": 1.728995442390442, |
| "learning_rate": 5.3537567021991825e-06, |
| "loss": 0.4256, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.49373785567248313, |
| "grad_norm": 1.9535176753997803, |
| "learning_rate": 5.343807688319047e-06, |
| "loss": 0.4604, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.49435348890898245, |
| "grad_norm": 1.6464016437530518, |
| "learning_rate": 5.3338573065397936e-06, |
| "loss": 0.4201, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.49496912214548183, |
| "grad_norm": 1.7924641370773315, |
| "learning_rate": 5.323905596450759e-06, |
| "loss": 0.4414, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.4955847553819812, |
| "grad_norm": 1.6692326068878174, |
| "learning_rate": 5.3139525976465675e-06, |
| "loss": 0.4621, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.49620038861848054, |
| "grad_norm": 1.7795913219451904, |
| "learning_rate": 5.303998349726966e-06, |
| "loss": 0.4365, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.4968160218549799, |
| "grad_norm": 1.6701775789260864, |
| "learning_rate": 5.294042892296675e-06, |
| "loss": 0.4358, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.49743165509147924, |
| "grad_norm": 1.8857059478759766, |
| "learning_rate": 5.284086264965224e-06, |
| "loss": 0.4596, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.4980472883279786, |
| "grad_norm": 1.9470977783203125, |
| "learning_rate": 5.274128507346801e-06, |
| "loss": 0.4549, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.49866292156447795, |
| "grad_norm": 1.7852569818496704, |
| "learning_rate": 5.264169659060087e-06, |
| "loss": 0.4562, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.4992785548009773, |
| "grad_norm": 2.10684871673584, |
| "learning_rate": 5.2542097597281095e-06, |
| "loss": 0.4646, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.49989418803747665, |
| "grad_norm": 1.738823413848877, |
| "learning_rate": 5.244248848978067e-06, |
| "loss": 0.4538, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.500509821273976, |
| "grad_norm": 1.7574955224990845, |
| "learning_rate": 5.234286966441191e-06, |
| "loss": 0.4268, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.5011254545104754, |
| "grad_norm": 1.9219170808792114, |
| "learning_rate": 5.224324151752575e-06, |
| "loss": 0.4719, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.5017410877469748, |
| "grad_norm": 1.7875138521194458, |
| "learning_rate": 5.214360444551024e-06, |
| "loss": 0.4516, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.5023567209834741, |
| "grad_norm": 1.7680673599243164, |
| "learning_rate": 5.2043958844788925e-06, |
| "loss": 0.4378, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.5029723542199734, |
| "grad_norm": 1.9988130331039429, |
| "learning_rate": 5.194430511181925e-06, |
| "loss": 0.4332, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.5035879874564728, |
| "grad_norm": 1.7063559293746948, |
| "learning_rate": 5.184464364309109e-06, |
| "loss": 0.4367, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.5042036206929722, |
| "grad_norm": 1.628308653831482, |
| "learning_rate": 5.174497483512506e-06, |
| "loss": 0.4326, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.5048192539294715, |
| "grad_norm": 1.7696188688278198, |
| "learning_rate": 5.1645299084470936e-06, |
| "loss": 0.4391, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5054348871659708, |
| "grad_norm": 1.741668462753296, |
| "learning_rate": 5.1545616787706186e-06, |
| "loss": 0.4462, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.5060505204024702, |
| "grad_norm": 1.745593786239624, |
| "learning_rate": 5.144592834143427e-06, |
| "loss": 0.426, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.5066661536389696, |
| "grad_norm": 1.8672295808792114, |
| "learning_rate": 5.134623414228315e-06, |
| "loss": 0.4332, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.507281786875469, |
| "grad_norm": 1.8388646841049194, |
| "learning_rate": 5.1246534586903655e-06, |
| "loss": 0.4686, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.5078974201119683, |
| "grad_norm": 1.799299955368042, |
| "learning_rate": 5.114683007196793e-06, |
| "loss": 0.4461, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.5085130533484676, |
| "grad_norm": 1.7791926860809326, |
| "learning_rate": 5.1047120994167855e-06, |
| "loss": 0.4314, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.509128686584967, |
| "grad_norm": 1.9311397075653076, |
| "learning_rate": 5.094740775021348e-06, |
| "loss": 0.4418, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.5097443198214664, |
| "grad_norm": 1.6415823698043823, |
| "learning_rate": 5.084769073683138e-06, |
| "loss": 0.4164, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.5103599530579657, |
| "grad_norm": 1.9113596677780151, |
| "learning_rate": 5.074797035076319e-06, |
| "loss": 0.4284, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.510975586294465, |
| "grad_norm": 1.8482359647750854, |
| "learning_rate": 5.064824698876393e-06, |
| "loss": 0.4498, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5115912195309644, |
| "grad_norm": 1.7767953872680664, |
| "learning_rate": 5.0548521047600465e-06, |
| "loss": 0.4228, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.5122068527674638, |
| "grad_norm": 1.8928037881851196, |
| "learning_rate": 5.04487929240499e-06, |
| "loss": 0.4414, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.5128224860039632, |
| "grad_norm": 1.6631807088851929, |
| "learning_rate": 5.034906301489808e-06, |
| "loss": 0.4069, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.5134381192404625, |
| "grad_norm": 1.8700741529464722, |
| "learning_rate": 5.024933171693791e-06, |
| "loss": 0.4274, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.5140537524769618, |
| "grad_norm": 1.8563249111175537, |
| "learning_rate": 5.014959942696782e-06, |
| "loss": 0.4591, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.5146693857134612, |
| "grad_norm": 1.7255792617797852, |
| "learning_rate": 5.00498665417902e-06, |
| "loss": 0.4255, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.5152850189499606, |
| "grad_norm": 1.871659517288208, |
| "learning_rate": 4.995013345820982e-06, |
| "loss": 0.495, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.5159006521864599, |
| "grad_norm": 1.9164929389953613, |
| "learning_rate": 4.98504005730322e-06, |
| "loss": 0.4417, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.5165162854229592, |
| "grad_norm": 1.8884233236312866, |
| "learning_rate": 4.9750668283062104e-06, |
| "loss": 0.4669, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.5171319186594586, |
| "grad_norm": 1.838681697845459, |
| "learning_rate": 4.965093698510192e-06, |
| "loss": 0.4591, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.517747551895958, |
| "grad_norm": 1.8403676748275757, |
| "learning_rate": 4.955120707595011e-06, |
| "loss": 0.4791, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.5183631851324574, |
| "grad_norm": 1.7576521635055542, |
| "learning_rate": 4.945147895239956e-06, |
| "loss": 0.4303, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.5189788183689567, |
| "grad_norm": 1.7351022958755493, |
| "learning_rate": 4.935175301123609e-06, |
| "loss": 0.4444, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.519594451605456, |
| "grad_norm": 1.892082929611206, |
| "learning_rate": 4.9252029649236835e-06, |
| "loss": 0.4557, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.5202100848419554, |
| "grad_norm": 1.8198317289352417, |
| "learning_rate": 4.915230926316864e-06, |
| "loss": 0.4411, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.5208257180784548, |
| "grad_norm": 1.693459153175354, |
| "learning_rate": 4.905259224978655e-06, |
| "loss": 0.4461, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.5214413513149542, |
| "grad_norm": 1.8052204847335815, |
| "learning_rate": 4.895287900583216e-06, |
| "loss": 0.4581, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.5220569845514534, |
| "grad_norm": 1.957545280456543, |
| "learning_rate": 4.8853169928032094e-06, |
| "loss": 0.4593, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.5226726177879528, |
| "grad_norm": 1.9859813451766968, |
| "learning_rate": 4.875346541309637e-06, |
| "loss": 0.4198, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.5232882510244522, |
| "grad_norm": 1.8037396669387817, |
| "learning_rate": 4.865376585771687e-06, |
| "loss": 0.4379, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5239038842609516, |
| "grad_norm": 1.760682463645935, |
| "learning_rate": 4.8554071658565745e-06, |
| "loss": 0.4436, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.5245195174974508, |
| "grad_norm": 1.7362818717956543, |
| "learning_rate": 4.845438321229382e-06, |
| "loss": 0.4445, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.5251351507339502, |
| "grad_norm": 1.8464460372924805, |
| "learning_rate": 4.835470091552906e-06, |
| "loss": 0.4302, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.5257507839704496, |
| "grad_norm": 1.7991427183151245, |
| "learning_rate": 4.825502516487497e-06, |
| "loss": 0.4433, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.526366417206949, |
| "grad_norm": 1.824872374534607, |
| "learning_rate": 4.815535635690892e-06, |
| "loss": 0.4563, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.5269820504434484, |
| "grad_norm": 1.8947278261184692, |
| "learning_rate": 4.805569488818077e-06, |
| "loss": 0.4206, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.5275976836799476, |
| "grad_norm": 1.9948461055755615, |
| "learning_rate": 4.795604115521109e-06, |
| "loss": 0.45, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.528213316916447, |
| "grad_norm": 1.8243073225021362, |
| "learning_rate": 4.785639555448977e-06, |
| "loss": 0.4614, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.5288289501529464, |
| "grad_norm": 1.8149497509002686, |
| "learning_rate": 4.775675848247427e-06, |
| "loss": 0.4322, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.5294445833894458, |
| "grad_norm": 1.8010841608047485, |
| "learning_rate": 4.7657130335588115e-06, |
| "loss": 0.4637, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.530060216625945, |
| "grad_norm": 1.6604598760604858, |
| "learning_rate": 4.755751151021934e-06, |
| "loss": 0.4391, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.5306758498624444, |
| "grad_norm": 1.7927626371383667, |
| "learning_rate": 4.745790240271892e-06, |
| "loss": 0.4417, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.5312914830989438, |
| "grad_norm": 1.9684895277023315, |
| "learning_rate": 4.735830340939913e-06, |
| "loss": 0.4599, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.5319071163354432, |
| "grad_norm": 1.9318605661392212, |
| "learning_rate": 4.7258714926532e-06, |
| "loss": 0.4308, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.5325227495719426, |
| "grad_norm": 1.8322875499725342, |
| "learning_rate": 4.715913735034779e-06, |
| "loss": 0.4454, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.5331383828084418, |
| "grad_norm": 1.7331942319869995, |
| "learning_rate": 4.705957107703327e-06, |
| "loss": 0.4114, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.5337540160449412, |
| "grad_norm": 1.7381324768066406, |
| "learning_rate": 4.6960016502730354e-06, |
| "loss": 0.4206, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.5343696492814406, |
| "grad_norm": 1.6991071701049805, |
| "learning_rate": 4.686047402353433e-06, |
| "loss": 0.4413, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.53498528251794, |
| "grad_norm": 1.9641220569610596, |
| "learning_rate": 4.676094403549241e-06, |
| "loss": 0.4381, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.5356009157544392, |
| "grad_norm": 1.9879289865493774, |
| "learning_rate": 4.666142693460208e-06, |
| "loss": 0.4182, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.5362165489909386, |
| "grad_norm": 1.7053484916687012, |
| "learning_rate": 4.6561923116809545e-06, |
| "loss": 0.413, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.536832182227438, |
| "grad_norm": 1.6974514722824097, |
| "learning_rate": 4.646243297800818e-06, |
| "loss": 0.4231, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.5374478154639374, |
| "grad_norm": 1.9661206007003784, |
| "learning_rate": 4.636295691403696e-06, |
| "loss": 0.4395, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.5380634487004368, |
| "grad_norm": 2.0033390522003174, |
| "learning_rate": 4.626349532067879e-06, |
| "loss": 0.4263, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.538679081936936, |
| "grad_norm": 1.7665894031524658, |
| "learning_rate": 4.6164048593659076e-06, |
| "loss": 0.4375, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.5392947151734354, |
| "grad_norm": 1.7947988510131836, |
| "learning_rate": 4.606461712864403e-06, |
| "loss": 0.4257, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.5399103484099348, |
| "grad_norm": 1.890939712524414, |
| "learning_rate": 4.596520132123915e-06, |
| "loss": 0.4112, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.5405259816464342, |
| "grad_norm": 2.065849781036377, |
| "learning_rate": 4.586580156698764e-06, |
| "loss": 0.4441, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.5411416148829334, |
| "grad_norm": 1.8077067136764526, |
| "learning_rate": 4.576641826136884e-06, |
| "loss": 0.3964, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.5417572481194328, |
| "grad_norm": 1.7111256122589111, |
| "learning_rate": 4.566705179979665e-06, |
| "loss": 0.4366, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.5423728813559322, |
| "grad_norm": 2.0317494869232178, |
| "learning_rate": 4.556770257761794e-06, |
| "loss": 0.4585, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.5429885145924316, |
| "grad_norm": 1.7759976387023926, |
| "learning_rate": 4.546837099011101e-06, |
| "loss": 0.4066, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.543604147828931, |
| "grad_norm": 1.768035888671875, |
| "learning_rate": 4.536905743248394e-06, |
| "loss": 0.4295, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.5442197810654302, |
| "grad_norm": 1.805371880531311, |
| "learning_rate": 4.526976229987315e-06, |
| "loss": 0.4489, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.5448354143019296, |
| "grad_norm": 1.6467334032058716, |
| "learning_rate": 4.517048598734171e-06, |
| "loss": 0.4053, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.545451047538429, |
| "grad_norm": 1.7442387342453003, |
| "learning_rate": 4.507122888987782e-06, |
| "loss": 0.4275, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.5460666807749284, |
| "grad_norm": 1.9833625555038452, |
| "learning_rate": 4.497199140239326e-06, |
| "loss": 0.4501, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.5466823140114276, |
| "grad_norm": 1.791882872581482, |
| "learning_rate": 4.487277391972171e-06, |
| "loss": 0.4346, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.547297947247927, |
| "grad_norm": 1.6458256244659424, |
| "learning_rate": 4.477357683661734e-06, |
| "loss": 0.3962, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.5479135804844264, |
| "grad_norm": 1.7302979230880737, |
| "learning_rate": 4.467440054775311e-06, |
| "loss": 0.4273, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.5485292137209258, |
| "grad_norm": 1.7547414302825928, |
| "learning_rate": 4.457524544771925e-06, |
| "loss": 0.4467, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.5491448469574252, |
| "grad_norm": 1.7946034669876099, |
| "learning_rate": 4.447611193102171e-06, |
| "loss": 0.4531, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.5497604801939244, |
| "grad_norm": 1.863038420677185, |
| "learning_rate": 4.437700039208056e-06, |
| "loss": 0.4395, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.5503761134304238, |
| "grad_norm": 1.7706995010375977, |
| "learning_rate": 4.427791122522841e-06, |
| "loss": 0.4301, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.5509917466669232, |
| "grad_norm": 3.401134967803955, |
| "learning_rate": 4.417884482470887e-06, |
| "loss": 0.4546, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.5516073799034226, |
| "grad_norm": 1.8848762512207031, |
| "learning_rate": 4.4079801584674955e-06, |
| "loss": 0.4771, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.5522230131399218, |
| "grad_norm": 1.8655364513397217, |
| "learning_rate": 4.398078189918756e-06, |
| "loss": 0.4374, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.5528386463764212, |
| "grad_norm": 1.812591791152954, |
| "learning_rate": 4.388178616221389e-06, |
| "loss": 0.4679, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.5534542796129206, |
| "grad_norm": 1.71664559841156, |
| "learning_rate": 4.3782814767625755e-06, |
| "loss": 0.4305, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.55406991284942, |
| "grad_norm": 2.093832015991211, |
| "learning_rate": 4.3683868109198225e-06, |
| "loss": 0.4307, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.55406991284942, |
| "eval_loss": 0.430372953414917, |
| "eval_runtime": 118.7087, |
| "eval_samples_per_second": 35.389, |
| "eval_steps_per_second": 4.431, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5546855460859194, |
| "grad_norm": 1.8110665082931519, |
| "learning_rate": 4.35849465806079e-06, |
| "loss": 0.4191, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.5553011793224186, |
| "grad_norm": 1.8144805431365967, |
| "learning_rate": 4.348605057543142e-06, |
| "loss": 0.4629, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.555916812558918, |
| "grad_norm": 1.7565912008285522, |
| "learning_rate": 4.3387180487143875e-06, |
| "loss": 0.4358, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.5565324457954174, |
| "grad_norm": 1.7798113822937012, |
| "learning_rate": 4.3288336709117246e-06, |
| "loss": 0.441, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.5571480790319168, |
| "grad_norm": 1.789359211921692, |
| "learning_rate": 4.318951963461876e-06, |
| "loss": 0.4234, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.5577637122684161, |
| "grad_norm": 1.7896876335144043, |
| "learning_rate": 4.309072965680951e-06, |
| "loss": 0.4281, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.5583793455049154, |
| "grad_norm": 1.8061071634292603, |
| "learning_rate": 4.299196716874271e-06, |
| "loss": 0.4444, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.5589949787414148, |
| "grad_norm": 1.637271523475647, |
| "learning_rate": 4.289323256336223e-06, |
| "loss": 0.443, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.5596106119779142, |
| "grad_norm": 1.7649685144424438, |
| "learning_rate": 4.279452623350101e-06, |
| "loss": 0.4477, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.5602262452144136, |
| "grad_norm": 1.7707200050354004, |
| "learning_rate": 4.269584857187942e-06, |
| "loss": 0.4373, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.5608418784509128, |
| "grad_norm": 1.8373018503189087, |
| "learning_rate": 4.259719997110388e-06, |
| "loss": 0.4122, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.5614575116874122, |
| "grad_norm": 1.7104779481887817, |
| "learning_rate": 4.24985808236651e-06, |
| "loss": 0.4227, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.5620731449239116, |
| "grad_norm": 1.7874691486358643, |
| "learning_rate": 4.239999152193664e-06, |
| "loss": 0.4341, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.562688778160411, |
| "grad_norm": 2.1757819652557373, |
| "learning_rate": 4.230143245817332e-06, |
| "loss": 0.4018, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.5633044113969103, |
| "grad_norm": 1.6143423318862915, |
| "learning_rate": 4.2202904024509635e-06, |
| "loss": 0.409, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.5639200446334096, |
| "grad_norm": 1.7682405710220337, |
| "learning_rate": 4.2104406612958216e-06, |
| "loss": 0.4158, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.564535677869909, |
| "grad_norm": 1.7131983041763306, |
| "learning_rate": 4.200594061540827e-06, |
| "loss": 0.4337, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.5651513111064084, |
| "grad_norm": 1.7463741302490234, |
| "learning_rate": 4.1907506423624006e-06, |
| "loss": 0.4275, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.5657669443429078, |
| "grad_norm": 1.743363857269287, |
| "learning_rate": 4.180910442924312e-06, |
| "loss": 0.4138, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.566382577579407, |
| "grad_norm": 1.5896950960159302, |
| "learning_rate": 4.171073502377519e-06, |
| "loss": 0.39, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.5669982108159064, |
| "grad_norm": 1.815305471420288, |
| "learning_rate": 4.16123985986001e-06, |
| "loss": 0.4447, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.5676138440524058, |
| "grad_norm": 1.758428931236267, |
| "learning_rate": 4.1514095544966556e-06, |
| "loss": 0.4344, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.5682294772889052, |
| "grad_norm": 1.9296027421951294, |
| "learning_rate": 4.141582625399049e-06, |
| "loss": 0.4423, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.5688451105254045, |
| "grad_norm": 2.148974895477295, |
| "learning_rate": 4.131759111665349e-06, |
| "loss": 0.4594, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.5694607437619038, |
| "grad_norm": 1.6873085498809814, |
| "learning_rate": 4.121939052380125e-06, |
| "loss": 0.4355, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.5700763769984032, |
| "grad_norm": 1.6905049085617065, |
| "learning_rate": 4.112122486614204e-06, |
| "loss": 0.4036, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.5706920102349026, |
| "grad_norm": 1.883623719215393, |
| "learning_rate": 4.102309453424511e-06, |
| "loss": 0.4391, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.571307643471402, |
| "grad_norm": 1.7885838747024536, |
| "learning_rate": 4.092499991853919e-06, |
| "loss": 0.435, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.5719232767079012, |
| "grad_norm": 1.7965201139450073, |
| "learning_rate": 4.0826941409310885e-06, |
| "loss": 0.4319, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.5725389099444006, |
| "grad_norm": 1.7711795568466187, |
| "learning_rate": 4.072891939670317e-06, |
| "loss": 0.4279, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.5731545431809, |
| "grad_norm": 1.7697465419769287, |
| "learning_rate": 4.063093427071376e-06, |
| "loss": 0.4278, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.5737701764173994, |
| "grad_norm": 1.721031904220581, |
| "learning_rate": 4.05329864211937e-06, |
| "loss": 0.4315, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.5743858096538987, |
| "grad_norm": 1.8830902576446533, |
| "learning_rate": 4.043507623784562e-06, |
| "loss": 0.4151, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.575001442890398, |
| "grad_norm": 1.7482315301895142, |
| "learning_rate": 4.033720411022235e-06, |
| "loss": 0.3916, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.5756170761268974, |
| "grad_norm": 1.5488115549087524, |
| "learning_rate": 4.023937042772529e-06, |
| "loss": 0.415, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.5762327093633968, |
| "grad_norm": 1.8000409603118896, |
| "learning_rate": 4.014157557960289e-06, |
| "loss": 0.4446, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.5768483425998961, |
| "grad_norm": 1.7415274381637573, |
| "learning_rate": 4.0043819954949105e-06, |
| "loss": 0.4254, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.5774639758363954, |
| "grad_norm": 1.8254972696304321, |
| "learning_rate": 3.994610394270178e-06, |
| "loss": 0.4395, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.5780796090728948, |
| "grad_norm": 1.7846405506134033, |
| "learning_rate": 3.98484279316412e-06, |
| "loss": 0.432, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.5786952423093942, |
| "grad_norm": 1.882558822631836, |
| "learning_rate": 3.975079231038848e-06, |
| "loss": 0.4496, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.5793108755458936, |
| "grad_norm": 1.6487857103347778, |
| "learning_rate": 3.965319746740407e-06, |
| "loss": 0.4255, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.5799265087823929, |
| "grad_norm": 1.9286268949508667, |
| "learning_rate": 3.955564379098613e-06, |
| "loss": 0.4313, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.5805421420188922, |
| "grad_norm": 1.7310795783996582, |
| "learning_rate": 3.9458131669269066e-06, |
| "loss": 0.4536, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.5811577752553916, |
| "grad_norm": 1.6216704845428467, |
| "learning_rate": 3.936066149022191e-06, |
| "loss": 0.4125, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.581773408491891, |
| "grad_norm": 1.9235811233520508, |
| "learning_rate": 3.926323364164684e-06, |
| "loss": 0.4337, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.5823890417283903, |
| "grad_norm": 1.5660535097122192, |
| "learning_rate": 3.916584851117766e-06, |
| "loss": 0.4164, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.5830046749648896, |
| "grad_norm": 1.82460355758667, |
| "learning_rate": 3.906850648627814e-06, |
| "loss": 0.4192, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.583620308201389, |
| "grad_norm": 1.855008840560913, |
| "learning_rate": 3.897120795424062e-06, |
| "loss": 0.4453, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.5842359414378884, |
| "grad_norm": 1.7206461429595947, |
| "learning_rate": 3.887395330218429e-06, |
| "loss": 0.4161, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.5848515746743878, |
| "grad_norm": 1.8171783685684204, |
| "learning_rate": 3.877674291705386e-06, |
| "loss": 0.4232, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.5854672079108871, |
| "grad_norm": 1.8228973150253296, |
| "learning_rate": 3.867957718561787e-06, |
| "loss": 0.425, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.5860828411473864, |
| "grad_norm": 1.7684314250946045, |
| "learning_rate": 3.8582456494467214e-06, |
| "loss": 0.4122, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.5866984743838858, |
| "grad_norm": 1.9155278205871582, |
| "learning_rate": 3.848538123001356e-06, |
| "loss": 0.4661, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.5873141076203852, |
| "grad_norm": 1.8282909393310547, |
| "learning_rate": 3.8388351778487884e-06, |
| "loss": 0.4492, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.5879297408568845, |
| "grad_norm": 1.8336620330810547, |
| "learning_rate": 3.829136852593881e-06, |
| "loss": 0.4211, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.5885453740933839, |
| "grad_norm": 1.7195727825164795, |
| "learning_rate": 3.8194431858231226e-06, |
| "loss": 0.4237, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.5891610073298832, |
| "grad_norm": 1.6625844240188599, |
| "learning_rate": 3.8097542161044653e-06, |
| "loss": 0.4292, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.5897766405663826, |
| "grad_norm": 1.9300004243850708, |
| "learning_rate": 3.8000699819871704e-06, |
| "loss": 0.4464, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.590392273802882, |
| "grad_norm": 1.8018397092819214, |
| "learning_rate": 3.790390522001662e-06, |
| "loss": 0.4212, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.5910079070393813, |
| "grad_norm": 1.8984296321868896, |
| "learning_rate": 3.780715874659366e-06, |
| "loss": 0.3802, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.5916235402758806, |
| "grad_norm": 1.6990512609481812, |
| "learning_rate": 3.7710460784525617e-06, |
| "loss": 0.4089, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.59223917351238, |
| "grad_norm": 1.6612354516983032, |
| "learning_rate": 3.761381171854227e-06, |
| "loss": 0.4378, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.5928548067488794, |
| "grad_norm": 1.6652956008911133, |
| "learning_rate": 3.751721193317887e-06, |
| "loss": 0.4194, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.5934704399853787, |
| "grad_norm": 1.7413960695266724, |
| "learning_rate": 3.7420661812774577e-06, |
| "loss": 0.4403, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.5940860732218781, |
| "grad_norm": 1.7049193382263184, |
| "learning_rate": 3.7324161741470975e-06, |
| "loss": 0.4331, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.5947017064583774, |
| "grad_norm": 1.7444761991500854, |
| "learning_rate": 3.7227712103210485e-06, |
| "loss": 0.4205, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.5953173396948768, |
| "grad_norm": 1.8145601749420166, |
| "learning_rate": 3.7131313281734895e-06, |
| "loss": 0.4234, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.5959329729313761, |
| "grad_norm": 1.758312702178955, |
| "learning_rate": 3.7034965660583794e-06, |
| "loss": 0.4213, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.5965486061678755, |
| "grad_norm": 1.9307377338409424, |
| "learning_rate": 3.6938669623093086e-06, |
| "loss": 0.4348, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.5971642394043748, |
| "grad_norm": 1.7513355016708374, |
| "learning_rate": 3.6842425552393424e-06, |
| "loss": 0.4503, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5977798726408742, |
| "grad_norm": 1.624816656112671, |
| "learning_rate": 3.6746233831408706e-06, |
| "loss": 0.4049, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.5983955058773736, |
| "grad_norm": 1.6518126726150513, |
| "learning_rate": 3.6650094842854532e-06, |
| "loss": 0.4209, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.5990111391138729, |
| "grad_norm": 1.6552377939224243, |
| "learning_rate": 3.655400896923672e-06, |
| "loss": 0.4019, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.5996267723503723, |
| "grad_norm": 1.7360631227493286, |
| "learning_rate": 3.6457976592849753e-06, |
| "loss": 0.4301, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.6002424055868716, |
| "grad_norm": 1.5934597253799438, |
| "learning_rate": 3.636199809577524e-06, |
| "loss": 0.4172, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.600858038823371, |
| "grad_norm": 1.6730518341064453, |
| "learning_rate": 3.62660738598805e-06, |
| "loss": 0.4075, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.6014736720598703, |
| "grad_norm": 1.6485260725021362, |
| "learning_rate": 3.6170204266816854e-06, |
| "loss": 0.3908, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.6020893052963697, |
| "grad_norm": 1.7933323383331299, |
| "learning_rate": 3.6074389698018295e-06, |
| "loss": 0.4214, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.602704938532869, |
| "grad_norm": 1.7532880306243896, |
| "learning_rate": 3.5978630534699873e-06, |
| "loss": 0.4064, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.6033205717693684, |
| "grad_norm": 1.749861478805542, |
| "learning_rate": 3.5882927157856175e-06, |
| "loss": 0.3909, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6039362050058678, |
| "grad_norm": 1.7551836967468262, |
| "learning_rate": 3.578727994825988e-06, |
| "loss": 0.421, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.6045518382423671, |
| "grad_norm": 1.8642972707748413, |
| "learning_rate": 3.5691689286460172e-06, |
| "loss": 0.4359, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.6051674714788665, |
| "grad_norm": 1.601855993270874, |
| "learning_rate": 3.5596155552781207e-06, |
| "loss": 0.4009, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.6057831047153658, |
| "grad_norm": 1.6237114667892456, |
| "learning_rate": 3.550067912732069e-06, |
| "loss": 0.4126, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.6063987379518652, |
| "grad_norm": 1.7239121198654175, |
| "learning_rate": 3.540526038994834e-06, |
| "loss": 0.414, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.6070143711883645, |
| "grad_norm": 1.7628268003463745, |
| "learning_rate": 3.530989972030428e-06, |
| "loss": 0.4464, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.6076300044248639, |
| "grad_norm": 1.675179362297058, |
| "learning_rate": 3.521459749779769e-06, |
| "loss": 0.3922, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.6082456376613632, |
| "grad_norm": 1.7558987140655518, |
| "learning_rate": 3.5119354101605086e-06, |
| "loss": 0.4214, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.6088612708978626, |
| "grad_norm": 1.877814531326294, |
| "learning_rate": 3.502416991066904e-06, |
| "loss": 0.4259, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.609476904134362, |
| "grad_norm": 1.7786641120910645, |
| "learning_rate": 3.492904530369655e-06, |
| "loss": 0.4338, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6100925373708613, |
| "grad_norm": 1.812157154083252, |
| "learning_rate": 3.4833980659157507e-06, |
| "loss": 0.4233, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.6107081706073607, |
| "grad_norm": 1.7805922031402588, |
| "learning_rate": 3.4738976355283257e-06, |
| "loss": 0.4639, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.61132380384386, |
| "grad_norm": 1.6990723609924316, |
| "learning_rate": 3.464403277006508e-06, |
| "loss": 0.4129, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.6119394370803594, |
| "grad_norm": 1.7403292655944824, |
| "learning_rate": 3.4549150281252635e-06, |
| "loss": 0.429, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.6125550703168587, |
| "grad_norm": 1.8647472858428955, |
| "learning_rate": 3.4454329266352543e-06, |
| "loss": 0.4163, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.6131707035533581, |
| "grad_norm": 1.6785216331481934, |
| "learning_rate": 3.435957010262682e-06, |
| "loss": 0.4122, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.6137863367898574, |
| "grad_norm": 1.7292088270187378, |
| "learning_rate": 3.4264873167091405e-06, |
| "loss": 0.4378, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.6144019700263568, |
| "grad_norm": 1.7312982082366943, |
| "learning_rate": 3.4170238836514645e-06, |
| "loss": 0.3968, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.6150176032628561, |
| "grad_norm": 1.7593203783035278, |
| "learning_rate": 3.4075667487415785e-06, |
| "loss": 0.4182, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.6156332364993555, |
| "grad_norm": 1.7004156112670898, |
| "learning_rate": 3.398115949606352e-06, |
| "loss": 0.3914, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6156332364993555, |
| "eval_loss": 0.41942450404167175, |
| "eval_runtime": 118.6467, |
| "eval_samples_per_second": 35.408, |
| "eval_steps_per_second": 4.433, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6162488697358549, |
| "grad_norm": 1.7762300968170166, |
| "learning_rate": 3.3886715238474454e-06, |
| "loss": 0.4441, |
| "step": 1001 |
| }, |
| { |
| "epoch": 0.6168645029723542, |
| "grad_norm": 1.6758389472961426, |
| "learning_rate": 3.37923350904116e-06, |
| "loss": 0.4116, |
| "step": 1002 |
| }, |
| { |
| "epoch": 0.6174801362088536, |
| "grad_norm": 1.741321325302124, |
| "learning_rate": 3.3698019427382912e-06, |
| "loss": 0.4383, |
| "step": 1003 |
| }, |
| { |
| "epoch": 0.6180957694453529, |
| "grad_norm": 1.7947500944137573, |
| "learning_rate": 3.3603768624639786e-06, |
| "loss": 0.4261, |
| "step": 1004 |
| }, |
| { |
| "epoch": 0.6187114026818523, |
| "grad_norm": 1.74982750415802, |
| "learning_rate": 3.3509583057175547e-06, |
| "loss": 0.4293, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.6193270359183516, |
| "grad_norm": 1.919725775718689, |
| "learning_rate": 3.341546309972398e-06, |
| "loss": 0.4156, |
| "step": 1006 |
| }, |
| { |
| "epoch": 0.619942669154851, |
| "grad_norm": 2.013151168823242, |
| "learning_rate": 3.3321409126757807e-06, |
| "loss": 0.4306, |
| "step": 1007 |
| }, |
| { |
| "epoch": 0.6205583023913503, |
| "grad_norm": 1.8035351037979126, |
| "learning_rate": 3.322742151248726e-06, |
| "loss": 0.4317, |
| "step": 1008 |
| }, |
| { |
| "epoch": 0.6211739356278497, |
| "grad_norm": 1.722594141960144, |
| "learning_rate": 3.3133500630858507e-06, |
| "loss": 0.3966, |
| "step": 1009 |
| }, |
| { |
| "epoch": 0.6217895688643491, |
| "grad_norm": 1.719792366027832, |
| "learning_rate": 3.3039646855552243e-06, |
| "loss": 0.391, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6224052021008484, |
| "grad_norm": 1.7098640203475952, |
| "learning_rate": 3.2945860559982153e-06, |
| "loss": 0.3938, |
| "step": 1011 |
| }, |
| { |
| "epoch": 0.6230208353373478, |
| "grad_norm": 1.6516406536102295, |
| "learning_rate": 3.2852142117293435e-06, |
| "loss": 0.4288, |
| "step": 1012 |
| }, |
| { |
| "epoch": 0.6236364685738471, |
| "grad_norm": 1.7048057317733765, |
| "learning_rate": 3.275849190036133e-06, |
| "loss": 0.4214, |
| "step": 1013 |
| }, |
| { |
| "epoch": 0.6242521018103465, |
| "grad_norm": 1.7411932945251465, |
| "learning_rate": 3.266491028178964e-06, |
| "loss": 0.4102, |
| "step": 1014 |
| }, |
| { |
| "epoch": 0.6248677350468459, |
| "grad_norm": 1.5722557306289673, |
| "learning_rate": 3.2571397633909252e-06, |
| "loss": 0.4009, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.6254833682833452, |
| "grad_norm": 1.5852289199829102, |
| "learning_rate": 3.2477954328776574e-06, |
| "loss": 0.3832, |
| "step": 1016 |
| }, |
| { |
| "epoch": 0.6260990015198445, |
| "grad_norm": 1.6573041677474976, |
| "learning_rate": 3.2384580738172185e-06, |
| "loss": 0.4145, |
| "step": 1017 |
| }, |
| { |
| "epoch": 0.6267146347563439, |
| "grad_norm": 1.798135757446289, |
| "learning_rate": 3.229127723359927e-06, |
| "loss": 0.4606, |
| "step": 1018 |
| }, |
| { |
| "epoch": 0.6273302679928433, |
| "grad_norm": 1.8369131088256836, |
| "learning_rate": 3.219804418628216e-06, |
| "loss": 0.4171, |
| "step": 1019 |
| }, |
| { |
| "epoch": 0.6279459012293426, |
| "grad_norm": 1.6728371381759644, |
| "learning_rate": 3.2104881967164886e-06, |
| "loss": 0.3942, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.628561534465842, |
| "grad_norm": 1.8083055019378662, |
| "learning_rate": 3.2011790946909673e-06, |
| "loss": 0.448, |
| "step": 1021 |
| }, |
| { |
| "epoch": 0.6291771677023413, |
| "grad_norm": 1.6701570749282837, |
| "learning_rate": 3.1918771495895395e-06, |
| "loss": 0.4179, |
| "step": 1022 |
| }, |
| { |
| "epoch": 0.6297928009388407, |
| "grad_norm": 1.777587652206421, |
| "learning_rate": 3.1825823984216264e-06, |
| "loss": 0.4296, |
| "step": 1023 |
| }, |
| { |
| "epoch": 0.6304084341753401, |
| "grad_norm": 1.6975241899490356, |
| "learning_rate": 3.173294878168025e-06, |
| "loss": 0.3934, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.6310240674118394, |
| "grad_norm": 1.6508057117462158, |
| "learning_rate": 3.1640146257807604e-06, |
| "loss": 0.3989, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.6316397006483387, |
| "grad_norm": 1.7512353658676147, |
| "learning_rate": 3.154741678182945e-06, |
| "loss": 0.4084, |
| "step": 1026 |
| }, |
| { |
| "epoch": 0.6322553338848381, |
| "grad_norm": 1.6738899946212769, |
| "learning_rate": 3.1454760722686206e-06, |
| "loss": 0.4062, |
| "step": 1027 |
| }, |
| { |
| "epoch": 0.6328709671213375, |
| "grad_norm": 1.9070992469787598, |
| "learning_rate": 3.1362178449026246e-06, |
| "loss": 0.4404, |
| "step": 1028 |
| }, |
| { |
| "epoch": 0.6334866003578368, |
| "grad_norm": 1.5783123970031738, |
| "learning_rate": 3.12696703292044e-06, |
| "loss": 0.4073, |
| "step": 1029 |
| }, |
| { |
| "epoch": 0.6341022335943362, |
| "grad_norm": 1.8654526472091675, |
| "learning_rate": 3.11772367312804e-06, |
| "loss": 0.4228, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.6347178668308355, |
| "grad_norm": 1.5824333429336548, |
| "learning_rate": 3.1084878023017517e-06, |
| "loss": 0.3902, |
| "step": 1031 |
| }, |
| { |
| "epoch": 0.6353335000673349, |
| "grad_norm": 1.7520995140075684, |
| "learning_rate": 3.0992594571881056e-06, |
| "loss": 0.4149, |
| "step": 1032 |
| }, |
| { |
| "epoch": 0.6359491333038343, |
| "grad_norm": 1.7872405052185059, |
| "learning_rate": 3.090038674503688e-06, |
| "loss": 0.4332, |
| "step": 1033 |
| }, |
| { |
| "epoch": 0.6365647665403336, |
| "grad_norm": 1.7681427001953125, |
| "learning_rate": 3.0808254909349987e-06, |
| "loss": 0.404, |
| "step": 1034 |
| }, |
| { |
| "epoch": 0.6371803997768329, |
| "grad_norm": 1.7980319261550903, |
| "learning_rate": 3.071619943138303e-06, |
| "loss": 0.4751, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.6377960330133323, |
| "grad_norm": 1.615838646888733, |
| "learning_rate": 3.0624220677394854e-06, |
| "loss": 0.3881, |
| "step": 1036 |
| }, |
| { |
| "epoch": 0.6384116662498317, |
| "grad_norm": 1.9394729137420654, |
| "learning_rate": 3.0532319013339053e-06, |
| "loss": 0.3998, |
| "step": 1037 |
| }, |
| { |
| "epoch": 0.639027299486331, |
| "grad_norm": 1.7381932735443115, |
| "learning_rate": 3.044049480486247e-06, |
| "loss": 0.4133, |
| "step": 1038 |
| }, |
| { |
| "epoch": 0.6396429327228303, |
| "grad_norm": 2.063239097595215, |
| "learning_rate": 3.0348748417303826e-06, |
| "loss": 0.4368, |
| "step": 1039 |
| }, |
| { |
| "epoch": 0.6402585659593297, |
| "grad_norm": 1.744970679283142, |
| "learning_rate": 3.025708021569219e-06, |
| "loss": 0.4323, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.6408741991958291, |
| "grad_norm": 1.686367154121399, |
| "learning_rate": 3.016549056474557e-06, |
| "loss": 0.4231, |
| "step": 1041 |
| }, |
| { |
| "epoch": 0.6414898324323285, |
| "grad_norm": 1.6732068061828613, |
| "learning_rate": 3.007397982886942e-06, |
| "loss": 0.3948, |
| "step": 1042 |
| }, |
| { |
| "epoch": 0.6421054656688278, |
| "grad_norm": 1.747349500656128, |
| "learning_rate": 2.9982548372155264e-06, |
| "loss": 0.447, |
| "step": 1043 |
| }, |
| { |
| "epoch": 0.6427210989053271, |
| "grad_norm": 1.8045332431793213, |
| "learning_rate": 2.989119655837913e-06, |
| "loss": 0.4287, |
| "step": 1044 |
| }, |
| { |
| "epoch": 0.6433367321418265, |
| "grad_norm": 1.688730001449585, |
| "learning_rate": 2.979992475100024e-06, |
| "loss": 0.4378, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.6439523653783259, |
| "grad_norm": 1.8829971551895142, |
| "learning_rate": 2.9708733313159464e-06, |
| "loss": 0.416, |
| "step": 1046 |
| }, |
| { |
| "epoch": 0.6445679986148252, |
| "grad_norm": 1.7018674612045288, |
| "learning_rate": 2.961762260767791e-06, |
| "loss": 0.4234, |
| "step": 1047 |
| }, |
| { |
| "epoch": 0.6451836318513245, |
| "grad_norm": 1.718910813331604, |
| "learning_rate": 2.9526592997055488e-06, |
| "loss": 0.3954, |
| "step": 1048 |
| }, |
| { |
| "epoch": 0.6457992650878239, |
| "grad_norm": 1.749856948852539, |
| "learning_rate": 2.9435644843469434e-06, |
| "loss": 0.4294, |
| "step": 1049 |
| }, |
| { |
| "epoch": 0.6464148983243233, |
| "grad_norm": 1.732338786125183, |
| "learning_rate": 2.934477850877292e-06, |
| "loss": 0.4071, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6470305315608227, |
| "grad_norm": 1.71662175655365, |
| "learning_rate": 2.9253994354493575e-06, |
| "loss": 0.3991, |
| "step": 1051 |
| }, |
| { |
| "epoch": 0.647646164797322, |
| "grad_norm": 1.7738405466079712, |
| "learning_rate": 2.916329274183206e-06, |
| "loss": 0.4385, |
| "step": 1052 |
| }, |
| { |
| "epoch": 0.6482617980338213, |
| "grad_norm": 1.7057359218597412, |
| "learning_rate": 2.9072674031660647e-06, |
| "loss": 0.4068, |
| "step": 1053 |
| }, |
| { |
| "epoch": 0.6488774312703207, |
| "grad_norm": 1.8498355150222778, |
| "learning_rate": 2.8982138584521734e-06, |
| "loss": 0.4146, |
| "step": 1054 |
| }, |
| { |
| "epoch": 0.6494930645068201, |
| "grad_norm": 1.7028414011001587, |
| "learning_rate": 2.8891686760626445e-06, |
| "loss": 0.4142, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.6501086977433194, |
| "grad_norm": 1.7531042098999023, |
| "learning_rate": 2.8801318919853237e-06, |
| "loss": 0.3989, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.6507243309798187, |
| "grad_norm": 1.7394646406173706, |
| "learning_rate": 2.871103542174637e-06, |
| "loss": 0.4068, |
| "step": 1057 |
| }, |
| { |
| "epoch": 0.6513399642163181, |
| "grad_norm": 1.8246972560882568, |
| "learning_rate": 2.8620836625514577e-06, |
| "loss": 0.4311, |
| "step": 1058 |
| }, |
| { |
| "epoch": 0.6519555974528175, |
| "grad_norm": 1.6580798625946045, |
| "learning_rate": 2.853072289002954e-06, |
| "loss": 0.4046, |
| "step": 1059 |
| }, |
| { |
| "epoch": 0.6525712306893169, |
| "grad_norm": 1.8089441061019897, |
| "learning_rate": 2.844069457382459e-06, |
| "loss": 0.4322, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.6531868639258162, |
| "grad_norm": 1.7561008930206299, |
| "learning_rate": 2.835075203509312e-06, |
| "loss": 0.407, |
| "step": 1061 |
| }, |
| { |
| "epoch": 0.6538024971623155, |
| "grad_norm": 1.74833345413208, |
| "learning_rate": 2.8260895631687267e-06, |
| "loss": 0.4334, |
| "step": 1062 |
| }, |
| { |
| "epoch": 0.6544181303988149, |
| "grad_norm": 1.6814000606536865, |
| "learning_rate": 2.817112572111651e-06, |
| "loss": 0.4253, |
| "step": 1063 |
| }, |
| { |
| "epoch": 0.6550337636353143, |
| "grad_norm": 1.8488044738769531, |
| "learning_rate": 2.8081442660546126e-06, |
| "loss": 0.4284, |
| "step": 1064 |
| }, |
| { |
| "epoch": 0.6556493968718137, |
| "grad_norm": 1.7568581104278564, |
| "learning_rate": 2.799184680679592e-06, |
| "loss": 0.3943, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.6562650301083129, |
| "grad_norm": 1.7765746116638184, |
| "learning_rate": 2.790233851633868e-06, |
| "loss": 0.4156, |
| "step": 1066 |
| }, |
| { |
| "epoch": 0.6568806633448123, |
| "grad_norm": 1.9872164726257324, |
| "learning_rate": 2.7812918145298785e-06, |
| "loss": 0.4239, |
| "step": 1067 |
| }, |
| { |
| "epoch": 0.6574962965813117, |
| "grad_norm": 1.6706302165985107, |
| "learning_rate": 2.7723586049450902e-06, |
| "loss": 0.3858, |
| "step": 1068 |
| }, |
| { |
| "epoch": 0.6581119298178111, |
| "grad_norm": 1.7720205783843994, |
| "learning_rate": 2.7634342584218364e-06, |
| "loss": 0.4312, |
| "step": 1069 |
| }, |
| { |
| "epoch": 0.6587275630543103, |
| "grad_norm": 1.5793696641921997, |
| "learning_rate": 2.7545188104671995e-06, |
| "loss": 0.3945, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.6593431962908097, |
| "grad_norm": 1.6533647775650024, |
| "learning_rate": 2.7456122965528475e-06, |
| "loss": 0.4268, |
| "step": 1071 |
| }, |
| { |
| "epoch": 0.6599588295273091, |
| "grad_norm": 1.676277756690979, |
| "learning_rate": 2.7367147521149052e-06, |
| "loss": 0.4068, |
| "step": 1072 |
| }, |
| { |
| "epoch": 0.6605744627638085, |
| "grad_norm": 1.6618707180023193, |
| "learning_rate": 2.7278262125538153e-06, |
| "loss": 0.4057, |
| "step": 1073 |
| }, |
| { |
| "epoch": 0.6611900960003079, |
| "grad_norm": 1.6752638816833496, |
| "learning_rate": 2.718946713234185e-06, |
| "loss": 0.404, |
| "step": 1074 |
| }, |
| { |
| "epoch": 0.6618057292368071, |
| "grad_norm": 1.6323987245559692, |
| "learning_rate": 2.7100762894846633e-06, |
| "loss": 0.3856, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.6624213624733065, |
| "grad_norm": 1.6033183336257935, |
| "learning_rate": 2.7012149765977823e-06, |
| "loss": 0.3717, |
| "step": 1076 |
| }, |
| { |
| "epoch": 0.6630369957098059, |
| "grad_norm": 1.7118803262710571, |
| "learning_rate": 2.692362809829825e-06, |
| "loss": 0.3968, |
| "step": 1077 |
| }, |
| { |
| "epoch": 0.6636526289463053, |
| "grad_norm": 1.763137936592102, |
| "learning_rate": 2.683519824400693e-06, |
| "loss": 0.4166, |
| "step": 1078 |
| }, |
| { |
| "epoch": 0.6642682621828045, |
| "grad_norm": 1.8000075817108154, |
| "learning_rate": 2.674686055493748e-06, |
| "loss": 0.4143, |
| "step": 1079 |
| }, |
| { |
| "epoch": 0.6648838954193039, |
| "grad_norm": 1.8939229249954224, |
| "learning_rate": 2.66586153825569e-06, |
| "loss": 0.4026, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.6654995286558033, |
| "grad_norm": 1.8988964557647705, |
| "learning_rate": 2.657046307796407e-06, |
| "loss": 0.4102, |
| "step": 1081 |
| }, |
| { |
| "epoch": 0.6661151618923027, |
| "grad_norm": 1.603047490119934, |
| "learning_rate": 2.648240399188837e-06, |
| "loss": 0.4035, |
| "step": 1082 |
| }, |
| { |
| "epoch": 0.6667307951288021, |
| "grad_norm": 1.6859813928604126, |
| "learning_rate": 2.639443847468831e-06, |
| "loss": 0.4147, |
| "step": 1083 |
| }, |
| { |
| "epoch": 0.6673464283653013, |
| "grad_norm": 1.7707592248916626, |
| "learning_rate": 2.6306566876350072e-06, |
| "loss": 0.3923, |
| "step": 1084 |
| }, |
| { |
| "epoch": 0.6679620616018007, |
| "grad_norm": 1.7900476455688477, |
| "learning_rate": 2.6218789546486235e-06, |
| "loss": 0.4022, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.6685776948383001, |
| "grad_norm": 1.7816258668899536, |
| "learning_rate": 2.6131106834334296e-06, |
| "loss": 0.419, |
| "step": 1086 |
| }, |
| { |
| "epoch": 0.6691933280747995, |
| "grad_norm": 1.775720477104187, |
| "learning_rate": 2.6043519088755263e-06, |
| "loss": 0.415, |
| "step": 1087 |
| }, |
| { |
| "epoch": 0.6698089613112987, |
| "grad_norm": 1.8259687423706055, |
| "learning_rate": 2.5956026658232347e-06, |
| "loss": 0.4257, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.6704245945477981, |
| "grad_norm": 1.7508234977722168, |
| "learning_rate": 2.5868629890869467e-06, |
| "loss": 0.4181, |
| "step": 1089 |
| }, |
| { |
| "epoch": 0.6710402277842975, |
| "grad_norm": 1.6744227409362793, |
| "learning_rate": 2.578132913439e-06, |
| "loss": 0.3982, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.6716558610207969, |
| "grad_norm": 1.828457236289978, |
| "learning_rate": 2.5694124736135315e-06, |
| "loss": 0.4321, |
| "step": 1091 |
| }, |
| { |
| "epoch": 0.6722714942572963, |
| "grad_norm": 1.7128944396972656, |
| "learning_rate": 2.560701704306336e-06, |
| "loss": 0.4008, |
| "step": 1092 |
| }, |
| { |
| "epoch": 0.6728871274937955, |
| "grad_norm": 1.7066082954406738, |
| "learning_rate": 2.55200064017474e-06, |
| "loss": 0.4134, |
| "step": 1093 |
| }, |
| { |
| "epoch": 0.6735027607302949, |
| "grad_norm": 1.647888422012329, |
| "learning_rate": 2.543309315837444e-06, |
| "loss": 0.3792, |
| "step": 1094 |
| }, |
| { |
| "epoch": 0.6741183939667943, |
| "grad_norm": 1.7262251377105713, |
| "learning_rate": 2.5346277658744083e-06, |
| "loss": 0.4137, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.6747340272032937, |
| "grad_norm": 1.782806158065796, |
| "learning_rate": 2.5259560248267022e-06, |
| "loss": 0.4341, |
| "step": 1096 |
| }, |
| { |
| "epoch": 0.6753496604397929, |
| "grad_norm": 1.7016490697860718, |
| "learning_rate": 2.5172941271963626e-06, |
| "loss": 0.4176, |
| "step": 1097 |
| }, |
| { |
| "epoch": 0.6759652936762923, |
| "grad_norm": 1.7586191892623901, |
| "learning_rate": 2.5086421074462707e-06, |
| "loss": 0.4121, |
| "step": 1098 |
| }, |
| { |
| "epoch": 0.6765809269127917, |
| "grad_norm": 1.7193186283111572, |
| "learning_rate": 2.5000000000000015e-06, |
| "loss": 0.4088, |
| "step": 1099 |
| }, |
| { |
| "epoch": 0.6771965601492911, |
| "grad_norm": 1.7770181894302368, |
| "learning_rate": 2.49136783924169e-06, |
| "loss": 0.4464, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6771965601492911, |
| "eval_loss": 0.40752142667770386, |
| "eval_runtime": 118.9248, |
| "eval_samples_per_second": 35.325, |
| "eval_steps_per_second": 4.423, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.6778121933857905, |
| "grad_norm": 1.7708814144134521, |
| "learning_rate": 2.482745659515905e-06, |
| "loss": 0.3943, |
| "step": 1101 |
| }, |
| { |
| "epoch": 0.6784278266222897, |
| "grad_norm": 1.7782667875289917, |
| "learning_rate": 2.4741334951274948e-06, |
| "loss": 0.411, |
| "step": 1102 |
| }, |
| { |
| "epoch": 0.6790434598587891, |
| "grad_norm": 1.8407803773880005, |
| "learning_rate": 2.4655313803414676e-06, |
| "loss": 0.4085, |
| "step": 1103 |
| }, |
| { |
| "epoch": 0.6796590930952885, |
| "grad_norm": 1.6154685020446777, |
| "learning_rate": 2.4569393493828433e-06, |
| "loss": 0.4043, |
| "step": 1104 |
| }, |
| { |
| "epoch": 0.6802747263317879, |
| "grad_norm": 1.7624921798706055, |
| "learning_rate": 2.448357436436519e-06, |
| "loss": 0.4309, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.6808903595682871, |
| "grad_norm": 1.7305024862289429, |
| "learning_rate": 2.4397856756471435e-06, |
| "loss": 0.3809, |
| "step": 1106 |
| }, |
| { |
| "epoch": 0.6815059928047865, |
| "grad_norm": 1.7794702053070068, |
| "learning_rate": 2.4312241011189643e-06, |
| "loss": 0.4138, |
| "step": 1107 |
| }, |
| { |
| "epoch": 0.6821216260412859, |
| "grad_norm": 1.7242809534072876, |
| "learning_rate": 2.4226727469157097e-06, |
| "loss": 0.4045, |
| "step": 1108 |
| }, |
| { |
| "epoch": 0.6827372592777853, |
| "grad_norm": 1.867807149887085, |
| "learning_rate": 2.4141316470604362e-06, |
| "loss": 0.4258, |
| "step": 1109 |
| }, |
| { |
| "epoch": 0.6833528925142847, |
| "grad_norm": 1.7269940376281738, |
| "learning_rate": 2.405600835535411e-06, |
| "loss": 0.4283, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.6839685257507839, |
| "grad_norm": 1.7357679605484009, |
| "learning_rate": 2.3970803462819586e-06, |
| "loss": 0.4161, |
| "step": 1111 |
| }, |
| { |
| "epoch": 0.6845841589872833, |
| "grad_norm": 1.7404741048812866, |
| "learning_rate": 2.388570213200337e-06, |
| "loss": 0.3982, |
| "step": 1112 |
| }, |
| { |
| "epoch": 0.6851997922237827, |
| "grad_norm": 1.899558186531067, |
| "learning_rate": 2.380070470149605e-06, |
| "loss": 0.4103, |
| "step": 1113 |
| }, |
| { |
| "epoch": 0.6858154254602821, |
| "grad_norm": 1.8454153537750244, |
| "learning_rate": 2.371581150947476e-06, |
| "loss": 0.3985, |
| "step": 1114 |
| }, |
| { |
| "epoch": 0.6864310586967813, |
| "grad_norm": 1.6742509603500366, |
| "learning_rate": 2.363102289370198e-06, |
| "loss": 0.3938, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.6870466919332807, |
| "grad_norm": 1.6200331449508667, |
| "learning_rate": 2.354633919152404e-06, |
| "loss": 0.3638, |
| "step": 1116 |
| }, |
| { |
| "epoch": 0.6876623251697801, |
| "grad_norm": 1.6609104871749878, |
| "learning_rate": 2.3461760739869865e-06, |
| "loss": 0.4015, |
| "step": 1117 |
| }, |
| { |
| "epoch": 0.6882779584062795, |
| "grad_norm": 1.658146619796753, |
| "learning_rate": 2.3377287875249694e-06, |
| "loss": 0.3951, |
| "step": 1118 |
| }, |
| { |
| "epoch": 0.6888935916427789, |
| "grad_norm": 1.9421802759170532, |
| "learning_rate": 2.3292920933753566e-06, |
| "loss": 0.4385, |
| "step": 1119 |
| }, |
| { |
| "epoch": 0.6895092248792781, |
| "grad_norm": 1.8629512786865234, |
| "learning_rate": 2.320866025105016e-06, |
| "loss": 0.4117, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.6901248581157775, |
| "grad_norm": 1.7256838083267212, |
| "learning_rate": 2.31245061623854e-06, |
| "loss": 0.4164, |
| "step": 1121 |
| }, |
| { |
| "epoch": 0.6907404913522769, |
| "grad_norm": 2.03247332572937, |
| "learning_rate": 2.3040459002581e-06, |
| "loss": 0.4423, |
| "step": 1122 |
| }, |
| { |
| "epoch": 0.6913561245887763, |
| "grad_norm": 1.765477180480957, |
| "learning_rate": 2.2956519106033366e-06, |
| "loss": 0.4036, |
| "step": 1123 |
| }, |
| { |
| "epoch": 0.6919717578252756, |
| "grad_norm": 1.7879197597503662, |
| "learning_rate": 2.2872686806712037e-06, |
| "loss": 0.4357, |
| "step": 1124 |
| }, |
| { |
| "epoch": 0.6925873910617749, |
| "grad_norm": 1.898209571838379, |
| "learning_rate": 2.278896243815852e-06, |
| "loss": 0.4032, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.6932030242982743, |
| "grad_norm": 1.812808871269226, |
| "learning_rate": 2.2705346333484925e-06, |
| "loss": 0.459, |
| "step": 1126 |
| }, |
| { |
| "epoch": 0.6938186575347737, |
| "grad_norm": 1.8547736406326294, |
| "learning_rate": 2.2621838825372496e-06, |
| "loss": 0.4036, |
| "step": 1127 |
| }, |
| { |
| "epoch": 0.694434290771273, |
| "grad_norm": 1.6805411577224731, |
| "learning_rate": 2.253844024607054e-06, |
| "loss": 0.3908, |
| "step": 1128 |
| }, |
| { |
| "epoch": 0.6950499240077723, |
| "grad_norm": 1.666832685470581, |
| "learning_rate": 2.245515092739488e-06, |
| "loss": 0.4003, |
| "step": 1129 |
| }, |
| { |
| "epoch": 0.6956655572442717, |
| "grad_norm": 1.7172799110412598, |
| "learning_rate": 2.237197120072667e-06, |
| "loss": 0.4229, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.6962811904807711, |
| "grad_norm": 1.6013420820236206, |
| "learning_rate": 2.228890139701106e-06, |
| "loss": 0.3989, |
| "step": 1131 |
| }, |
| { |
| "epoch": 0.6968968237172705, |
| "grad_norm": 1.7002962827682495, |
| "learning_rate": 2.2205941846755787e-06, |
| "loss": 0.3897, |
| "step": 1132 |
| }, |
| { |
| "epoch": 0.6975124569537698, |
| "grad_norm": 1.7835053205490112, |
| "learning_rate": 2.2123092880029928e-06, |
| "loss": 0.3967, |
| "step": 1133 |
| }, |
| { |
| "epoch": 0.6981280901902691, |
| "grad_norm": 1.798115849494934, |
| "learning_rate": 2.204035482646267e-06, |
| "loss": 0.4067, |
| "step": 1134 |
| }, |
| { |
| "epoch": 0.6987437234267685, |
| "grad_norm": 1.706468939781189, |
| "learning_rate": 2.1957728015241793e-06, |
| "loss": 0.4123, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.6993593566632679, |
| "grad_norm": 1.8424924612045288, |
| "learning_rate": 2.187521277511259e-06, |
| "loss": 0.4145, |
| "step": 1136 |
| }, |
| { |
| "epoch": 0.6999749898997673, |
| "grad_norm": 1.9657038450241089, |
| "learning_rate": 2.1792809434376366e-06, |
| "loss": 0.4245, |
| "step": 1137 |
| }, |
| { |
| "epoch": 0.7005906231362665, |
| "grad_norm": 1.906627893447876, |
| "learning_rate": 2.171051832088928e-06, |
| "loss": 0.4311, |
| "step": 1138 |
| }, |
| { |
| "epoch": 0.7012062563727659, |
| "grad_norm": 2.0402626991271973, |
| "learning_rate": 2.162833976206092e-06, |
| "loss": 0.3795, |
| "step": 1139 |
| }, |
| { |
| "epoch": 0.7018218896092653, |
| "grad_norm": 1.7367284297943115, |
| "learning_rate": 2.1546274084853062e-06, |
| "loss": 0.4367, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7024375228457647, |
| "grad_norm": 1.8020879030227661, |
| "learning_rate": 2.146432161577842e-06, |
| "loss": 0.4153, |
| "step": 1141 |
| }, |
| { |
| "epoch": 0.703053156082264, |
| "grad_norm": 1.6975538730621338, |
| "learning_rate": 2.1382482680899213e-06, |
| "loss": 0.4141, |
| "step": 1142 |
| }, |
| { |
| "epoch": 0.7036687893187633, |
| "grad_norm": 1.5046709775924683, |
| "learning_rate": 2.130075760582602e-06, |
| "loss": 0.3734, |
| "step": 1143 |
| }, |
| { |
| "epoch": 0.7042844225552627, |
| "grad_norm": 1.6403920650482178, |
| "learning_rate": 2.1219146715716332e-06, |
| "loss": 0.3891, |
| "step": 1144 |
| }, |
| { |
| "epoch": 0.7049000557917621, |
| "grad_norm": 1.7096176147460938, |
| "learning_rate": 2.113765033527338e-06, |
| "loss": 0.4007, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.7055156890282615, |
| "grad_norm": 1.775561809539795, |
| "learning_rate": 2.1056268788744803e-06, |
| "loss": 0.3982, |
| "step": 1146 |
| }, |
| { |
| "epoch": 0.7061313222647607, |
| "grad_norm": 1.9706265926361084, |
| "learning_rate": 2.097500239992132e-06, |
| "loss": 0.4374, |
| "step": 1147 |
| }, |
| { |
| "epoch": 0.7067469555012601, |
| "grad_norm": 1.670867681503296, |
| "learning_rate": 2.0893851492135536e-06, |
| "loss": 0.3741, |
| "step": 1148 |
| }, |
| { |
| "epoch": 0.7073625887377595, |
| "grad_norm": 2.0540754795074463, |
| "learning_rate": 2.081281638826052e-06, |
| "loss": 0.4138, |
| "step": 1149 |
| }, |
| { |
| "epoch": 0.7079782219742589, |
| "grad_norm": 1.6522064208984375, |
| "learning_rate": 2.0731897410708618e-06, |
| "loss": 0.4081, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7085938552107582, |
| "grad_norm": 1.7104498147964478, |
| "learning_rate": 2.0651094881430194e-06, |
| "loss": 0.3942, |
| "step": 1151 |
| }, |
| { |
| "epoch": 0.7092094884472575, |
| "grad_norm": 1.8993293046951294, |
| "learning_rate": 2.0570409121912233e-06, |
| "loss": 0.3972, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.7098251216837569, |
| "grad_norm": 1.670290470123291, |
| "learning_rate": 2.0489840453177198e-06, |
| "loss": 0.3816, |
| "step": 1153 |
| }, |
| { |
| "epoch": 0.7104407549202563, |
| "grad_norm": 1.8099826574325562, |
| "learning_rate": 2.0409389195781627e-06, |
| "loss": 0.375, |
| "step": 1154 |
| }, |
| { |
| "epoch": 0.7110563881567556, |
| "grad_norm": 1.8723803758621216, |
| "learning_rate": 2.0329055669814936e-06, |
| "loss": 0.4192, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.7116720213932549, |
| "grad_norm": 1.5666253566741943, |
| "learning_rate": 2.0248840194898155e-06, |
| "loss": 0.3741, |
| "step": 1156 |
| }, |
| { |
| "epoch": 0.7122876546297543, |
| "grad_norm": 1.806522011756897, |
| "learning_rate": 2.0168743090182574e-06, |
| "loss": 0.3924, |
| "step": 1157 |
| }, |
| { |
| "epoch": 0.7129032878662537, |
| "grad_norm": 1.791751742362976, |
| "learning_rate": 2.0088764674348593e-06, |
| "loss": 0.4237, |
| "step": 1158 |
| }, |
| { |
| "epoch": 0.7135189211027531, |
| "grad_norm": 1.632794737815857, |
| "learning_rate": 2.0008905265604316e-06, |
| "loss": 0.4081, |
| "step": 1159 |
| }, |
| { |
| "epoch": 0.7141345543392524, |
| "grad_norm": 1.7562607526779175, |
| "learning_rate": 1.992916518168442e-06, |
| "loss": 0.3939, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7147501875757517, |
| "grad_norm": 1.7803467512130737, |
| "learning_rate": 1.9849544739848782e-06, |
| "loss": 0.4032, |
| "step": 1161 |
| }, |
| { |
| "epoch": 0.7153658208122511, |
| "grad_norm": 1.7394850254058838, |
| "learning_rate": 1.977004425688126e-06, |
| "loss": 0.4083, |
| "step": 1162 |
| }, |
| { |
| "epoch": 0.7159814540487505, |
| "grad_norm": 1.5431199073791504, |
| "learning_rate": 1.9690664049088494e-06, |
| "loss": 0.366, |
| "step": 1163 |
| }, |
| { |
| "epoch": 0.7165970872852498, |
| "grad_norm": 1.7097880840301514, |
| "learning_rate": 1.9611404432298505e-06, |
| "loss": 0.4058, |
| "step": 1164 |
| }, |
| { |
| "epoch": 0.7172127205217491, |
| "grad_norm": 1.768742561340332, |
| "learning_rate": 1.95322657218596e-06, |
| "loss": 0.4349, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.7178283537582485, |
| "grad_norm": 1.8069496154785156, |
| "learning_rate": 1.945324823263899e-06, |
| "loss": 0.4023, |
| "step": 1166 |
| }, |
| { |
| "epoch": 0.7184439869947479, |
| "grad_norm": 1.7423423528671265, |
| "learning_rate": 1.937435227902157e-06, |
| "loss": 0.3918, |
| "step": 1167 |
| }, |
| { |
| "epoch": 0.7190596202312473, |
| "grad_norm": 1.7018494606018066, |
| "learning_rate": 1.929557817490874e-06, |
| "loss": 0.4077, |
| "step": 1168 |
| }, |
| { |
| "epoch": 0.7196752534677466, |
| "grad_norm": 1.7701889276504517, |
| "learning_rate": 1.9216926233717087e-06, |
| "loss": 0.4062, |
| "step": 1169 |
| }, |
| { |
| "epoch": 0.7202908867042459, |
| "grad_norm": 1.7747608423233032, |
| "learning_rate": 1.9138396768377106e-06, |
| "loss": 0.4047, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.7209065199407453, |
| "grad_norm": 1.7562189102172852, |
| "learning_rate": 1.9059990091332082e-06, |
| "loss": 0.3998, |
| "step": 1171 |
| }, |
| { |
| "epoch": 0.7215221531772447, |
| "grad_norm": 1.805248498916626, |
| "learning_rate": 1.8981706514536641e-06, |
| "loss": 0.4238, |
| "step": 1172 |
| }, |
| { |
| "epoch": 0.722137786413744, |
| "grad_norm": 1.671569585800171, |
| "learning_rate": 1.8903546349455748e-06, |
| "loss": 0.3735, |
| "step": 1173 |
| }, |
| { |
| "epoch": 0.7227534196502434, |
| "grad_norm": 1.866377830505371, |
| "learning_rate": 1.8825509907063328e-06, |
| "loss": 0.4411, |
| "step": 1174 |
| }, |
| { |
| "epoch": 0.7233690528867427, |
| "grad_norm": 1.6782642602920532, |
| "learning_rate": 1.8747597497841003e-06, |
| "loss": 0.3815, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.7239846861232421, |
| "grad_norm": 1.7450333833694458, |
| "learning_rate": 1.8669809431776991e-06, |
| "loss": 0.3848, |
| "step": 1176 |
| }, |
| { |
| "epoch": 0.7246003193597415, |
| "grad_norm": 1.8355008363723755, |
| "learning_rate": 1.8592146018364682e-06, |
| "loss": 0.4183, |
| "step": 1177 |
| }, |
| { |
| "epoch": 0.7252159525962408, |
| "grad_norm": 1.8200833797454834, |
| "learning_rate": 1.851460756660159e-06, |
| "loss": 0.3915, |
| "step": 1178 |
| }, |
| { |
| "epoch": 0.7258315858327401, |
| "grad_norm": 1.7371735572814941, |
| "learning_rate": 1.843719438498806e-06, |
| "loss": 0.4138, |
| "step": 1179 |
| }, |
| { |
| "epoch": 0.7264472190692395, |
| "grad_norm": 1.702866554260254, |
| "learning_rate": 1.8359906781525955e-06, |
| "loss": 0.3959, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.7270628523057389, |
| "grad_norm": 1.6576327085494995, |
| "learning_rate": 1.8282745063717577e-06, |
| "loss": 0.3961, |
| "step": 1181 |
| }, |
| { |
| "epoch": 0.7276784855422382, |
| "grad_norm": 1.7897869348526, |
| "learning_rate": 1.8205709538564326e-06, |
| "loss": 0.4128, |
| "step": 1182 |
| }, |
| { |
| "epoch": 0.7282941187787376, |
| "grad_norm": 1.6971567869186401, |
| "learning_rate": 1.8128800512565514e-06, |
| "loss": 0.391, |
| "step": 1183 |
| }, |
| { |
| "epoch": 0.7289097520152369, |
| "grad_norm": 1.7428371906280518, |
| "learning_rate": 1.8052018291717216e-06, |
| "loss": 0.3992, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.7295253852517363, |
| "grad_norm": 1.6395870447158813, |
| "learning_rate": 1.7975363181510901e-06, |
| "loss": 0.384, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.7301410184882356, |
| "grad_norm": 1.7006916999816895, |
| "learning_rate": 1.7898835486932398e-06, |
| "loss": 0.4237, |
| "step": 1186 |
| }, |
| { |
| "epoch": 0.730756651724735, |
| "grad_norm": 1.6358023881912231, |
| "learning_rate": 1.7822435512460512e-06, |
| "loss": 0.372, |
| "step": 1187 |
| }, |
| { |
| "epoch": 0.7313722849612343, |
| "grad_norm": 1.7687275409698486, |
| "learning_rate": 1.7746163562065955e-06, |
| "loss": 0.4198, |
| "step": 1188 |
| }, |
| { |
| "epoch": 0.7319879181977337, |
| "grad_norm": 1.8274449110031128, |
| "learning_rate": 1.7670019939210025e-06, |
| "loss": 0.4086, |
| "step": 1189 |
| }, |
| { |
| "epoch": 0.7326035514342331, |
| "grad_norm": 1.8345245122909546, |
| "learning_rate": 1.7594004946843458e-06, |
| "loss": 0.3979, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.7332191846707324, |
| "grad_norm": 1.7566828727722168, |
| "learning_rate": 1.7518118887405239e-06, |
| "loss": 0.4244, |
| "step": 1191 |
| }, |
| { |
| "epoch": 0.7338348179072318, |
| "grad_norm": 1.7326489686965942, |
| "learning_rate": 1.7442362062821323e-06, |
| "loss": 0.4081, |
| "step": 1192 |
| }, |
| { |
| "epoch": 0.7344504511437311, |
| "grad_norm": 1.7257455587387085, |
| "learning_rate": 1.7366734774503541e-06, |
| "loss": 0.3989, |
| "step": 1193 |
| }, |
| { |
| "epoch": 0.7350660843802305, |
| "grad_norm": 1.5269023180007935, |
| "learning_rate": 1.7291237323348287e-06, |
| "loss": 0.3905, |
| "step": 1194 |
| }, |
| { |
| "epoch": 0.7356817176167298, |
| "grad_norm": 1.63347589969635, |
| "learning_rate": 1.7215870009735386e-06, |
| "loss": 0.3853, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.7362973508532292, |
| "grad_norm": 1.6561665534973145, |
| "learning_rate": 1.714063313352693e-06, |
| "loss": 0.3871, |
| "step": 1196 |
| }, |
| { |
| "epoch": 0.7369129840897285, |
| "grad_norm": 1.7448550462722778, |
| "learning_rate": 1.7065526994065973e-06, |
| "loss": 0.4134, |
| "step": 1197 |
| }, |
| { |
| "epoch": 0.7375286173262279, |
| "grad_norm": 1.7241095304489136, |
| "learning_rate": 1.6990551890175488e-06, |
| "loss": 0.3957, |
| "step": 1198 |
| }, |
| { |
| "epoch": 0.7381442505627273, |
| "grad_norm": 1.7178313732147217, |
| "learning_rate": 1.6915708120157042e-06, |
| "loss": 0.4008, |
| "step": 1199 |
| }, |
| { |
| "epoch": 0.7387598837992266, |
| "grad_norm": 1.663047432899475, |
| "learning_rate": 1.684099598178967e-06, |
| "loss": 0.3922, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7387598837992266, |
| "eval_loss": 0.39609086513519287, |
| "eval_runtime": 118.1732, |
| "eval_samples_per_second": 35.55, |
| "eval_steps_per_second": 4.451, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.739375517035726, |
| "grad_norm": 1.841245412826538, |
| "learning_rate": 1.6766415772328732e-06, |
| "loss": 0.4101, |
| "step": 1201 |
| }, |
| { |
| "epoch": 0.7399911502722253, |
| "grad_norm": 1.8247209787368774, |
| "learning_rate": 1.669196778850462e-06, |
| "loss": 0.3869, |
| "step": 1202 |
| }, |
| { |
| "epoch": 0.7406067835087247, |
| "grad_norm": 1.6148180961608887, |
| "learning_rate": 1.6617652326521705e-06, |
| "loss": 0.3857, |
| "step": 1203 |
| }, |
| { |
| "epoch": 0.741222416745224, |
| "grad_norm": 1.8191235065460205, |
| "learning_rate": 1.6543469682057105e-06, |
| "loss": 0.4058, |
| "step": 1204 |
| }, |
| { |
| "epoch": 0.7418380499817234, |
| "grad_norm": 1.7104837894439697, |
| "learning_rate": 1.6469420150259396e-06, |
| "loss": 0.3987, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.7424536832182227, |
| "grad_norm": 1.7302885055541992, |
| "learning_rate": 1.639550402574766e-06, |
| "loss": 0.3967, |
| "step": 1206 |
| }, |
| { |
| "epoch": 0.7430693164547221, |
| "grad_norm": 1.7323321104049683, |
| "learning_rate": 1.632172160261012e-06, |
| "loss": 0.4132, |
| "step": 1207 |
| }, |
| { |
| "epoch": 0.7436849496912215, |
| "grad_norm": 1.627414345741272, |
| "learning_rate": 1.6248073174403083e-06, |
| "loss": 0.4317, |
| "step": 1208 |
| }, |
| { |
| "epoch": 0.7443005829277208, |
| "grad_norm": 1.7372418642044067, |
| "learning_rate": 1.617455903414974e-06, |
| "loss": 0.4288, |
| "step": 1209 |
| }, |
| { |
| "epoch": 0.7449162161642202, |
| "grad_norm": 1.571689248085022, |
| "learning_rate": 1.610117947433897e-06, |
| "loss": 0.3923, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.7455318494007195, |
| "grad_norm": 1.6416131258010864, |
| "learning_rate": 1.6027934786924187e-06, |
| "loss": 0.3731, |
| "step": 1211 |
| }, |
| { |
| "epoch": 0.7461474826372189, |
| "grad_norm": 1.8294082880020142, |
| "learning_rate": 1.5954825263322215e-06, |
| "loss": 0.416, |
| "step": 1212 |
| }, |
| { |
| "epoch": 0.7467631158737182, |
| "grad_norm": 1.5999701023101807, |
| "learning_rate": 1.5881851194412106e-06, |
| "loss": 0.3864, |
| "step": 1213 |
| }, |
| { |
| "epoch": 0.7473787491102176, |
| "grad_norm": 1.582100749015808, |
| "learning_rate": 1.5809012870533996e-06, |
| "loss": 0.4004, |
| "step": 1214 |
| }, |
| { |
| "epoch": 0.7479943823467169, |
| "grad_norm": 1.7439690828323364, |
| "learning_rate": 1.57363105814879e-06, |
| "loss": 0.3662, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.7486100155832163, |
| "grad_norm": 1.7250139713287354, |
| "learning_rate": 1.5663744616532612e-06, |
| "loss": 0.3711, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.7492256488197157, |
| "grad_norm": 1.5566151142120361, |
| "learning_rate": 1.559131526438452e-06, |
| "loss": 0.3768, |
| "step": 1217 |
| }, |
| { |
| "epoch": 0.749841282056215, |
| "grad_norm": 1.6487715244293213, |
| "learning_rate": 1.551902281321651e-06, |
| "loss": 0.4079, |
| "step": 1218 |
| }, |
| { |
| "epoch": 0.7504569152927144, |
| "grad_norm": 1.6890857219696045, |
| "learning_rate": 1.544686755065677e-06, |
| "loss": 0.4214, |
| "step": 1219 |
| }, |
| { |
| "epoch": 0.7510725485292137, |
| "grad_norm": 1.7031358480453491, |
| "learning_rate": 1.537484976378763e-06, |
| "loss": 0.3975, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.7516881817657131, |
| "grad_norm": 1.5813488960266113, |
| "learning_rate": 1.5302969739144497e-06, |
| "loss": 0.3636, |
| "step": 1221 |
| }, |
| { |
| "epoch": 0.7523038150022124, |
| "grad_norm": 1.6897863149642944, |
| "learning_rate": 1.523122776271463e-06, |
| "loss": 0.4046, |
| "step": 1222 |
| }, |
| { |
| "epoch": 0.7529194482387118, |
| "grad_norm": 1.6395469903945923, |
| "learning_rate": 1.5159624119936028e-06, |
| "loss": 0.401, |
| "step": 1223 |
| }, |
| { |
| "epoch": 0.7535350814752111, |
| "grad_norm": 1.9050360918045044, |
| "learning_rate": 1.5088159095696365e-06, |
| "loss": 0.4201, |
| "step": 1224 |
| }, |
| { |
| "epoch": 0.7541507147117105, |
| "grad_norm": 1.8043394088745117, |
| "learning_rate": 1.5016832974331725e-06, |
| "loss": 0.3948, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.7547663479482098, |
| "grad_norm": 1.6688209772109985, |
| "learning_rate": 1.4945646039625611e-06, |
| "loss": 0.3852, |
| "step": 1226 |
| }, |
| { |
| "epoch": 0.7553819811847092, |
| "grad_norm": 1.732325553894043, |
| "learning_rate": 1.4874598574807697e-06, |
| "loss": 0.3934, |
| "step": 1227 |
| }, |
| { |
| "epoch": 0.7559976144212086, |
| "grad_norm": 1.6879240274429321, |
| "learning_rate": 1.4803690862552755e-06, |
| "loss": 0.4096, |
| "step": 1228 |
| }, |
| { |
| "epoch": 0.7566132476577079, |
| "grad_norm": 1.6550501585006714, |
| "learning_rate": 1.4732923184979563e-06, |
| "loss": 0.3892, |
| "step": 1229 |
| }, |
| { |
| "epoch": 0.7572288808942073, |
| "grad_norm": 1.8862385749816895, |
| "learning_rate": 1.4662295823649702e-06, |
| "loss": 0.4022, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.7578445141307066, |
| "grad_norm": 2.008464813232422, |
| "learning_rate": 1.459180905956653e-06, |
| "loss": 0.4218, |
| "step": 1231 |
| }, |
| { |
| "epoch": 0.758460147367206, |
| "grad_norm": 1.6959370374679565, |
| "learning_rate": 1.4521463173173966e-06, |
| "loss": 0.3896, |
| "step": 1232 |
| }, |
| { |
| "epoch": 0.7590757806037054, |
| "grad_norm": 1.7890487909317017, |
| "learning_rate": 1.4451258444355432e-06, |
| "loss": 0.3469, |
| "step": 1233 |
| }, |
| { |
| "epoch": 0.7596914138402047, |
| "grad_norm": 1.7809585332870483, |
| "learning_rate": 1.438119515243277e-06, |
| "loss": 0.3953, |
| "step": 1234 |
| }, |
| { |
| "epoch": 0.760307047076704, |
| "grad_norm": 1.6403446197509766, |
| "learning_rate": 1.431127357616503e-06, |
| "loss": 0.3852, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.7609226803132034, |
| "grad_norm": 1.7148042917251587, |
| "learning_rate": 1.424149399374748e-06, |
| "loss": 0.3885, |
| "step": 1236 |
| }, |
| { |
| "epoch": 0.7615383135497028, |
| "grad_norm": 1.762613296508789, |
| "learning_rate": 1.4171856682810386e-06, |
| "loss": 0.4068, |
| "step": 1237 |
| }, |
| { |
| "epoch": 0.7621539467862021, |
| "grad_norm": 1.8621405363082886, |
| "learning_rate": 1.4102361920418022e-06, |
| "loss": 0.3884, |
| "step": 1238 |
| }, |
| { |
| "epoch": 0.7627695800227015, |
| "grad_norm": 1.9017481803894043, |
| "learning_rate": 1.4033009983067454e-06, |
| "loss": 0.3999, |
| "step": 1239 |
| }, |
| { |
| "epoch": 0.7633852132592008, |
| "grad_norm": 1.7529352903366089, |
| "learning_rate": 1.39638011466875e-06, |
| "loss": 0.3954, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.7640008464957002, |
| "grad_norm": 1.744629144668579, |
| "learning_rate": 1.3894735686637672e-06, |
| "loss": 0.3868, |
| "step": 1241 |
| }, |
| { |
| "epoch": 0.7646164797321996, |
| "grad_norm": 1.6554515361785889, |
| "learning_rate": 1.3825813877706973e-06, |
| "loss": 0.3851, |
| "step": 1242 |
| }, |
| { |
| "epoch": 0.7652321129686989, |
| "grad_norm": 1.637499451637268, |
| "learning_rate": 1.3757035994112915e-06, |
| "loss": 0.4003, |
| "step": 1243 |
| }, |
| { |
| "epoch": 0.7658477462051982, |
| "grad_norm": 1.6990104913711548, |
| "learning_rate": 1.3688402309500353e-06, |
| "loss": 0.3871, |
| "step": 1244 |
| }, |
| { |
| "epoch": 0.7664633794416976, |
| "grad_norm": 1.6374468803405762, |
| "learning_rate": 1.3619913096940408e-06, |
| "loss": 0.3956, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.767079012678197, |
| "grad_norm": 1.8268284797668457, |
| "learning_rate": 1.3551568628929434e-06, |
| "loss": 0.4302, |
| "step": 1246 |
| }, |
| { |
| "epoch": 0.7676946459146963, |
| "grad_norm": 1.6786978244781494, |
| "learning_rate": 1.3483369177387845e-06, |
| "loss": 0.3645, |
| "step": 1247 |
| }, |
| { |
| "epoch": 0.7683102791511957, |
| "grad_norm": 1.6917263269424438, |
| "learning_rate": 1.341531501365912e-06, |
| "loss": 0.4048, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.768925912387695, |
| "grad_norm": 1.892749547958374, |
| "learning_rate": 1.3347406408508695e-06, |
| "loss": 0.4066, |
| "step": 1249 |
| }, |
| { |
| "epoch": 0.7695415456241944, |
| "grad_norm": 1.8319532871246338, |
| "learning_rate": 1.3279643632122807e-06, |
| "loss": 0.4067, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.7701571788606938, |
| "grad_norm": 1.621427297592163, |
| "learning_rate": 1.3212026954107564e-06, |
| "loss": 0.3982, |
| "step": 1251 |
| }, |
| { |
| "epoch": 0.7707728120971931, |
| "grad_norm": 1.7341312170028687, |
| "learning_rate": 1.3144556643487743e-06, |
| "loss": 0.4222, |
| "step": 1252 |
| }, |
| { |
| "epoch": 0.7713884453336924, |
| "grad_norm": 1.6124165058135986, |
| "learning_rate": 1.3077232968705805e-06, |
| "loss": 0.371, |
| "step": 1253 |
| }, |
| { |
| "epoch": 0.7720040785701918, |
| "grad_norm": 1.6049258708953857, |
| "learning_rate": 1.3010056197620813e-06, |
| "loss": 0.3728, |
| "step": 1254 |
| }, |
| { |
| "epoch": 0.7726197118066912, |
| "grad_norm": 1.7597967386245728, |
| "learning_rate": 1.2943026597507268e-06, |
| "loss": 0.4174, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.7732353450431905, |
| "grad_norm": 1.5948874950408936, |
| "learning_rate": 1.2876144435054194e-06, |
| "loss": 0.3633, |
| "step": 1256 |
| }, |
| { |
| "epoch": 0.7738509782796898, |
| "grad_norm": 1.8157576322555542, |
| "learning_rate": 1.2809409976364017e-06, |
| "loss": 0.4034, |
| "step": 1257 |
| }, |
| { |
| "epoch": 0.7744666115161892, |
| "grad_norm": 1.7797636985778809, |
| "learning_rate": 1.2742823486951434e-06, |
| "loss": 0.3928, |
| "step": 1258 |
| }, |
| { |
| "epoch": 0.7750822447526886, |
| "grad_norm": 1.8237395286560059, |
| "learning_rate": 1.2676385231742493e-06, |
| "loss": 0.4087, |
| "step": 1259 |
| }, |
| { |
| "epoch": 0.775697877989188, |
| "grad_norm": 1.768092393875122, |
| "learning_rate": 1.2610095475073415e-06, |
| "loss": 0.3719, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.7763135112256873, |
| "grad_norm": 1.7261672019958496, |
| "learning_rate": 1.254395448068959e-06, |
| "loss": 0.3819, |
| "step": 1261 |
| }, |
| { |
| "epoch": 0.7769291444621866, |
| "grad_norm": 1.8468579053878784, |
| "learning_rate": 1.247796251174459e-06, |
| "loss": 0.4232, |
| "step": 1262 |
| }, |
| { |
| "epoch": 0.777544777698686, |
| "grad_norm": 1.6958353519439697, |
| "learning_rate": 1.2412119830798992e-06, |
| "loss": 0.3827, |
| "step": 1263 |
| }, |
| { |
| "epoch": 0.7781604109351854, |
| "grad_norm": 1.7623149156570435, |
| "learning_rate": 1.234642669981946e-06, |
| "loss": 0.4027, |
| "step": 1264 |
| }, |
| { |
| "epoch": 0.7787760441716847, |
| "grad_norm": 1.7944426536560059, |
| "learning_rate": 1.2280883380177593e-06, |
| "loss": 0.3931, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.779391677408184, |
| "grad_norm": 1.720826268196106, |
| "learning_rate": 1.2215490132649016e-06, |
| "loss": 0.4437, |
| "step": 1266 |
| }, |
| { |
| "epoch": 0.7800073106446834, |
| "grad_norm": 1.7686612606048584, |
| "learning_rate": 1.2150247217412186e-06, |
| "loss": 0.381, |
| "step": 1267 |
| }, |
| { |
| "epoch": 0.7806229438811828, |
| "grad_norm": 1.734942078590393, |
| "learning_rate": 1.2085154894047468e-06, |
| "loss": 0.413, |
| "step": 1268 |
| }, |
| { |
| "epoch": 0.7812385771176822, |
| "grad_norm": 1.618408203125, |
| "learning_rate": 1.2020213421536103e-06, |
| "loss": 0.3706, |
| "step": 1269 |
| }, |
| { |
| "epoch": 0.7818542103541815, |
| "grad_norm": 1.7088454961776733, |
| "learning_rate": 1.195542305825908e-06, |
| "loss": 0.3694, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.7824698435906808, |
| "grad_norm": 1.6714153289794922, |
| "learning_rate": 1.189078406199624e-06, |
| "loss": 0.4032, |
| "step": 1271 |
| }, |
| { |
| "epoch": 0.7830854768271802, |
| "grad_norm": 1.786453127861023, |
| "learning_rate": 1.1826296689925142e-06, |
| "loss": 0.3906, |
| "step": 1272 |
| }, |
| { |
| "epoch": 0.7837011100636796, |
| "grad_norm": 1.66269850730896, |
| "learning_rate": 1.1761961198620081e-06, |
| "loss": 0.3647, |
| "step": 1273 |
| }, |
| { |
| "epoch": 0.7843167433001789, |
| "grad_norm": 1.7173188924789429, |
| "learning_rate": 1.1697777844051105e-06, |
| "loss": 0.3923, |
| "step": 1274 |
| }, |
| { |
| "epoch": 0.7849323765366782, |
| "grad_norm": 1.6236497163772583, |
| "learning_rate": 1.1633746881582902e-06, |
| "loss": 0.3736, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.7855480097731776, |
| "grad_norm": 1.8804194927215576, |
| "learning_rate": 1.1569868565973912e-06, |
| "loss": 0.4031, |
| "step": 1276 |
| }, |
| { |
| "epoch": 0.786163643009677, |
| "grad_norm": 1.5298935174942017, |
| "learning_rate": 1.1506143151375177e-06, |
| "loss": 0.3728, |
| "step": 1277 |
| }, |
| { |
| "epoch": 0.7867792762461764, |
| "grad_norm": 1.8781102895736694, |
| "learning_rate": 1.144257089132942e-06, |
| "loss": 0.393, |
| "step": 1278 |
| }, |
| { |
| "epoch": 0.7873949094826757, |
| "grad_norm": 1.7233707904815674, |
| "learning_rate": 1.137915203877003e-06, |
| "loss": 0.3832, |
| "step": 1279 |
| }, |
| { |
| "epoch": 0.788010542719175, |
| "grad_norm": 1.775540828704834, |
| "learning_rate": 1.1315886846020008e-06, |
| "loss": 0.3932, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.7886261759556744, |
| "grad_norm": 1.8409913778305054, |
| "learning_rate": 1.1252775564791023e-06, |
| "loss": 0.4121, |
| "step": 1281 |
| }, |
| { |
| "epoch": 0.7892418091921738, |
| "grad_norm": 1.746534824371338, |
| "learning_rate": 1.118981844618236e-06, |
| "loss": 0.387, |
| "step": 1282 |
| }, |
| { |
| "epoch": 0.7898574424286732, |
| "grad_norm": 1.7044163942337036, |
| "learning_rate": 1.1127015740679925e-06, |
| "loss": 0.3775, |
| "step": 1283 |
| }, |
| { |
| "epoch": 0.7904730756651724, |
| "grad_norm": 1.6011128425598145, |
| "learning_rate": 1.1064367698155303e-06, |
| "loss": 0.3737, |
| "step": 1284 |
| }, |
| { |
| "epoch": 0.7910887089016718, |
| "grad_norm": 1.5867910385131836, |
| "learning_rate": 1.1001874567864696e-06, |
| "loss": 0.3783, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.7917043421381712, |
| "grad_norm": 1.633167028427124, |
| "learning_rate": 1.0939536598447986e-06, |
| "loss": 0.3673, |
| "step": 1286 |
| }, |
| { |
| "epoch": 0.7923199753746706, |
| "grad_norm": 1.6005995273590088, |
| "learning_rate": 1.087735403792768e-06, |
| "loss": 0.3728, |
| "step": 1287 |
| }, |
| { |
| "epoch": 0.7929356086111699, |
| "grad_norm": 1.7536711692810059, |
| "learning_rate": 1.0815327133708015e-06, |
| "loss": 0.3928, |
| "step": 1288 |
| }, |
| { |
| "epoch": 0.7935512418476692, |
| "grad_norm": 1.7381296157836914, |
| "learning_rate": 1.0753456132573886e-06, |
| "loss": 0.3931, |
| "step": 1289 |
| }, |
| { |
| "epoch": 0.7941668750841686, |
| "grad_norm": 1.7327688932418823, |
| "learning_rate": 1.0691741280689894e-06, |
| "loss": 0.3898, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.794782508320668, |
| "grad_norm": 1.7023015022277832, |
| "learning_rate": 1.06301828235994e-06, |
| "loss": 0.4062, |
| "step": 1291 |
| }, |
| { |
| "epoch": 0.7953981415571674, |
| "grad_norm": 1.8886052370071411, |
| "learning_rate": 1.0568781006223528e-06, |
| "loss": 0.4037, |
| "step": 1292 |
| }, |
| { |
| "epoch": 0.7960137747936666, |
| "grad_norm": 1.603112816810608, |
| "learning_rate": 1.0507536072860141e-06, |
| "loss": 0.3525, |
| "step": 1293 |
| }, |
| { |
| "epoch": 0.796629408030166, |
| "grad_norm": 1.7494615316390991, |
| "learning_rate": 1.044644826718295e-06, |
| "loss": 0.4175, |
| "step": 1294 |
| }, |
| { |
| "epoch": 0.7972450412666654, |
| "grad_norm": 1.6311700344085693, |
| "learning_rate": 1.0385517832240472e-06, |
| "loss": 0.3842, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.7978606745031648, |
| "grad_norm": 1.7615406513214111, |
| "learning_rate": 1.0324745010455124e-06, |
| "loss": 0.3754, |
| "step": 1296 |
| }, |
| { |
| "epoch": 0.798476307739664, |
| "grad_norm": 1.6909494400024414, |
| "learning_rate": 1.0264130043622245e-06, |
| "loss": 0.3909, |
| "step": 1297 |
| }, |
| { |
| "epoch": 0.7990919409761634, |
| "grad_norm": 1.6405009031295776, |
| "learning_rate": 1.0203673172909068e-06, |
| "loss": 0.3877, |
| "step": 1298 |
| }, |
| { |
| "epoch": 0.7997075742126628, |
| "grad_norm": 1.6983451843261719, |
| "learning_rate": 1.0143374638853892e-06, |
| "loss": 0.3881, |
| "step": 1299 |
| }, |
| { |
| "epoch": 0.8003232074491622, |
| "grad_norm": 1.7770990133285522, |
| "learning_rate": 1.0083234681364934e-06, |
| "loss": 0.4142, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8003232074491622, |
| "eval_loss": 0.3866761028766632, |
| "eval_runtime": 118.3992, |
| "eval_samples_per_second": 35.482, |
| "eval_steps_per_second": 4.443, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8009388406856616, |
| "grad_norm": 1.518217921257019, |
| "learning_rate": 1.002325353971958e-06, |
| "loss": 0.3526, |
| "step": 1301 |
| }, |
| { |
| "epoch": 0.8015544739221608, |
| "grad_norm": 1.729003667831421, |
| "learning_rate": 9.963431452563331e-07, |
| "loss": 0.4206, |
| "step": 1302 |
| }, |
| { |
| "epoch": 0.8021701071586602, |
| "grad_norm": 1.719579815864563, |
| "learning_rate": 9.903768657908803e-07, |
| "loss": 0.3843, |
| "step": 1303 |
| }, |
| { |
| "epoch": 0.8027857403951596, |
| "grad_norm": 1.7568597793579102, |
| "learning_rate": 9.844265393134927e-07, |
| "loss": 0.3944, |
| "step": 1304 |
| }, |
| { |
| "epoch": 0.803401373631659, |
| "grad_norm": 1.6757253408432007, |
| "learning_rate": 9.784921894985799e-07, |
| "loss": 0.3816, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.8040170068681582, |
| "grad_norm": 1.6806457042694092, |
| "learning_rate": 9.725738399569968e-07, |
| "loss": 0.3851, |
| "step": 1306 |
| }, |
| { |
| "epoch": 0.8046326401046576, |
| "grad_norm": 1.6344943046569824, |
| "learning_rate": 9.666715142359334e-07, |
| "loss": 0.371, |
| "step": 1307 |
| }, |
| { |
| "epoch": 0.805248273341157, |
| "grad_norm": 1.7062585353851318, |
| "learning_rate": 9.607852358188247e-07, |
| "loss": 0.4154, |
| "step": 1308 |
| }, |
| { |
| "epoch": 0.8058639065776564, |
| "grad_norm": 1.660595417022705, |
| "learning_rate": 9.549150281252633e-07, |
| "loss": 0.388, |
| "step": 1309 |
| }, |
| { |
| "epoch": 0.8064795398141558, |
| "grad_norm": 1.7375619411468506, |
| "learning_rate": 9.490609145108976e-07, |
| "loss": 0.3945, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.807095173050655, |
| "grad_norm": 1.7544565200805664, |
| "learning_rate": 9.43222918267342e-07, |
| "loss": 0.4139, |
| "step": 1311 |
| }, |
| { |
| "epoch": 0.8077108062871544, |
| "grad_norm": 1.699823021888733, |
| "learning_rate": 9.374010626220908e-07, |
| "loss": 0.3667, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.8083264395236538, |
| "grad_norm": 1.675127387046814, |
| "learning_rate": 9.31595370738414e-07, |
| "loss": 0.3797, |
| "step": 1313 |
| }, |
| { |
| "epoch": 0.8089420727601532, |
| "grad_norm": 1.7964922189712524, |
| "learning_rate": 9.258058657152763e-07, |
| "loss": 0.4127, |
| "step": 1314 |
| }, |
| { |
| "epoch": 0.8095577059966524, |
| "grad_norm": 1.7282572984695435, |
| "learning_rate": 9.200325705872342e-07, |
| "loss": 0.3814, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.8101733392331518, |
| "grad_norm": 1.6478873491287231, |
| "learning_rate": 9.142755083243577e-07, |
| "loss": 0.3766, |
| "step": 1316 |
| }, |
| { |
| "epoch": 0.8107889724696512, |
| "grad_norm": 1.785675287246704, |
| "learning_rate": 9.085347018321255e-07, |
| "loss": 0.4374, |
| "step": 1317 |
| }, |
| { |
| "epoch": 0.8114046057061506, |
| "grad_norm": 1.7040303945541382, |
| "learning_rate": 9.028101739513406e-07, |
| "loss": 0.3802, |
| "step": 1318 |
| }, |
| { |
| "epoch": 0.81202023894265, |
| "grad_norm": 1.6498903036117554, |
| "learning_rate": 8.971019474580428e-07, |
| "loss": 0.3946, |
| "step": 1319 |
| }, |
| { |
| "epoch": 0.8126358721791492, |
| "grad_norm": 1.7775410413742065, |
| "learning_rate": 8.914100450634089e-07, |
| "loss": 0.3782, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.8132515054156486, |
| "grad_norm": 1.778093695640564, |
| "learning_rate": 8.857344894136715e-07, |
| "loss": 0.4015, |
| "step": 1321 |
| }, |
| { |
| "epoch": 0.813867138652148, |
| "grad_norm": 1.5908254384994507, |
| "learning_rate": 8.800753030900228e-07, |
| "loss": 0.3668, |
| "step": 1322 |
| }, |
| { |
| "epoch": 0.8144827718886474, |
| "grad_norm": 1.6788358688354492, |
| "learning_rate": 8.744325086085248e-07, |
| "loss": 0.3819, |
| "step": 1323 |
| }, |
| { |
| "epoch": 0.8150984051251466, |
| "grad_norm": 1.5763933658599854, |
| "learning_rate": 8.688061284200266e-07, |
| "loss": 0.3579, |
| "step": 1324 |
| }, |
| { |
| "epoch": 0.815714038361646, |
| "grad_norm": 1.725440263748169, |
| "learning_rate": 8.631961849100651e-07, |
| "loss": 0.371, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.8163296715981454, |
| "grad_norm": 1.601870059967041, |
| "learning_rate": 8.576027003987842e-07, |
| "loss": 0.3488, |
| "step": 1326 |
| }, |
| { |
| "epoch": 0.8169453048346448, |
| "grad_norm": 1.610034704208374, |
| "learning_rate": 8.520256971408453e-07, |
| "loss": 0.3793, |
| "step": 1327 |
| }, |
| { |
| "epoch": 0.8175609380711442, |
| "grad_norm": 1.7768540382385254, |
| "learning_rate": 8.464651973253269e-07, |
| "loss": 0.3988, |
| "step": 1328 |
| }, |
| { |
| "epoch": 0.8181765713076434, |
| "grad_norm": 1.7675001621246338, |
| "learning_rate": 8.409212230756564e-07, |
| "loss": 0.4025, |
| "step": 1329 |
| }, |
| { |
| "epoch": 0.8187922045441428, |
| "grad_norm": 1.7898603677749634, |
| "learning_rate": 8.353937964495029e-07, |
| "loss": 0.3879, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.8194078377806422, |
| "grad_norm": 1.6345689296722412, |
| "learning_rate": 8.298829394387032e-07, |
| "loss": 0.3738, |
| "step": 1331 |
| }, |
| { |
| "epoch": 0.8200234710171416, |
| "grad_norm": 1.5670359134674072, |
| "learning_rate": 8.243886739691703e-07, |
| "loss": 0.373, |
| "step": 1332 |
| }, |
| { |
| "epoch": 0.8206391042536408, |
| "grad_norm": 1.689761757850647, |
| "learning_rate": 8.189110219007967e-07, |
| "loss": 0.3823, |
| "step": 1333 |
| }, |
| { |
| "epoch": 0.8212547374901402, |
| "grad_norm": 1.6478036642074585, |
| "learning_rate": 8.134500050273841e-07, |
| "loss": 0.3727, |
| "step": 1334 |
| }, |
| { |
| "epoch": 0.8218703707266396, |
| "grad_norm": 1.914725422859192, |
| "learning_rate": 8.080056450765427e-07, |
| "loss": 0.4018, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.822486003963139, |
| "grad_norm": 1.8748279809951782, |
| "learning_rate": 8.025779637096138e-07, |
| "loss": 0.4253, |
| "step": 1336 |
| }, |
| { |
| "epoch": 0.8231016371996384, |
| "grad_norm": 1.621164321899414, |
| "learning_rate": 7.971669825215789e-07, |
| "loss": 0.3821, |
| "step": 1337 |
| }, |
| { |
| "epoch": 0.8237172704361376, |
| "grad_norm": 1.6918919086456299, |
| "learning_rate": 7.917727230409739e-07, |
| "loss": 0.3825, |
| "step": 1338 |
| }, |
| { |
| "epoch": 0.824332903672637, |
| "grad_norm": 1.6017346382141113, |
| "learning_rate": 7.863952067298042e-07, |
| "loss": 0.3639, |
| "step": 1339 |
| }, |
| { |
| "epoch": 0.8249485369091364, |
| "grad_norm": 1.6089705228805542, |
| "learning_rate": 7.810344549834625e-07, |
| "loss": 0.3662, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.8255641701456358, |
| "grad_norm": 1.766654372215271, |
| "learning_rate": 7.756904891306366e-07, |
| "loss": 0.3862, |
| "step": 1341 |
| }, |
| { |
| "epoch": 0.8261798033821351, |
| "grad_norm": 1.6718283891677856, |
| "learning_rate": 7.70363330433233e-07, |
| "loss": 0.3908, |
| "step": 1342 |
| }, |
| { |
| "epoch": 0.8267954366186344, |
| "grad_norm": 1.629204511642456, |
| "learning_rate": 7.650530000862849e-07, |
| "loss": 0.3617, |
| "step": 1343 |
| }, |
| { |
| "epoch": 0.8274110698551338, |
| "grad_norm": 1.6336185932159424, |
| "learning_rate": 7.597595192178702e-07, |
| "loss": 0.3991, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.8280267030916332, |
| "grad_norm": 1.7665934562683105, |
| "learning_rate": 7.544829088890326e-07, |
| "loss": 0.3823, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.8286423363281326, |
| "grad_norm": 1.7317899465560913, |
| "learning_rate": 7.492231900936886e-07, |
| "loss": 0.3805, |
| "step": 1346 |
| }, |
| { |
| "epoch": 0.8292579695646318, |
| "grad_norm": 1.8492108583450317, |
| "learning_rate": 7.439803837585524e-07, |
| "loss": 0.3662, |
| "step": 1347 |
| }, |
| { |
| "epoch": 0.8298736028011312, |
| "grad_norm": 1.671828269958496, |
| "learning_rate": 7.387545107430455e-07, |
| "loss": 0.3933, |
| "step": 1348 |
| }, |
| { |
| "epoch": 0.8304892360376306, |
| "grad_norm": 1.697264552116394, |
| "learning_rate": 7.33545591839222e-07, |
| "loss": 0.3553, |
| "step": 1349 |
| }, |
| { |
| "epoch": 0.83110486927413, |
| "grad_norm": 1.8597784042358398, |
| "learning_rate": 7.283536477716763e-07, |
| "loss": 0.402, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.8317205025106293, |
| "grad_norm": 1.7416291236877441, |
| "learning_rate": 7.23178699197467e-07, |
| "loss": 0.3691, |
| "step": 1351 |
| }, |
| { |
| "epoch": 0.8323361357471286, |
| "grad_norm": 1.6027207374572754, |
| "learning_rate": 7.180207667060352e-07, |
| "loss": 0.3671, |
| "step": 1352 |
| }, |
| { |
| "epoch": 0.832951768983628, |
| "grad_norm": 1.605884075164795, |
| "learning_rate": 7.12879870819117e-07, |
| "loss": 0.3578, |
| "step": 1353 |
| }, |
| { |
| "epoch": 0.8335674022201274, |
| "grad_norm": 1.5832176208496094, |
| "learning_rate": 7.077560319906696e-07, |
| "loss": 0.3545, |
| "step": 1354 |
| }, |
| { |
| "epoch": 0.8341830354566268, |
| "grad_norm": 1.7259970903396606, |
| "learning_rate": 7.026492706067823e-07, |
| "loss": 0.3544, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.834798668693126, |
| "grad_norm": 1.8300843238830566, |
| "learning_rate": 6.975596069855983e-07, |
| "loss": 0.3984, |
| "step": 1356 |
| }, |
| { |
| "epoch": 0.8354143019296254, |
| "grad_norm": 1.8081567287445068, |
| "learning_rate": 6.924870613772388e-07, |
| "loss": 0.395, |
| "step": 1357 |
| }, |
| { |
| "epoch": 0.8360299351661248, |
| "grad_norm": 1.7496634721755981, |
| "learning_rate": 6.874316539637127e-07, |
| "loss": 0.3751, |
| "step": 1358 |
| }, |
| { |
| "epoch": 0.8366455684026242, |
| "grad_norm": 1.7504856586456299, |
| "learning_rate": 6.82393404858846e-07, |
| "loss": 0.4058, |
| "step": 1359 |
| }, |
| { |
| "epoch": 0.8372612016391235, |
| "grad_norm": 1.6938621997833252, |
| "learning_rate": 6.773723341081945e-07, |
| "loss": 0.3958, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.8378768348756228, |
| "grad_norm": 1.6554718017578125, |
| "learning_rate": 6.723684616889664e-07, |
| "loss": 0.3718, |
| "step": 1361 |
| }, |
| { |
| "epoch": 0.8384924681121222, |
| "grad_norm": 1.7190313339233398, |
| "learning_rate": 6.673818075099475e-07, |
| "loss": 0.3915, |
| "step": 1362 |
| }, |
| { |
| "epoch": 0.8391081013486216, |
| "grad_norm": 1.721051812171936, |
| "learning_rate": 6.624123914114122e-07, |
| "loss": 0.3565, |
| "step": 1363 |
| }, |
| { |
| "epoch": 0.839723734585121, |
| "grad_norm": 1.6990845203399658, |
| "learning_rate": 6.574602331650559e-07, |
| "loss": 0.3871, |
| "step": 1364 |
| }, |
| { |
| "epoch": 0.8403393678216202, |
| "grad_norm": 1.6603196859359741, |
| "learning_rate": 6.52525352473905e-07, |
| "loss": 0.3791, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.8409550010581196, |
| "grad_norm": 1.9718493223190308, |
| "learning_rate": 6.476077689722487e-07, |
| "loss": 0.3968, |
| "step": 1366 |
| }, |
| { |
| "epoch": 0.841570634294619, |
| "grad_norm": 1.744043231010437, |
| "learning_rate": 6.427075022255547e-07, |
| "loss": 0.3897, |
| "step": 1367 |
| }, |
| { |
| "epoch": 0.8421862675311184, |
| "grad_norm": 1.7016851902008057, |
| "learning_rate": 6.378245717303899e-07, |
| "loss": 0.3967, |
| "step": 1368 |
| }, |
| { |
| "epoch": 0.8428019007676177, |
| "grad_norm": 1.7479567527770996, |
| "learning_rate": 6.329589969143518e-07, |
| "loss": 0.3832, |
| "step": 1369 |
| }, |
| { |
| "epoch": 0.843417534004117, |
| "grad_norm": 1.6406941413879395, |
| "learning_rate": 6.281107971359801e-07, |
| "loss": 0.3965, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.8440331672406164, |
| "grad_norm": 1.6212124824523926, |
| "learning_rate": 6.232799916846888e-07, |
| "loss": 0.3874, |
| "step": 1371 |
| }, |
| { |
| "epoch": 0.8446488004771158, |
| "grad_norm": 1.7557140588760376, |
| "learning_rate": 6.184665997806832e-07, |
| "loss": 0.3894, |
| "step": 1372 |
| }, |
| { |
| "epoch": 0.8452644337136151, |
| "grad_norm": 1.7088721990585327, |
| "learning_rate": 6.136706405748838e-07, |
| "loss": 0.3955, |
| "step": 1373 |
| }, |
| { |
| "epoch": 0.8458800669501144, |
| "grad_norm": 1.8180969953536987, |
| "learning_rate": 6.088921331488568e-07, |
| "loss": 0.3946, |
| "step": 1374 |
| }, |
| { |
| "epoch": 0.8464957001866138, |
| "grad_norm": 1.5999853610992432, |
| "learning_rate": 6.041310965147318e-07, |
| "loss": 0.3843, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.8471113334231132, |
| "grad_norm": 1.669258952140808, |
| "learning_rate": 5.993875496151253e-07, |
| "loss": 0.3811, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.8477269666596126, |
| "grad_norm": 1.7172578573226929, |
| "learning_rate": 5.94661511323072e-07, |
| "loss": 0.4119, |
| "step": 1377 |
| }, |
| { |
| "epoch": 0.8483425998961119, |
| "grad_norm": 1.7204616069793701, |
| "learning_rate": 5.899530004419396e-07, |
| "loss": 0.4112, |
| "step": 1378 |
| }, |
| { |
| "epoch": 0.8489582331326112, |
| "grad_norm": 1.6229082345962524, |
| "learning_rate": 5.852620357053651e-07, |
| "loss": 0.3582, |
| "step": 1379 |
| }, |
| { |
| "epoch": 0.8495738663691106, |
| "grad_norm": 1.6449095010757446, |
| "learning_rate": 5.80588635777175e-07, |
| "loss": 0.3719, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.85018949960561, |
| "grad_norm": 1.5927188396453857, |
| "learning_rate": 5.759328192513075e-07, |
| "loss": 0.3786, |
| "step": 1381 |
| }, |
| { |
| "epoch": 0.8508051328421093, |
| "grad_norm": 1.69962477684021, |
| "learning_rate": 5.71294604651747e-07, |
| "loss": 0.3991, |
| "step": 1382 |
| }, |
| { |
| "epoch": 0.8514207660786086, |
| "grad_norm": 1.7231343984603882, |
| "learning_rate": 5.666740104324392e-07, |
| "loss": 0.4029, |
| "step": 1383 |
| }, |
| { |
| "epoch": 0.852036399315108, |
| "grad_norm": 1.570091724395752, |
| "learning_rate": 5.620710549772295e-07, |
| "loss": 0.3688, |
| "step": 1384 |
| }, |
| { |
| "epoch": 0.8526520325516074, |
| "grad_norm": 1.7892401218414307, |
| "learning_rate": 5.574857565997838e-07, |
| "loss": 0.4016, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.8532676657881068, |
| "grad_norm": 1.714543342590332, |
| "learning_rate": 5.529181335435124e-07, |
| "loss": 0.3883, |
| "step": 1386 |
| }, |
| { |
| "epoch": 0.8538832990246061, |
| "grad_norm": 1.5173468589782715, |
| "learning_rate": 5.483682039815059e-07, |
| "loss": 0.3706, |
| "step": 1387 |
| }, |
| { |
| "epoch": 0.8544989322611054, |
| "grad_norm": 1.5355753898620605, |
| "learning_rate": 5.438359860164555e-07, |
| "loss": 0.3557, |
| "step": 1388 |
| }, |
| { |
| "epoch": 0.8551145654976048, |
| "grad_norm": 1.6694539785385132, |
| "learning_rate": 5.393214976805833e-07, |
| "loss": 0.3878, |
| "step": 1389 |
| }, |
| { |
| "epoch": 0.8557301987341042, |
| "grad_norm": 1.6121852397918701, |
| "learning_rate": 5.348247569355736e-07, |
| "loss": 0.3804, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.8563458319706035, |
| "grad_norm": 1.6395108699798584, |
| "learning_rate": 5.303457816724955e-07, |
| "loss": 0.3602, |
| "step": 1391 |
| }, |
| { |
| "epoch": 0.8569614652071029, |
| "grad_norm": 1.6895431280136108, |
| "learning_rate": 5.258845897117387e-07, |
| "loss": 0.3834, |
| "step": 1392 |
| }, |
| { |
| "epoch": 0.8575770984436022, |
| "grad_norm": 1.9071327447891235, |
| "learning_rate": 5.214411988029355e-07, |
| "loss": 0.3926, |
| "step": 1393 |
| }, |
| { |
| "epoch": 0.8581927316801016, |
| "grad_norm": 1.610316514968872, |
| "learning_rate": 5.17015626624896e-07, |
| "loss": 0.3709, |
| "step": 1394 |
| }, |
| { |
| "epoch": 0.858808364916601, |
| "grad_norm": 1.5735969543457031, |
| "learning_rate": 5.126078907855342e-07, |
| "loss": 0.3613, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.8594239981531003, |
| "grad_norm": 1.9769666194915771, |
| "learning_rate": 5.082180088217981e-07, |
| "loss": 0.4104, |
| "step": 1396 |
| }, |
| { |
| "epoch": 0.8600396313895996, |
| "grad_norm": 1.7611583471298218, |
| "learning_rate": 5.038459981996036e-07, |
| "loss": 0.3717, |
| "step": 1397 |
| }, |
| { |
| "epoch": 0.860655264626099, |
| "grad_norm": 1.6172531843185425, |
| "learning_rate": 4.994918763137596e-07, |
| "loss": 0.3812, |
| "step": 1398 |
| }, |
| { |
| "epoch": 0.8612708978625984, |
| "grad_norm": 1.7320444583892822, |
| "learning_rate": 4.951556604879049e-07, |
| "loss": 0.3781, |
| "step": 1399 |
| }, |
| { |
| "epoch": 0.8618865310990977, |
| "grad_norm": 1.697505235671997, |
| "learning_rate": 4.908373679744316e-07, |
| "loss": 0.3847, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8618865310990977, |
| "eval_loss": 0.3799481987953186, |
| "eval_runtime": 117.871, |
| "eval_samples_per_second": 35.641, |
| "eval_steps_per_second": 4.463, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.8625021643355971, |
| "grad_norm": 1.6632241010665894, |
| "learning_rate": 4.865370159544236e-07, |
| "loss": 0.384, |
| "step": 1401 |
| }, |
| { |
| "epoch": 0.8631177975720964, |
| "grad_norm": 1.6136724948883057, |
| "learning_rate": 4.822546215375851e-07, |
| "loss": 0.3654, |
| "step": 1402 |
| }, |
| { |
| "epoch": 0.8637334308085958, |
| "grad_norm": 1.6867445707321167, |
| "learning_rate": 4.779902017621718e-07, |
| "loss": 0.3938, |
| "step": 1403 |
| }, |
| { |
| "epoch": 0.8643490640450952, |
| "grad_norm": 1.618415117263794, |
| "learning_rate": 4.737437735949263e-07, |
| "loss": 0.3642, |
| "step": 1404 |
| }, |
| { |
| "epoch": 0.8649646972815945, |
| "grad_norm": 1.6127417087554932, |
| "learning_rate": 4.6951535393100654e-07, |
| "loss": 0.3645, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.8655803305180938, |
| "grad_norm": 1.56674063205719, |
| "learning_rate": 4.653049595939191e-07, |
| "loss": 0.3645, |
| "step": 1406 |
| }, |
| { |
| "epoch": 0.8661959637545932, |
| "grad_norm": 1.5660558938980103, |
| "learning_rate": 4.6111260733545714e-07, |
| "loss": 0.3577, |
| "step": 1407 |
| }, |
| { |
| "epoch": 0.8668115969910926, |
| "grad_norm": 1.7765249013900757, |
| "learning_rate": 4.569383138356276e-07, |
| "loss": 0.3888, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.8674272302275919, |
| "grad_norm": 1.646727442741394, |
| "learning_rate": 4.5278209570258914e-07, |
| "loss": 0.3965, |
| "step": 1409 |
| }, |
| { |
| "epoch": 0.8680428634640913, |
| "grad_norm": 1.8428785800933838, |
| "learning_rate": 4.486439694725858e-07, |
| "loss": 0.4066, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.8686584967005906, |
| "grad_norm": 1.605111837387085, |
| "learning_rate": 4.4452395160987314e-07, |
| "loss": 0.3915, |
| "step": 1411 |
| }, |
| { |
| "epoch": 0.86927412993709, |
| "grad_norm": 1.6525288820266724, |
| "learning_rate": 4.404220585066671e-07, |
| "loss": 0.3705, |
| "step": 1412 |
| }, |
| { |
| "epoch": 0.8698897631735893, |
| "grad_norm": 1.6098047494888306, |
| "learning_rate": 4.3633830648306675e-07, |
| "loss": 0.3834, |
| "step": 1413 |
| }, |
| { |
| "epoch": 0.8705053964100887, |
| "grad_norm": 1.6825004816055298, |
| "learning_rate": 4.322727117869951e-07, |
| "loss": 0.3743, |
| "step": 1414 |
| }, |
| { |
| "epoch": 0.871121029646588, |
| "grad_norm": 1.6571918725967407, |
| "learning_rate": 4.282252905941342e-07, |
| "loss": 0.3528, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.8717366628830874, |
| "grad_norm": 1.560193419456482, |
| "learning_rate": 4.2419605900785755e-07, |
| "loss": 0.3714, |
| "step": 1416 |
| }, |
| { |
| "epoch": 0.8723522961195868, |
| "grad_norm": 1.694968819618225, |
| "learning_rate": 4.201850330591678e-07, |
| "loss": 0.4058, |
| "step": 1417 |
| }, |
| { |
| "epoch": 0.8729679293560861, |
| "grad_norm": 1.742849588394165, |
| "learning_rate": 4.16192228706635e-07, |
| "loss": 0.4102, |
| "step": 1418 |
| }, |
| { |
| "epoch": 0.8735835625925855, |
| "grad_norm": 1.7649222612380981, |
| "learning_rate": 4.122176618363305e-07, |
| "loss": 0.3816, |
| "step": 1419 |
| }, |
| { |
| "epoch": 0.8741991958290848, |
| "grad_norm": 1.692421555519104, |
| "learning_rate": 4.082613482617664e-07, |
| "loss": 0.3759, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.8748148290655842, |
| "grad_norm": 1.821478247642517, |
| "learning_rate": 4.043233037238281e-07, |
| "loss": 0.3886, |
| "step": 1421 |
| }, |
| { |
| "epoch": 0.8754304623020835, |
| "grad_norm": 1.5918692350387573, |
| "learning_rate": 4.0040354389071613e-07, |
| "loss": 0.3695, |
| "step": 1422 |
| }, |
| { |
| "epoch": 0.8760460955385829, |
| "grad_norm": 1.6844005584716797, |
| "learning_rate": 3.965020843578804e-07, |
| "loss": 0.3911, |
| "step": 1423 |
| }, |
| { |
| "epoch": 0.8766617287750822, |
| "grad_norm": 1.5743554830551147, |
| "learning_rate": 3.9261894064796136e-07, |
| "loss": 0.3598, |
| "step": 1424 |
| }, |
| { |
| "epoch": 0.8772773620115816, |
| "grad_norm": 1.627099633216858, |
| "learning_rate": 3.8875412821072875e-07, |
| "loss": 0.3787, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.877892995248081, |
| "grad_norm": 1.6978716850280762, |
| "learning_rate": 3.8490766242301356e-07, |
| "loss": 0.3838, |
| "step": 1426 |
| }, |
| { |
| "epoch": 0.8785086284845803, |
| "grad_norm": 1.640041470527649, |
| "learning_rate": 3.810795585886551e-07, |
| "loss": 0.3756, |
| "step": 1427 |
| }, |
| { |
| "epoch": 0.8791242617210797, |
| "grad_norm": 1.7754591703414917, |
| "learning_rate": 3.772698319384349e-07, |
| "loss": 0.4062, |
| "step": 1428 |
| }, |
| { |
| "epoch": 0.879739894957579, |
| "grad_norm": 1.700578212738037, |
| "learning_rate": 3.734784976300165e-07, |
| "loss": 0.3834, |
| "step": 1429 |
| }, |
| { |
| "epoch": 0.8803555281940784, |
| "grad_norm": 1.6692880392074585, |
| "learning_rate": 3.6970557074788913e-07, |
| "loss": 0.3648, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.8809711614305777, |
| "grad_norm": 1.7037373781204224, |
| "learning_rate": 3.6595106630330277e-07, |
| "loss": 0.3955, |
| "step": 1431 |
| }, |
| { |
| "epoch": 0.8815867946670771, |
| "grad_norm": 1.6975781917572021, |
| "learning_rate": 3.6221499923421164e-07, |
| "loss": 0.3978, |
| "step": 1432 |
| }, |
| { |
| "epoch": 0.8822024279035764, |
| "grad_norm": 1.6766349077224731, |
| "learning_rate": 3.5849738440521254e-07, |
| "loss": 0.3781, |
| "step": 1433 |
| }, |
| { |
| "epoch": 0.8828180611400758, |
| "grad_norm": 1.649646282196045, |
| "learning_rate": 3.5479823660748703e-07, |
| "loss": 0.385, |
| "step": 1434 |
| }, |
| { |
| "epoch": 0.8834336943765752, |
| "grad_norm": 1.8312703371047974, |
| "learning_rate": 3.511175705587433e-07, |
| "loss": 0.3806, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.8840493276130745, |
| "grad_norm": 1.7184886932373047, |
| "learning_rate": 3.4745540090315556e-07, |
| "loss": 0.4092, |
| "step": 1436 |
| }, |
| { |
| "epoch": 0.8846649608495739, |
| "grad_norm": 1.6493487358093262, |
| "learning_rate": 3.4381174221130796e-07, |
| "loss": 0.3742, |
| "step": 1437 |
| }, |
| { |
| "epoch": 0.8852805940860732, |
| "grad_norm": 1.7439310550689697, |
| "learning_rate": 3.4018660898013423e-07, |
| "loss": 0.4147, |
| "step": 1438 |
| }, |
| { |
| "epoch": 0.8858962273225726, |
| "grad_norm": 1.649951696395874, |
| "learning_rate": 3.365800156328619e-07, |
| "loss": 0.3849, |
| "step": 1439 |
| }, |
| { |
| "epoch": 0.8865118605590719, |
| "grad_norm": 1.6006723642349243, |
| "learning_rate": 3.329919765189554e-07, |
| "loss": 0.3673, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.8871274937955713, |
| "grad_norm": 1.630724549293518, |
| "learning_rate": 3.2942250591405546e-07, |
| "loss": 0.3806, |
| "step": 1441 |
| }, |
| { |
| "epoch": 0.8877431270320706, |
| "grad_norm": 1.649955153465271, |
| "learning_rate": 3.258716180199278e-07, |
| "loss": 0.3725, |
| "step": 1442 |
| }, |
| { |
| "epoch": 0.88835876026857, |
| "grad_norm": 1.5823496580123901, |
| "learning_rate": 3.2233932696440096e-07, |
| "loss": 0.3574, |
| "step": 1443 |
| }, |
| { |
| "epoch": 0.8889743935050693, |
| "grad_norm": 1.649905800819397, |
| "learning_rate": 3.18825646801314e-07, |
| "loss": 0.3625, |
| "step": 1444 |
| }, |
| { |
| "epoch": 0.8895900267415687, |
| "grad_norm": 1.7206919193267822, |
| "learning_rate": 3.153305915104593e-07, |
| "loss": 0.3922, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.8902056599780681, |
| "grad_norm": 1.6427253484725952, |
| "learning_rate": 3.118541749975257e-07, |
| "loss": 0.3815, |
| "step": 1446 |
| }, |
| { |
| "epoch": 0.8908212932145674, |
| "grad_norm": 1.593934416770935, |
| "learning_rate": 3.0839641109404627e-07, |
| "loss": 0.37, |
| "step": 1447 |
| }, |
| { |
| "epoch": 0.8914369264510668, |
| "grad_norm": 1.6241251230239868, |
| "learning_rate": 3.0495731355733915e-07, |
| "loss": 0.4025, |
| "step": 1448 |
| }, |
| { |
| "epoch": 0.8920525596875661, |
| "grad_norm": 1.6705073118209839, |
| "learning_rate": 3.015368960704584e-07, |
| "loss": 0.374, |
| "step": 1449 |
| }, |
| { |
| "epoch": 0.8926681929240655, |
| "grad_norm": 1.6619371175765991, |
| "learning_rate": 2.9813517224213274e-07, |
| "loss": 0.3668, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.8932838261605649, |
| "grad_norm": 1.7148122787475586, |
| "learning_rate": 2.947521556067162e-07, |
| "loss": 0.3972, |
| "step": 1451 |
| }, |
| { |
| "epoch": 0.8938994593970642, |
| "grad_norm": 1.7702488899230957, |
| "learning_rate": 2.913878596241343e-07, |
| "loss": 0.3788, |
| "step": 1452 |
| }, |
| { |
| "epoch": 0.8945150926335635, |
| "grad_norm": 1.6259140968322754, |
| "learning_rate": 2.8804229767982637e-07, |
| "loss": 0.3687, |
| "step": 1453 |
| }, |
| { |
| "epoch": 0.8951307258700629, |
| "grad_norm": 1.8274110555648804, |
| "learning_rate": 2.847154830846971e-07, |
| "loss": 0.417, |
| "step": 1454 |
| }, |
| { |
| "epoch": 0.8957463591065623, |
| "grad_norm": 1.8136776685714722, |
| "learning_rate": 2.8140742907506403e-07, |
| "loss": 0.3772, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.8963619923430616, |
| "grad_norm": 1.7630428075790405, |
| "learning_rate": 2.7811814881259503e-07, |
| "loss": 0.3994, |
| "step": 1456 |
| }, |
| { |
| "epoch": 0.896977625579561, |
| "grad_norm": 1.6697845458984375, |
| "learning_rate": 2.748476553842711e-07, |
| "loss": 0.3783, |
| "step": 1457 |
| }, |
| { |
| "epoch": 0.8975932588160603, |
| "grad_norm": 1.7866156101226807, |
| "learning_rate": 2.715959618023212e-07, |
| "loss": 0.3926, |
| "step": 1458 |
| }, |
| { |
| "epoch": 0.8982088920525597, |
| "grad_norm": 1.6477011442184448, |
| "learning_rate": 2.6836308100417874e-07, |
| "loss": 0.3921, |
| "step": 1459 |
| }, |
| { |
| "epoch": 0.8988245252890591, |
| "grad_norm": 1.6541386842727661, |
| "learning_rate": 2.651490258524281e-07, |
| "loss": 0.3612, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.8994401585255584, |
| "grad_norm": 1.728593349456787, |
| "learning_rate": 2.619538091347473e-07, |
| "loss": 0.3688, |
| "step": 1461 |
| }, |
| { |
| "epoch": 0.9000557917620577, |
| "grad_norm": 1.6868138313293457, |
| "learning_rate": 2.587774435638679e-07, |
| "loss": 0.3673, |
| "step": 1462 |
| }, |
| { |
| "epoch": 0.9006714249985571, |
| "grad_norm": 1.7073603868484497, |
| "learning_rate": 2.556199417775174e-07, |
| "loss": 0.3852, |
| "step": 1463 |
| }, |
| { |
| "epoch": 0.9012870582350565, |
| "grad_norm": 1.7586218118667603, |
| "learning_rate": 2.524813163383683e-07, |
| "loss": 0.4101, |
| "step": 1464 |
| }, |
| { |
| "epoch": 0.9019026914715558, |
| "grad_norm": 1.718984603881836, |
| "learning_rate": 2.4936157973399266e-07, |
| "loss": 0.3901, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.9025183247080552, |
| "grad_norm": 1.692847728729248, |
| "learning_rate": 2.4626074437680836e-07, |
| "loss": 0.3694, |
| "step": 1466 |
| }, |
| { |
| "epoch": 0.9031339579445545, |
| "grad_norm": 1.7062618732452393, |
| "learning_rate": 2.431788226040327e-07, |
| "loss": 0.3863, |
| "step": 1467 |
| }, |
| { |
| "epoch": 0.9037495911810539, |
| "grad_norm": 1.7544633150100708, |
| "learning_rate": 2.40115826677631e-07, |
| "loss": 0.389, |
| "step": 1468 |
| }, |
| { |
| "epoch": 0.9043652244175533, |
| "grad_norm": 1.732604742050171, |
| "learning_rate": 2.3707176878426886e-07, |
| "loss": 0.4039, |
| "step": 1469 |
| }, |
| { |
| "epoch": 0.9049808576540526, |
| "grad_norm": 1.642321228981018, |
| "learning_rate": 2.3404666103526542e-07, |
| "loss": 0.3708, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.9055964908905519, |
| "grad_norm": 1.707271933555603, |
| "learning_rate": 2.3104051546654016e-07, |
| "loss": 0.383, |
| "step": 1471 |
| }, |
| { |
| "epoch": 0.9062121241270513, |
| "grad_norm": 1.650363802909851, |
| "learning_rate": 2.280533440385696e-07, |
| "loss": 0.3985, |
| "step": 1472 |
| }, |
| { |
| "epoch": 0.9068277573635507, |
| "grad_norm": 1.6128193140029907, |
| "learning_rate": 2.2508515863634062e-07, |
| "loss": 0.3699, |
| "step": 1473 |
| }, |
| { |
| "epoch": 0.90744339060005, |
| "grad_norm": 1.687888503074646, |
| "learning_rate": 2.2213597106929608e-07, |
| "loss": 0.3824, |
| "step": 1474 |
| }, |
| { |
| "epoch": 0.9080590238365494, |
| "grad_norm": 1.7968963384628296, |
| "learning_rate": 2.1920579307129818e-07, |
| "loss": 0.3833, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.9086746570730487, |
| "grad_norm": 1.6406185626983643, |
| "learning_rate": 2.1629463630057136e-07, |
| "loss": 0.3858, |
| "step": 1476 |
| }, |
| { |
| "epoch": 0.9092902903095481, |
| "grad_norm": 1.5874775648117065, |
| "learning_rate": 2.134025123396638e-07, |
| "loss": 0.3473, |
| "step": 1477 |
| }, |
| { |
| "epoch": 0.9099059235460475, |
| "grad_norm": 1.7503160238265991, |
| "learning_rate": 2.1052943269539716e-07, |
| "loss": 0.4037, |
| "step": 1478 |
| }, |
| { |
| "epoch": 0.9105215567825468, |
| "grad_norm": 1.679175615310669, |
| "learning_rate": 2.0767540879882143e-07, |
| "loss": 0.3891, |
| "step": 1479 |
| }, |
| { |
| "epoch": 0.9111371900190461, |
| "grad_norm": 1.7528096437454224, |
| "learning_rate": 2.0484045200517222e-07, |
| "loss": 0.382, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.9117528232555455, |
| "grad_norm": 1.7357664108276367, |
| "learning_rate": 2.0202457359381978e-07, |
| "loss": 0.3854, |
| "step": 1481 |
| }, |
| { |
| "epoch": 0.9123684564920449, |
| "grad_norm": 1.7275396585464478, |
| "learning_rate": 1.9922778476823167e-07, |
| "loss": 0.3966, |
| "step": 1482 |
| }, |
| { |
| "epoch": 0.9129840897285442, |
| "grad_norm": 1.6157152652740479, |
| "learning_rate": 1.9645009665592073e-07, |
| "loss": 0.3657, |
| "step": 1483 |
| }, |
| { |
| "epoch": 0.9135997229650435, |
| "grad_norm": 1.7594859600067139, |
| "learning_rate": 1.9369152030840553e-07, |
| "loss": 0.4016, |
| "step": 1484 |
| }, |
| { |
| "epoch": 0.9142153562015429, |
| "grad_norm": 1.7511507272720337, |
| "learning_rate": 1.9095206670116718e-07, |
| "loss": 0.4145, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.9148309894380423, |
| "grad_norm": 1.7000651359558105, |
| "learning_rate": 1.882317467335998e-07, |
| "loss": 0.3958, |
| "step": 1486 |
| }, |
| { |
| "epoch": 0.9154466226745417, |
| "grad_norm": 1.7108042240142822, |
| "learning_rate": 1.85530571228974e-07, |
| "loss": 0.3891, |
| "step": 1487 |
| }, |
| { |
| "epoch": 0.916062255911041, |
| "grad_norm": 1.7302545309066772, |
| "learning_rate": 1.8284855093438969e-07, |
| "loss": 0.3691, |
| "step": 1488 |
| }, |
| { |
| "epoch": 0.9166778891475403, |
| "grad_norm": 1.855621576309204, |
| "learning_rate": 1.801856965207338e-07, |
| "loss": 0.3857, |
| "step": 1489 |
| }, |
| { |
| "epoch": 0.9172935223840397, |
| "grad_norm": 1.662436842918396, |
| "learning_rate": 1.7754201858263987e-07, |
| "loss": 0.368, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.9179091556205391, |
| "grad_norm": 1.6870900392532349, |
| "learning_rate": 1.7491752763844294e-07, |
| "loss": 0.3869, |
| "step": 1491 |
| }, |
| { |
| "epoch": 0.9185247888570384, |
| "grad_norm": 1.6385629177093506, |
| "learning_rate": 1.7231223413014086e-07, |
| "loss": 0.372, |
| "step": 1492 |
| }, |
| { |
| "epoch": 0.9191404220935377, |
| "grad_norm": 1.694305419921875, |
| "learning_rate": 1.697261484233492e-07, |
| "loss": 0.389, |
| "step": 1493 |
| }, |
| { |
| "epoch": 0.9197560553300371, |
| "grad_norm": 1.7155510187149048, |
| "learning_rate": 1.6715928080726417e-07, |
| "loss": 0.3931, |
| "step": 1494 |
| }, |
| { |
| "epoch": 0.9203716885665365, |
| "grad_norm": 1.7123414278030396, |
| "learning_rate": 1.6461164149461805e-07, |
| "loss": 0.3886, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.9209873218030359, |
| "grad_norm": 1.6566325426101685, |
| "learning_rate": 1.6208324062163884e-07, |
| "loss": 0.3807, |
| "step": 1496 |
| }, |
| { |
| "epoch": 0.9216029550395352, |
| "grad_norm": 1.7428280115127563, |
| "learning_rate": 1.5957408824801347e-07, |
| "loss": 0.3841, |
| "step": 1497 |
| }, |
| { |
| "epoch": 0.9222185882760345, |
| "grad_norm": 1.6075456142425537, |
| "learning_rate": 1.5708419435684463e-07, |
| "loss": 0.3639, |
| "step": 1498 |
| }, |
| { |
| "epoch": 0.9228342215125339, |
| "grad_norm": 1.6050225496292114, |
| "learning_rate": 1.5461356885461077e-07, |
| "loss": 0.3555, |
| "step": 1499 |
| }, |
| { |
| "epoch": 0.9234498547490333, |
| "grad_norm": 1.565177083015442, |
| "learning_rate": 1.5216222157112826e-07, |
| "loss": 0.3785, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9234498547490333, |
| "eval_loss": 0.37673336267471313, |
| "eval_runtime": 118.7596, |
| "eval_samples_per_second": 35.374, |
| "eval_steps_per_second": 4.429, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9240654879855327, |
| "grad_norm": 1.634409785270691, |
| "learning_rate": 1.4973016225951097e-07, |
| "loss": 0.3773, |
| "step": 1501 |
| }, |
| { |
| "epoch": 0.9246811212220319, |
| "grad_norm": 1.7125306129455566, |
| "learning_rate": 1.4731740059613365e-07, |
| "loss": 0.3803, |
| "step": 1502 |
| }, |
| { |
| "epoch": 0.9252967544585313, |
| "grad_norm": 1.8356815576553345, |
| "learning_rate": 1.4492394618059234e-07, |
| "loss": 0.3966, |
| "step": 1503 |
| }, |
| { |
| "epoch": 0.9259123876950307, |
| "grad_norm": 1.6698647737503052, |
| "learning_rate": 1.4254980853566248e-07, |
| "loss": 0.362, |
| "step": 1504 |
| }, |
| { |
| "epoch": 0.9265280209315301, |
| "grad_norm": 1.796517014503479, |
| "learning_rate": 1.4019499710726913e-07, |
| "loss": 0.4043, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.9271436541680294, |
| "grad_norm": 1.8080421686172485, |
| "learning_rate": 1.3785952126444014e-07, |
| "loss": 0.3847, |
| "step": 1506 |
| }, |
| { |
| "epoch": 0.9277592874045287, |
| "grad_norm": 1.6595298051834106, |
| "learning_rate": 1.3554339029927532e-07, |
| "loss": 0.3827, |
| "step": 1507 |
| }, |
| { |
| "epoch": 0.9283749206410281, |
| "grad_norm": 1.7067153453826904, |
| "learning_rate": 1.3324661342690892e-07, |
| "loss": 0.3979, |
| "step": 1508 |
| }, |
| { |
| "epoch": 0.9289905538775275, |
| "grad_norm": 1.6506260633468628, |
| "learning_rate": 1.3096919978546842e-07, |
| "loss": 0.355, |
| "step": 1509 |
| }, |
| { |
| "epoch": 0.9296061871140269, |
| "grad_norm": 1.6815375089645386, |
| "learning_rate": 1.2871115843604508e-07, |
| "loss": 0.3635, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.9302218203505261, |
| "grad_norm": 1.6872223615646362, |
| "learning_rate": 1.264724983626492e-07, |
| "loss": 0.3799, |
| "step": 1511 |
| }, |
| { |
| "epoch": 0.9308374535870255, |
| "grad_norm": 1.5445810556411743, |
| "learning_rate": 1.2425322847218368e-07, |
| "loss": 0.3644, |
| "step": 1512 |
| }, |
| { |
| "epoch": 0.9314530868235249, |
| "grad_norm": 1.8322566747665405, |
| "learning_rate": 1.220533575944033e-07, |
| "loss": 0.4074, |
| "step": 1513 |
| }, |
| { |
| "epoch": 0.9320687200600243, |
| "grad_norm": 1.8397573232650757, |
| "learning_rate": 1.1987289448187777e-07, |
| "loss": 0.4239, |
| "step": 1514 |
| }, |
| { |
| "epoch": 0.9326843532965235, |
| "grad_norm": 1.6941214799880981, |
| "learning_rate": 1.1771184780996315e-07, |
| "loss": 0.3807, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.9332999865330229, |
| "grad_norm": 1.8412351608276367, |
| "learning_rate": 1.1557022617676217e-07, |
| "loss": 0.3952, |
| "step": 1516 |
| }, |
| { |
| "epoch": 0.9339156197695223, |
| "grad_norm": 1.6536684036254883, |
| "learning_rate": 1.1344803810309001e-07, |
| "loss": 0.3542, |
| "step": 1517 |
| }, |
| { |
| "epoch": 0.9345312530060217, |
| "grad_norm": 1.670566439628601, |
| "learning_rate": 1.1134529203244592e-07, |
| "loss": 0.3879, |
| "step": 1518 |
| }, |
| { |
| "epoch": 0.9351468862425211, |
| "grad_norm": 1.7458523511886597, |
| "learning_rate": 1.0926199633097156e-07, |
| "loss": 0.3742, |
| "step": 1519 |
| }, |
| { |
| "epoch": 0.9357625194790203, |
| "grad_norm": 1.7485136985778809, |
| "learning_rate": 1.071981592874255e-07, |
| "loss": 0.3796, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.9363781527155197, |
| "grad_norm": 1.7157516479492188, |
| "learning_rate": 1.0515378911314378e-07, |
| "loss": 0.3915, |
| "step": 1521 |
| }, |
| { |
| "epoch": 0.9369937859520191, |
| "grad_norm": 1.6509552001953125, |
| "learning_rate": 1.031288939420122e-07, |
| "loss": 0.3931, |
| "step": 1522 |
| }, |
| { |
| "epoch": 0.9376094191885185, |
| "grad_norm": 1.7180267572402954, |
| "learning_rate": 1.011234818304302e-07, |
| "loss": 0.403, |
| "step": 1523 |
| }, |
| { |
| "epoch": 0.9382250524250177, |
| "grad_norm": 1.630218744277954, |
| "learning_rate": 9.913756075728088e-08, |
| "loss": 0.3504, |
| "step": 1524 |
| }, |
| { |
| "epoch": 0.9388406856615171, |
| "grad_norm": 1.607527732849121, |
| "learning_rate": 9.717113862389993e-08, |
| "loss": 0.3572, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.9394563188980165, |
| "grad_norm": 1.72373628616333, |
| "learning_rate": 9.522422325404234e-08, |
| "loss": 0.3881, |
| "step": 1526 |
| }, |
| { |
| "epoch": 0.9400719521345159, |
| "grad_norm": 1.6644386053085327, |
| "learning_rate": 9.32968223938513e-08, |
| "loss": 0.3824, |
| "step": 1527 |
| }, |
| { |
| "epoch": 0.9406875853710153, |
| "grad_norm": 1.7010245323181152, |
| "learning_rate": 9.138894371182983e-08, |
| "loss": 0.3697, |
| "step": 1528 |
| }, |
| { |
| "epoch": 0.9413032186075145, |
| "grad_norm": 1.7236442565917969, |
| "learning_rate": 8.950059479880591e-08, |
| "loss": 0.398, |
| "step": 1529 |
| }, |
| { |
| "epoch": 0.9419188518440139, |
| "grad_norm": 1.6048997640609741, |
| "learning_rate": 8.7631783167908e-08, |
| "loss": 0.3533, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.9425344850805133, |
| "grad_norm": 1.529986023902893, |
| "learning_rate": 8.57825162545295e-08, |
| "loss": 0.3575, |
| "step": 1531 |
| }, |
| { |
| "epoch": 0.9431501183170127, |
| "grad_norm": 1.6632636785507202, |
| "learning_rate": 8.395280141630324e-08, |
| "loss": 0.3639, |
| "step": 1532 |
| }, |
| { |
| "epoch": 0.943765751553512, |
| "grad_norm": 1.7666611671447754, |
| "learning_rate": 8.214264593307097e-08, |
| "loss": 0.4111, |
| "step": 1533 |
| }, |
| { |
| "epoch": 0.9443813847900113, |
| "grad_norm": 1.7613328695297241, |
| "learning_rate": 8.035205700685167e-08, |
| "loss": 0.3867, |
| "step": 1534 |
| }, |
| { |
| "epoch": 0.9449970180265107, |
| "grad_norm": 1.7610583305358887, |
| "learning_rate": 7.85810417618188e-08, |
| "loss": 0.3847, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.9456126512630101, |
| "grad_norm": 1.7215214967727661, |
| "learning_rate": 7.682960724426592e-08, |
| "loss": 0.3978, |
| "step": 1536 |
| }, |
| { |
| "epoch": 0.9462282844995095, |
| "grad_norm": 1.7222514152526855, |
| "learning_rate": 7.509776042258166e-08, |
| "loss": 0.3871, |
| "step": 1537 |
| }, |
| { |
| "epoch": 0.9468439177360087, |
| "grad_norm": 1.7724417448043823, |
| "learning_rate": 7.338550818722367e-08, |
| "loss": 0.3539, |
| "step": 1538 |
| }, |
| { |
| "epoch": 0.9474595509725081, |
| "grad_norm": 1.6608505249023438, |
| "learning_rate": 7.169285735068531e-08, |
| "loss": 0.3768, |
| "step": 1539 |
| }, |
| { |
| "epoch": 0.9480751842090075, |
| "grad_norm": 1.6848286390304565, |
| "learning_rate": 7.001981464747565e-08, |
| "loss": 0.3879, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.9486908174455069, |
| "grad_norm": 1.7346253395080566, |
| "learning_rate": 6.83663867340878e-08, |
| "loss": 0.3887, |
| "step": 1541 |
| }, |
| { |
| "epoch": 0.9493064506820061, |
| "grad_norm": 1.8061364889144897, |
| "learning_rate": 6.673258018897455e-08, |
| "loss": 0.3684, |
| "step": 1542 |
| }, |
| { |
| "epoch": 0.9499220839185055, |
| "grad_norm": 1.6204763650894165, |
| "learning_rate": 6.511840151252169e-08, |
| "loss": 0.3895, |
| "step": 1543 |
| }, |
| { |
| "epoch": 0.9505377171550049, |
| "grad_norm": 1.61834716796875, |
| "learning_rate": 6.352385712702191e-08, |
| "loss": 0.3594, |
| "step": 1544 |
| }, |
| { |
| "epoch": 0.9511533503915043, |
| "grad_norm": 1.6177845001220703, |
| "learning_rate": 6.194895337664875e-08, |
| "loss": 0.3703, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.9517689836280037, |
| "grad_norm": 1.6411689519882202, |
| "learning_rate": 6.039369652743266e-08, |
| "loss": 0.3706, |
| "step": 1546 |
| }, |
| { |
| "epoch": 0.9523846168645029, |
| "grad_norm": 1.7305967807769775, |
| "learning_rate": 5.8858092767236084e-08, |
| "loss": 0.3757, |
| "step": 1547 |
| }, |
| { |
| "epoch": 0.9530002501010023, |
| "grad_norm": 1.7725684642791748, |
| "learning_rate": 5.734214820572737e-08, |
| "loss": 0.401, |
| "step": 1548 |
| }, |
| { |
| "epoch": 0.9536158833375017, |
| "grad_norm": 1.592667818069458, |
| "learning_rate": 5.584586887435739e-08, |
| "loss": 0.3622, |
| "step": 1549 |
| }, |
| { |
| "epoch": 0.9542315165740011, |
| "grad_norm": 1.591291904449463, |
| "learning_rate": 5.436926072633575e-08, |
| "loss": 0.367, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.9548471498105003, |
| "grad_norm": 1.700524091720581, |
| "learning_rate": 5.291232963660686e-08, |
| "loss": 0.3915, |
| "step": 1551 |
| }, |
| { |
| "epoch": 0.9554627830469997, |
| "grad_norm": 1.600724220275879, |
| "learning_rate": 5.1475081401825553e-08, |
| "loss": 0.3611, |
| "step": 1552 |
| }, |
| { |
| "epoch": 0.9560784162834991, |
| "grad_norm": 1.7572377920150757, |
| "learning_rate": 5.0057521740336515e-08, |
| "loss": 0.378, |
| "step": 1553 |
| }, |
| { |
| "epoch": 0.9566940495199985, |
| "grad_norm": 1.6119253635406494, |
| "learning_rate": 4.865965629214819e-08, |
| "loss": 0.3671, |
| "step": 1554 |
| }, |
| { |
| "epoch": 0.9573096827564979, |
| "grad_norm": 1.6394660472869873, |
| "learning_rate": 4.7281490618914516e-08, |
| "loss": 0.3738, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.9579253159929971, |
| "grad_norm": 1.7939733266830444, |
| "learning_rate": 4.5923030203908203e-08, |
| "loss": 0.3778, |
| "step": 1556 |
| }, |
| { |
| "epoch": 0.9585409492294965, |
| "grad_norm": 1.6165034770965576, |
| "learning_rate": 4.4584280452001914e-08, |
| "loss": 0.3462, |
| "step": 1557 |
| }, |
| { |
| "epoch": 0.9591565824659959, |
| "grad_norm": 1.738077998161316, |
| "learning_rate": 4.32652466896466e-08, |
| "loss": 0.4012, |
| "step": 1558 |
| }, |
| { |
| "epoch": 0.9597722157024953, |
| "grad_norm": 1.7063754796981812, |
| "learning_rate": 4.196593416484873e-08, |
| "loss": 0.3752, |
| "step": 1559 |
| }, |
| { |
| "epoch": 0.9603878489389946, |
| "grad_norm": 1.73617684841156, |
| "learning_rate": 4.068634804715088e-08, |
| "loss": 0.3808, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.9610034821754939, |
| "grad_norm": 1.6154147386550903, |
| "learning_rate": 3.9426493427611177e-08, |
| "loss": 0.3569, |
| "step": 1561 |
| }, |
| { |
| "epoch": 0.9616191154119933, |
| "grad_norm": 1.5781503915786743, |
| "learning_rate": 3.818637531878056e-08, |
| "loss": 0.3652, |
| "step": 1562 |
| }, |
| { |
| "epoch": 0.9622347486484927, |
| "grad_norm": 1.6501221656799316, |
| "learning_rate": 3.69659986546872e-08, |
| "loss": 0.3717, |
| "step": 1563 |
| }, |
| { |
| "epoch": 0.9628503818849921, |
| "grad_norm": 1.5871952772140503, |
| "learning_rate": 3.576536829081323e-08, |
| "loss": 0.3757, |
| "step": 1564 |
| }, |
| { |
| "epoch": 0.9634660151214913, |
| "grad_norm": 1.6805063486099243, |
| "learning_rate": 3.458448900407752e-08, |
| "loss": 0.3818, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.9640816483579907, |
| "grad_norm": 1.7517391443252563, |
| "learning_rate": 3.3423365492813994e-08, |
| "loss": 0.3799, |
| "step": 1566 |
| }, |
| { |
| "epoch": 0.9646972815944901, |
| "grad_norm": 1.4884332418441772, |
| "learning_rate": 3.2282002376756163e-08, |
| "loss": 0.3387, |
| "step": 1567 |
| }, |
| { |
| "epoch": 0.9653129148309895, |
| "grad_norm": 1.7127952575683594, |
| "learning_rate": 3.1160404197018155e-08, |
| "loss": 0.3873, |
| "step": 1568 |
| }, |
| { |
| "epoch": 0.9659285480674888, |
| "grad_norm": 1.7407442331314087, |
| "learning_rate": 3.005857541607371e-08, |
| "loss": 0.3916, |
| "step": 1569 |
| }, |
| { |
| "epoch": 0.9665441813039881, |
| "grad_norm": 1.687168002128601, |
| "learning_rate": 2.8976520417742794e-08, |
| "loss": 0.3894, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.9671598145404875, |
| "grad_norm": 1.6315783262252808, |
| "learning_rate": 2.7914243507169427e-08, |
| "loss": 0.3603, |
| "step": 1571 |
| }, |
| { |
| "epoch": 0.9677754477769869, |
| "grad_norm": 1.6698793172836304, |
| "learning_rate": 2.6871748910808903e-08, |
| "loss": 0.3667, |
| "step": 1572 |
| }, |
| { |
| "epoch": 0.9683910810134863, |
| "grad_norm": 1.6630492210388184, |
| "learning_rate": 2.584904077640893e-08, |
| "loss": 0.3811, |
| "step": 1573 |
| }, |
| { |
| "epoch": 0.9690067142499855, |
| "grad_norm": 1.758796215057373, |
| "learning_rate": 2.4846123172992953e-08, |
| "loss": 0.404, |
| "step": 1574 |
| }, |
| { |
| "epoch": 0.9696223474864849, |
| "grad_norm": 1.6306681632995605, |
| "learning_rate": 2.386300009084408e-08, |
| "loss": 0.3869, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.9702379807229843, |
| "grad_norm": 1.671326994895935, |
| "learning_rate": 2.2899675441490078e-08, |
| "loss": 0.3626, |
| "step": 1576 |
| }, |
| { |
| "epoch": 0.9708536139594837, |
| "grad_norm": 1.7411401271820068, |
| "learning_rate": 2.195615305768617e-08, |
| "loss": 0.3879, |
| "step": 1577 |
| }, |
| { |
| "epoch": 0.971469247195983, |
| "grad_norm": 1.7686033248901367, |
| "learning_rate": 2.103243669340227e-08, |
| "loss": 0.408, |
| "step": 1578 |
| }, |
| { |
| "epoch": 0.9720848804324823, |
| "grad_norm": 1.715114712715149, |
| "learning_rate": 2.012853002380466e-08, |
| "loss": 0.3809, |
| "step": 1579 |
| }, |
| { |
| "epoch": 0.9727005136689817, |
| "grad_norm": 1.6715350151062012, |
| "learning_rate": 1.9244436645246002e-08, |
| "loss": 0.3737, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.9733161469054811, |
| "grad_norm": 1.6584084033966064, |
| "learning_rate": 1.838016007524479e-08, |
| "loss": 0.3678, |
| "step": 1581 |
| }, |
| { |
| "epoch": 0.9739317801419805, |
| "grad_norm": 1.616811990737915, |
| "learning_rate": 1.753570375247815e-08, |
| "loss": 0.3659, |
| "step": 1582 |
| }, |
| { |
| "epoch": 0.9745474133784797, |
| "grad_norm": 1.5945422649383545, |
| "learning_rate": 1.6711071036763506e-08, |
| "loss": 0.3612, |
| "step": 1583 |
| }, |
| { |
| "epoch": 0.9751630466149791, |
| "grad_norm": 1.6576652526855469, |
| "learning_rate": 1.590626520904526e-08, |
| "loss": 0.3926, |
| "step": 1584 |
| }, |
| { |
| "epoch": 0.9757786798514785, |
| "grad_norm": 1.6075187921524048, |
| "learning_rate": 1.5121289471385915e-08, |
| "loss": 0.3596, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.9763943130879779, |
| "grad_norm": 1.673209547996521, |
| "learning_rate": 1.4356146946948313e-08, |
| "loss": 0.3785, |
| "step": 1586 |
| }, |
| { |
| "epoch": 0.9770099463244772, |
| "grad_norm": 1.7732353210449219, |
| "learning_rate": 1.3610840679985638e-08, |
| "loss": 0.3842, |
| "step": 1587 |
| }, |
| { |
| "epoch": 0.9776255795609765, |
| "grad_norm": 1.6880178451538086, |
| "learning_rate": 1.2885373635829756e-08, |
| "loss": 0.3914, |
| "step": 1588 |
| }, |
| { |
| "epoch": 0.9782412127974759, |
| "grad_norm": 1.6833375692367554, |
| "learning_rate": 1.2179748700879013e-08, |
| "loss": 0.3878, |
| "step": 1589 |
| }, |
| { |
| "epoch": 0.9788568460339753, |
| "grad_norm": 1.6276341676712036, |
| "learning_rate": 1.14939686825849e-08, |
| "loss": 0.3786, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.9794724792704746, |
| "grad_norm": 1.8428518772125244, |
| "learning_rate": 1.0828036309443735e-08, |
| "loss": 0.4015, |
| "step": 1591 |
| }, |
| { |
| "epoch": 0.9800881125069739, |
| "grad_norm": 1.7889384031295776, |
| "learning_rate": 1.0181954230983893e-08, |
| "loss": 0.3867, |
| "step": 1592 |
| }, |
| { |
| "epoch": 0.9807037457434733, |
| "grad_norm": 1.6265109777450562, |
| "learning_rate": 9.555725017756922e-09, |
| "loss": 0.372, |
| "step": 1593 |
| }, |
| { |
| "epoch": 0.9813193789799727, |
| "grad_norm": 1.63323974609375, |
| "learning_rate": 8.949351161324227e-09, |
| "loss": 0.3725, |
| "step": 1594 |
| }, |
| { |
| "epoch": 0.9819350122164721, |
| "grad_norm": 1.5772345066070557, |
| "learning_rate": 8.362835074251508e-09, |
| "loss": 0.3765, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.9825506454529714, |
| "grad_norm": 1.7093181610107422, |
| "learning_rate": 7.796179090094891e-09, |
| "loss": 0.3648, |
| "step": 1596 |
| }, |
| { |
| "epoch": 0.9831662786894707, |
| "grad_norm": 1.6425986289978027, |
| "learning_rate": 7.249385463395375e-09, |
| "loss": 0.3802, |
| "step": 1597 |
| }, |
| { |
| "epoch": 0.9837819119259701, |
| "grad_norm": 1.6807538270950317, |
| "learning_rate": 6.722456369666619e-09, |
| "loss": 0.3741, |
| "step": 1598 |
| }, |
| { |
| "epoch": 0.9843975451624695, |
| "grad_norm": 1.6659401655197144, |
| "learning_rate": 6.215393905388278e-09, |
| "loss": 0.3929, |
| "step": 1599 |
| }, |
| { |
| "epoch": 0.9850131783989688, |
| "grad_norm": 1.6375937461853027, |
| "learning_rate": 5.728200087997126e-09, |
| "loss": 0.3676, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9850131783989688, |
| "eval_loss": 0.37590205669403076, |
| "eval_runtime": 118.0408, |
| "eval_samples_per_second": 35.589, |
| "eval_steps_per_second": 4.456, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.9856288116354681, |
| "grad_norm": 1.6175541877746582, |
| "learning_rate": 5.2608768558798376e-09, |
| "loss": 0.3793, |
| "step": 1601 |
| }, |
| { |
| "epoch": 0.9862444448719675, |
| "grad_norm": 1.5827006101608276, |
| "learning_rate": 4.813426068362992e-09, |
| "loss": 0.3577, |
| "step": 1602 |
| }, |
| { |
| "epoch": 0.9868600781084669, |
| "grad_norm": 1.6540141105651855, |
| "learning_rate": 4.385849505708084e-09, |
| "loss": 0.3754, |
| "step": 1603 |
| }, |
| { |
| "epoch": 0.9874757113449663, |
| "grad_norm": 1.7661128044128418, |
| "learning_rate": 3.978148869103748e-09, |
| "loss": 0.4099, |
| "step": 1604 |
| }, |
| { |
| "epoch": 0.9880913445814656, |
| "grad_norm": 1.6497414112091064, |
| "learning_rate": 3.5903257806579884e-09, |
| "loss": 0.3911, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.9887069778179649, |
| "grad_norm": 1.6843533515930176, |
| "learning_rate": 3.2223817833931803e-09, |
| "loss": 0.36, |
| "step": 1606 |
| }, |
| { |
| "epoch": 0.9893226110544643, |
| "grad_norm": 1.7438652515411377, |
| "learning_rate": 2.8743183412388578e-09, |
| "loss": 0.385, |
| "step": 1607 |
| }, |
| { |
| "epoch": 0.9899382442909637, |
| "grad_norm": 1.6070297956466675, |
| "learning_rate": 2.5461368390261587e-09, |
| "loss": 0.3675, |
| "step": 1608 |
| }, |
| { |
| "epoch": 0.990553877527463, |
| "grad_norm": 1.569028377532959, |
| "learning_rate": 2.237838582483387e-09, |
| "loss": 0.3662, |
| "step": 1609 |
| }, |
| { |
| "epoch": 0.9911695107639624, |
| "grad_norm": 1.6607086658477783, |
| "learning_rate": 1.9494247982282386e-09, |
| "loss": 0.3952, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.9917851440004617, |
| "grad_norm": 1.6013164520263672, |
| "learning_rate": 1.6808966337661382e-09, |
| "loss": 0.3746, |
| "step": 1611 |
| }, |
| { |
| "epoch": 0.9924007772369611, |
| "grad_norm": 1.690483808517456, |
| "learning_rate": 1.4322551574830202e-09, |
| "loss": 0.3731, |
| "step": 1612 |
| }, |
| { |
| "epoch": 0.9930164104734605, |
| "grad_norm": 1.6636048555374146, |
| "learning_rate": 1.203501358642556e-09, |
| "loss": 0.375, |
| "step": 1613 |
| }, |
| { |
| "epoch": 0.9936320437099598, |
| "grad_norm": 1.710705041885376, |
| "learning_rate": 9.946361473822664e-10, |
| "loss": 0.3713, |
| "step": 1614 |
| }, |
| { |
| "epoch": 0.9942476769464591, |
| "grad_norm": 1.6652582883834839, |
| "learning_rate": 8.056603547090813e-10, |
| "loss": 0.4017, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.9948633101829585, |
| "grad_norm": 1.7423772811889648, |
| "learning_rate": 6.365747324954541e-10, |
| "loss": 0.3843, |
| "step": 1616 |
| }, |
| { |
| "epoch": 0.9954789434194579, |
| "grad_norm": 1.6425862312316895, |
| "learning_rate": 4.87379953478806e-10, |
| "loss": 0.3755, |
| "step": 1617 |
| }, |
| { |
| "epoch": 0.9960945766559572, |
| "grad_norm": 1.648992657661438, |
| "learning_rate": 3.580766112565304e-10, |
| "loss": 0.3831, |
| "step": 1618 |
| }, |
| { |
| "epoch": 0.9967102098924566, |
| "grad_norm": 1.7648104429244995, |
| "learning_rate": 2.486652202848827e-10, |
| "loss": 0.3979, |
| "step": 1619 |
| }, |
| { |
| "epoch": 0.9973258431289559, |
| "grad_norm": 1.735405445098877, |
| "learning_rate": 1.591462158756496e-10, |
| "loss": 0.3663, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.9979414763654553, |
| "grad_norm": 1.6370799541473389, |
| "learning_rate": 8.951995419614889e-11, |
| "loss": 0.3798, |
| "step": 1621 |
| }, |
| { |
| "epoch": 0.9985571096019547, |
| "grad_norm": 1.6457384824752808, |
| "learning_rate": 3.9786712267009256e-11, |
| "loss": 0.3724, |
| "step": 1622 |
| }, |
| { |
| "epoch": 0.999172742838454, |
| "grad_norm": 1.5603142976760864, |
| "learning_rate": 9.946687960504797e-12, |
| "loss": 0.3682, |
| "step": 1623 |
| }, |
| { |
| "epoch": 0.9997883760749533, |
| "grad_norm": 1.7614634037017822, |
| "learning_rate": 0.0, |
| "loss": 0.3983, |
| "step": 1624 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 1624, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 400, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.8093512752785e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |