diff --git "a/checkpoint-1788/trainer_state.json" "b/checkpoint-1788/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1788/trainer_state.json" @@ -0,0 +1,12549 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1788, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011185682326621924, + "grad_norm": 2.231250286102295, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.0507, + "step": 1 + }, + { + "epoch": 0.0022371364653243847, + "grad_norm": 2.1123249530792236, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.04, + "step": 2 + }, + { + "epoch": 0.003355704697986577, + "grad_norm": 2.0946707725524902, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0307, + "step": 3 + }, + { + "epoch": 0.0044742729306487695, + "grad_norm": 2.0837416648864746, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.0484, + "step": 4 + }, + { + "epoch": 0.005592841163310962, + "grad_norm": 1.9843275547027588, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.0012, + "step": 5 + }, + { + "epoch": 0.006711409395973154, + "grad_norm": 2.121988296508789, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.0603, + "step": 6 + }, + { + "epoch": 0.007829977628635347, + "grad_norm": 2.029029369354248, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.0323, + "step": 7 + }, + { + "epoch": 0.008948545861297539, + "grad_norm": 1.9815905094146729, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.0273, + "step": 8 + }, + { + "epoch": 0.010067114093959731, + "grad_norm": 2.3339314460754395, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.0805, + "step": 9 + }, + { + "epoch": 0.011185682326621925, + "grad_norm": 2.1078243255615234, + "learning_rate": 5.000000000000001e-07, + "loss": 1.0382, + "step": 10 + }, + { + "epoch": 0.012304250559284116, + "grad_norm": 1.8874777555465698, + "learning_rate": 5.5e-07, + "loss": 1.008, + "step": 11 + }, + { + "epoch": 0.013422818791946308, + "grad_norm": 1.9720211029052734, + "learning_rate": 6.000000000000001e-07, + "loss": 1.0065, + "step": 12 + }, + { + "epoch": 0.0145413870246085, + "grad_norm": 2.0002245903015137, + "learning_rate": 6.5e-07, + "loss": 1.0379, + "step": 13 + }, + { + "epoch": 0.015659955257270694, + "grad_norm": 1.983207106590271, + "learning_rate": 7.000000000000001e-07, + "loss": 1.0271, + "step": 14 + }, + { + "epoch": 0.016778523489932886, + "grad_norm": 1.886121153831482, + "learning_rate": 7.5e-07, + "loss": 1.0019, + "step": 15 + }, + { + "epoch": 0.017897091722595078, + "grad_norm": 1.9403958320617676, + "learning_rate": 8.000000000000001e-07, + "loss": 0.9885, + "step": 16 + }, + { + "epoch": 0.01901565995525727, + "grad_norm": 1.9739996194839478, + "learning_rate": 8.500000000000001e-07, + "loss": 0.9904, + "step": 17 + }, + { + "epoch": 0.020134228187919462, + "grad_norm": 1.7419469356536865, + "learning_rate": 9.000000000000001e-07, + "loss": 0.9709, + "step": 18 + }, + { + "epoch": 0.021252796420581657, + "grad_norm": 1.7856152057647705, + "learning_rate": 9.500000000000001e-07, + "loss": 0.9859, + "step": 19 + }, + { + "epoch": 0.02237136465324385, + "grad_norm": 1.6159933805465698, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9895, + "step": 20 + }, + { + "epoch": 0.02348993288590604, + "grad_norm": 1.7010679244995117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.0115, + "step": 21 + }, + { + "epoch": 0.024608501118568233, + "grad_norm": 1.7860039472579956, + "learning_rate": 1.1e-06, + "loss": 0.9917, + "step": 22 + }, + { + "epoch": 0.025727069351230425, + "grad_norm": 1.3735058307647705, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.9441, + "step": 23 + }, + { + "epoch": 0.026845637583892617, + "grad_norm": 1.439109206199646, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9304, + "step": 24 + }, + { + "epoch": 0.02796420581655481, + "grad_norm": 1.380369782447815, + "learning_rate": 1.25e-06, + "loss": 0.9469, + "step": 25 + }, + { + "epoch": 0.029082774049217, + "grad_norm": 1.2287472486495972, + "learning_rate": 1.3e-06, + "loss": 0.8808, + "step": 26 + }, + { + "epoch": 0.030201342281879196, + "grad_norm": 1.0899194478988647, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.8912, + "step": 27 + }, + { + "epoch": 0.03131991051454139, + "grad_norm": 1.0445002317428589, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.876, + "step": 28 + }, + { + "epoch": 0.03243847874720358, + "grad_norm": 1.0201383829116821, + "learning_rate": 1.45e-06, + "loss": 0.9003, + "step": 29 + }, + { + "epoch": 0.03355704697986577, + "grad_norm": 0.9528365731239319, + "learning_rate": 1.5e-06, + "loss": 0.8537, + "step": 30 + }, + { + "epoch": 0.03467561521252797, + "grad_norm": 0.9615768194198608, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.8819, + "step": 31 + }, + { + "epoch": 0.035794183445190156, + "grad_norm": 0.9578896760940552, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.8859, + "step": 32 + }, + { + "epoch": 0.03691275167785235, + "grad_norm": 0.977853536605835, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8835, + "step": 33 + }, + { + "epoch": 0.03803131991051454, + "grad_norm": 0.8976068496704102, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.8599, + "step": 34 + }, + { + "epoch": 0.039149888143176735, + "grad_norm": 0.8779590725898743, + "learning_rate": 1.75e-06, + "loss": 0.8708, + "step": 35 + }, + { + "epoch": 0.040268456375838924, + "grad_norm": 0.853705644607544, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8465, + "step": 36 + }, + { + "epoch": 0.04138702460850112, + "grad_norm": 0.8480839729309082, + "learning_rate": 1.85e-06, + "loss": 0.8292, + "step": 37 + }, + { + "epoch": 0.042505592841163314, + "grad_norm": 0.8372538089752197, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.8026, + "step": 38 + }, + { + "epoch": 0.0436241610738255, + "grad_norm": 0.8592961430549622, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.8153, + "step": 39 + }, + { + "epoch": 0.0447427293064877, + "grad_norm": 0.8222276568412781, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.814, + "step": 40 + }, + { + "epoch": 0.04586129753914989, + "grad_norm": 0.825672447681427, + "learning_rate": 2.05e-06, + "loss": 0.7793, + "step": 41 + }, + { + "epoch": 0.04697986577181208, + "grad_norm": 0.8016732335090637, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.771, + "step": 42 + }, + { + "epoch": 0.04809843400447427, + "grad_norm": 0.7026550769805908, + "learning_rate": 2.15e-06, + "loss": 0.7664, + "step": 43 + }, + { + "epoch": 0.049217002237136466, + "grad_norm": 0.6678670644760132, + "learning_rate": 2.2e-06, + "loss": 0.7774, + "step": 44 + }, + { + "epoch": 0.050335570469798654, + "grad_norm": 0.6766750812530518, + "learning_rate": 2.25e-06, + "loss": 0.7832, + "step": 45 + }, + { + "epoch": 0.05145413870246085, + "grad_norm": 0.7094117999076843, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.7861, + "step": 46 + }, + { + "epoch": 0.052572706935123045, + "grad_norm": 0.6871191263198853, + "learning_rate": 2.35e-06, + "loss": 0.7848, + "step": 47 + }, + { + "epoch": 0.053691275167785234, + "grad_norm": 0.6089867353439331, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7658, + "step": 48 + }, + { + "epoch": 0.05480984340044743, + "grad_norm": 0.5112010836601257, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.7921, + "step": 49 + }, + { + "epoch": 0.05592841163310962, + "grad_norm": 0.5008496046066284, + "learning_rate": 2.5e-06, + "loss": 0.7105, + "step": 50 + }, + { + "epoch": 0.05704697986577181, + "grad_norm": 0.5599631071090698, + "learning_rate": 2.55e-06, + "loss": 0.7526, + "step": 51 + }, + { + "epoch": 0.058165548098434, + "grad_norm": 0.6905913352966309, + "learning_rate": 2.6e-06, + "loss": 0.7496, + "step": 52 + }, + { + "epoch": 0.0592841163310962, + "grad_norm": 0.6198621392250061, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.7297, + "step": 53 + }, + { + "epoch": 0.06040268456375839, + "grad_norm": 0.6158658862113953, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7309, + "step": 54 + }, + { + "epoch": 0.06152125279642058, + "grad_norm": 0.5798735618591309, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.7102, + "step": 55 + }, + { + "epoch": 0.06263982102908278, + "grad_norm": 0.5550254583358765, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7488, + "step": 56 + }, + { + "epoch": 0.06375838926174497, + "grad_norm": 0.4888734221458435, + "learning_rate": 2.85e-06, + "loss": 0.75, + "step": 57 + }, + { + "epoch": 0.06487695749440715, + "grad_norm": 0.4579496383666992, + "learning_rate": 2.9e-06, + "loss": 0.7108, + "step": 58 + }, + { + "epoch": 0.06599552572706935, + "grad_norm": 0.5775673389434814, + "learning_rate": 2.95e-06, + "loss": 0.7337, + "step": 59 + }, + { + "epoch": 0.06711409395973154, + "grad_norm": 0.5035051703453064, + "learning_rate": 3e-06, + "loss": 0.7677, + "step": 60 + }, + { + "epoch": 0.06823266219239374, + "grad_norm": 0.4771614074707031, + "learning_rate": 3.05e-06, + "loss": 0.724, + "step": 61 + }, + { + "epoch": 0.06935123042505593, + "grad_norm": 0.45495525002479553, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7393, + "step": 62 + }, + { + "epoch": 0.07046979865771812, + "grad_norm": 0.36385607719421387, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.7029, + "step": 63 + }, + { + "epoch": 0.07158836689038031, + "grad_norm": 0.3554967939853668, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.6991, + "step": 64 + }, + { + "epoch": 0.07270693512304251, + "grad_norm": 0.36548176407814026, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7292, + "step": 65 + }, + { + "epoch": 0.0738255033557047, + "grad_norm": 0.35280168056488037, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7295, + "step": 66 + }, + { + "epoch": 0.07494407158836688, + "grad_norm": 0.3599022924900055, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.6956, + "step": 67 + }, + { + "epoch": 0.07606263982102908, + "grad_norm": 0.3802206516265869, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.6796, + "step": 68 + }, + { + "epoch": 0.07718120805369127, + "grad_norm": 0.3787902891635895, + "learning_rate": 3.45e-06, + "loss": 0.7141, + "step": 69 + }, + { + "epoch": 0.07829977628635347, + "grad_norm": 0.374461829662323, + "learning_rate": 3.5e-06, + "loss": 0.7043, + "step": 70 + }, + { + "epoch": 0.07941834451901567, + "grad_norm": 0.34469330310821533, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7037, + "step": 71 + }, + { + "epoch": 0.08053691275167785, + "grad_norm": 0.346836119890213, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7246, + "step": 72 + }, + { + "epoch": 0.08165548098434004, + "grad_norm": 0.34163376688957214, + "learning_rate": 3.65e-06, + "loss": 0.6977, + "step": 73 + }, + { + "epoch": 0.08277404921700224, + "grad_norm": 0.3481418788433075, + "learning_rate": 3.7e-06, + "loss": 0.7356, + "step": 74 + }, + { + "epoch": 0.08389261744966443, + "grad_norm": 0.3230934739112854, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.6869, + "step": 75 + }, + { + "epoch": 0.08501118568232663, + "grad_norm": 0.319917231798172, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.6722, + "step": 76 + }, + { + "epoch": 0.08612975391498881, + "grad_norm": 0.3535120487213135, + "learning_rate": 3.85e-06, + "loss": 0.6951, + "step": 77 + }, + { + "epoch": 0.087248322147651, + "grad_norm": 0.3229662775993347, + "learning_rate": 3.900000000000001e-06, + "loss": 0.69, + "step": 78 + }, + { + "epoch": 0.0883668903803132, + "grad_norm": 0.33365264534950256, + "learning_rate": 3.95e-06, + "loss": 0.701, + "step": 79 + }, + { + "epoch": 0.0894854586129754, + "grad_norm": 0.3302946984767914, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6733, + "step": 80 + }, + { + "epoch": 0.09060402684563758, + "grad_norm": 0.3478582799434662, + "learning_rate": 4.05e-06, + "loss": 0.7022, + "step": 81 + }, + { + "epoch": 0.09172259507829977, + "grad_norm": 0.33355170488357544, + "learning_rate": 4.1e-06, + "loss": 0.7141, + "step": 82 + }, + { + "epoch": 0.09284116331096197, + "grad_norm": 0.3217330574989319, + "learning_rate": 4.15e-06, + "loss": 0.6799, + "step": 83 + }, + { + "epoch": 0.09395973154362416, + "grad_norm": 0.328838050365448, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.6943, + "step": 84 + }, + { + "epoch": 0.09507829977628636, + "grad_norm": 0.3279136121273041, + "learning_rate": 4.25e-06, + "loss": 0.6699, + "step": 85 + }, + { + "epoch": 0.09619686800894854, + "grad_norm": 0.333351194858551, + "learning_rate": 4.3e-06, + "loss": 0.712, + "step": 86 + }, + { + "epoch": 0.09731543624161074, + "grad_norm": 0.33052128553390503, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7169, + "step": 87 + }, + { + "epoch": 0.09843400447427293, + "grad_norm": 0.31631597876548767, + "learning_rate": 4.4e-06, + "loss": 0.6772, + "step": 88 + }, + { + "epoch": 0.09955257270693513, + "grad_norm": 0.327311635017395, + "learning_rate": 4.450000000000001e-06, + "loss": 0.6873, + "step": 89 + }, + { + "epoch": 0.10067114093959731, + "grad_norm": 0.32048892974853516, + "learning_rate": 4.5e-06, + "loss": 0.6614, + "step": 90 + }, + { + "epoch": 0.1017897091722595, + "grad_norm": 0.32614201307296753, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7197, + "step": 91 + }, + { + "epoch": 0.1029082774049217, + "grad_norm": 0.31145355105400085, + "learning_rate": 4.600000000000001e-06, + "loss": 0.6567, + "step": 92 + }, + { + "epoch": 0.1040268456375839, + "grad_norm": 0.31379351019859314, + "learning_rate": 4.65e-06, + "loss": 0.7013, + "step": 93 + }, + { + "epoch": 0.10514541387024609, + "grad_norm": 0.32741424441337585, + "learning_rate": 4.7e-06, + "loss": 0.6737, + "step": 94 + }, + { + "epoch": 0.10626398210290827, + "grad_norm": 0.325630247592926, + "learning_rate": 4.75e-06, + "loss": 0.6673, + "step": 95 + }, + { + "epoch": 0.10738255033557047, + "grad_norm": 0.3153480291366577, + "learning_rate": 4.800000000000001e-06, + "loss": 0.6943, + "step": 96 + }, + { + "epoch": 0.10850111856823266, + "grad_norm": 0.3244793117046356, + "learning_rate": 4.85e-06, + "loss": 0.6896, + "step": 97 + }, + { + "epoch": 0.10961968680089486, + "grad_norm": 0.3078743517398834, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.6837, + "step": 98 + }, + { + "epoch": 0.11073825503355705, + "grad_norm": 0.3314874470233917, + "learning_rate": 4.95e-06, + "loss": 0.7416, + "step": 99 + }, + { + "epoch": 0.11185682326621924, + "grad_norm": 0.31931284070014954, + "learning_rate": 5e-06, + "loss": 0.6903, + "step": 100 + }, + { + "epoch": 0.11297539149888143, + "grad_norm": 0.3176276981830597, + "learning_rate": 4.999999554776598e-06, + "loss": 0.6761, + "step": 101 + }, + { + "epoch": 0.11409395973154363, + "grad_norm": 0.3285365700721741, + "learning_rate": 4.999998219106549e-06, + "loss": 0.6892, + "step": 102 + }, + { + "epoch": 0.11521252796420582, + "grad_norm": 0.31144094467163086, + "learning_rate": 4.99999599299033e-06, + "loss": 0.6586, + "step": 103 + }, + { + "epoch": 0.116331096196868, + "grad_norm": 0.313289076089859, + "learning_rate": 4.999992876428732e-06, + "loss": 0.708, + "step": 104 + }, + { + "epoch": 0.1174496644295302, + "grad_norm": 0.3252837061882019, + "learning_rate": 4.999988869422867e-06, + "loss": 0.7083, + "step": 105 + }, + { + "epoch": 0.1185682326621924, + "grad_norm": 0.3168275058269501, + "learning_rate": 4.9999839719741615e-06, + "loss": 0.6806, + "step": 106 + }, + { + "epoch": 0.11968680089485459, + "grad_norm": 0.31589415669441223, + "learning_rate": 4.9999781840843594e-06, + "loss": 0.6702, + "step": 107 + }, + { + "epoch": 0.12080536912751678, + "grad_norm": 0.318037748336792, + "learning_rate": 4.999971505755523e-06, + "loss": 0.6601, + "step": 108 + }, + { + "epoch": 0.12192393736017897, + "grad_norm": 0.33259475231170654, + "learning_rate": 4.999963936990031e-06, + "loss": 0.7001, + "step": 109 + }, + { + "epoch": 0.12304250559284116, + "grad_norm": 0.33322346210479736, + "learning_rate": 4.999955477790579e-06, + "loss": 0.6731, + "step": 110 + }, + { + "epoch": 0.12416107382550336, + "grad_norm": 0.31344881653785706, + "learning_rate": 4.999946128160179e-06, + "loss": 0.6667, + "step": 111 + }, + { + "epoch": 0.12527964205816555, + "grad_norm": 0.32769575715065, + "learning_rate": 4.999935888102162e-06, + "loss": 0.7123, + "step": 112 + }, + { + "epoch": 0.12639821029082773, + "grad_norm": 0.314619243144989, + "learning_rate": 4.9999247576201765e-06, + "loss": 0.683, + "step": 113 + }, + { + "epoch": 0.12751677852348994, + "grad_norm": 0.3301268219947815, + "learning_rate": 4.999912736718185e-06, + "loss": 0.6761, + "step": 114 + }, + { + "epoch": 0.12863534675615212, + "grad_norm": 0.31477460265159607, + "learning_rate": 4.99989982540047e-06, + "loss": 0.6722, + "step": 115 + }, + { + "epoch": 0.1297539149888143, + "grad_norm": 0.31430870294570923, + "learning_rate": 4.999886023671629e-06, + "loss": 0.6693, + "step": 116 + }, + { + "epoch": 0.13087248322147652, + "grad_norm": 0.31705909967422485, + "learning_rate": 4.999871331536581e-06, + "loss": 0.6567, + "step": 117 + }, + { + "epoch": 0.1319910514541387, + "grad_norm": 0.3331652879714966, + "learning_rate": 4.999855749000555e-06, + "loss": 0.6895, + "step": 118 + }, + { + "epoch": 0.1331096196868009, + "grad_norm": 0.32147714495658875, + "learning_rate": 4.999839276069105e-06, + "loss": 0.6693, + "step": 119 + }, + { + "epoch": 0.1342281879194631, + "grad_norm": 0.3312559127807617, + "learning_rate": 4.999821912748095e-06, + "loss": 0.6843, + "step": 120 + }, + { + "epoch": 0.13534675615212527, + "grad_norm": 0.34178397059440613, + "learning_rate": 4.999803659043712e-06, + "loss": 0.6774, + "step": 121 + }, + { + "epoch": 0.13646532438478748, + "grad_norm": 0.3154846727848053, + "learning_rate": 4.999784514962456e-06, + "loss": 0.6638, + "step": 122 + }, + { + "epoch": 0.13758389261744966, + "grad_norm": 0.31137940287590027, + "learning_rate": 4.999764480511145e-06, + "loss": 0.6467, + "step": 123 + }, + { + "epoch": 0.13870246085011187, + "grad_norm": 0.3188192546367645, + "learning_rate": 4.999743555696918e-06, + "loss": 0.6511, + "step": 124 + }, + { + "epoch": 0.13982102908277405, + "grad_norm": 0.30495911836624146, + "learning_rate": 4.999721740527225e-06, + "loss": 0.6637, + "step": 125 + }, + { + "epoch": 0.14093959731543623, + "grad_norm": 0.3152139186859131, + "learning_rate": 4.999699035009837e-06, + "loss": 0.6631, + "step": 126 + }, + { + "epoch": 0.14205816554809844, + "grad_norm": 0.32285481691360474, + "learning_rate": 4.999675439152842e-06, + "loss": 0.6621, + "step": 127 + }, + { + "epoch": 0.14317673378076062, + "grad_norm": 0.3176666796207428, + "learning_rate": 4.999650952964643e-06, + "loss": 0.6654, + "step": 128 + }, + { + "epoch": 0.14429530201342283, + "grad_norm": 0.314035028219223, + "learning_rate": 4.999625576453962e-06, + "loss": 0.6927, + "step": 129 + }, + { + "epoch": 0.14541387024608501, + "grad_norm": 0.3227815628051758, + "learning_rate": 4.999599309629839e-06, + "loss": 0.6865, + "step": 130 + }, + { + "epoch": 0.1465324384787472, + "grad_norm": 0.3137218952178955, + "learning_rate": 4.9995721525016275e-06, + "loss": 0.6499, + "step": 131 + }, + { + "epoch": 0.1476510067114094, + "grad_norm": 0.32401353120803833, + "learning_rate": 4.999544105079001e-06, + "loss": 0.64, + "step": 132 + }, + { + "epoch": 0.1487695749440716, + "grad_norm": 0.3110584020614624, + "learning_rate": 4.99951516737195e-06, + "loss": 0.6747, + "step": 133 + }, + { + "epoch": 0.14988814317673377, + "grad_norm": 0.3246876895427704, + "learning_rate": 4.999485339390781e-06, + "loss": 0.6943, + "step": 134 + }, + { + "epoch": 0.15100671140939598, + "grad_norm": 0.3346574008464813, + "learning_rate": 4.999454621146117e-06, + "loss": 0.6675, + "step": 135 + }, + { + "epoch": 0.15212527964205816, + "grad_norm": 0.3305971920490265, + "learning_rate": 4.999423012648902e-06, + "loss": 0.7065, + "step": 136 + }, + { + "epoch": 0.15324384787472037, + "grad_norm": 0.31732234358787537, + "learning_rate": 4.9993905139103924e-06, + "loss": 0.7038, + "step": 137 + }, + { + "epoch": 0.15436241610738255, + "grad_norm": 0.3233291208744049, + "learning_rate": 4.999357124942163e-06, + "loss": 0.6856, + "step": 138 + }, + { + "epoch": 0.15548098434004473, + "grad_norm": 0.31733304262161255, + "learning_rate": 4.999322845756107e-06, + "loss": 0.702, + "step": 139 + }, + { + "epoch": 0.15659955257270694, + "grad_norm": 0.33124351501464844, + "learning_rate": 4.9992876763644346e-06, + "loss": 0.6616, + "step": 140 + }, + { + "epoch": 0.15771812080536912, + "grad_norm": 0.3264501094818115, + "learning_rate": 4.999251616779671e-06, + "loss": 0.6773, + "step": 141 + }, + { + "epoch": 0.15883668903803133, + "grad_norm": 0.34606418013572693, + "learning_rate": 4.999214667014662e-06, + "loss": 0.6765, + "step": 142 + }, + { + "epoch": 0.1599552572706935, + "grad_norm": 0.3292436897754669, + "learning_rate": 4.999176827082566e-06, + "loss": 0.6692, + "step": 143 + }, + { + "epoch": 0.1610738255033557, + "grad_norm": 0.31322377920150757, + "learning_rate": 4.9991380969968615e-06, + "loss": 0.6811, + "step": 144 + }, + { + "epoch": 0.1621923937360179, + "grad_norm": 0.32053160667419434, + "learning_rate": 4.999098476771344e-06, + "loss": 0.6544, + "step": 145 + }, + { + "epoch": 0.16331096196868009, + "grad_norm": 0.34363314509391785, + "learning_rate": 4.9990579664201244e-06, + "loss": 0.6839, + "step": 146 + }, + { + "epoch": 0.1644295302013423, + "grad_norm": 0.3260481357574463, + "learning_rate": 4.999016565957633e-06, + "loss": 0.7048, + "step": 147 + }, + { + "epoch": 0.16554809843400448, + "grad_norm": 0.3410928547382355, + "learning_rate": 4.998974275398614e-06, + "loss": 0.6846, + "step": 148 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.33661726117134094, + "learning_rate": 4.998931094758132e-06, + "loss": 0.6468, + "step": 149 + }, + { + "epoch": 0.16778523489932887, + "grad_norm": 0.32493966817855835, + "learning_rate": 4.998887024051565e-06, + "loss": 0.6741, + "step": 150 + }, + { + "epoch": 0.16890380313199105, + "grad_norm": 0.3396671414375305, + "learning_rate": 4.998842063294613e-06, + "loss": 0.6703, + "step": 151 + }, + { + "epoch": 0.17002237136465326, + "grad_norm": 0.32851850986480713, + "learning_rate": 4.998796212503287e-06, + "loss": 0.6589, + "step": 152 + }, + { + "epoch": 0.17114093959731544, + "grad_norm": 0.33433711528778076, + "learning_rate": 4.99874947169392e-06, + "loss": 0.6594, + "step": 153 + }, + { + "epoch": 0.17225950782997762, + "grad_norm": 0.3388006091117859, + "learning_rate": 4.99870184088316e-06, + "loss": 0.6604, + "step": 154 + }, + { + "epoch": 0.17337807606263983, + "grad_norm": 0.31802693009376526, + "learning_rate": 4.998653320087971e-06, + "loss": 0.6467, + "step": 155 + }, + { + "epoch": 0.174496644295302, + "grad_norm": 0.33016613125801086, + "learning_rate": 4.998603909325636e-06, + "loss": 0.6599, + "step": 156 + }, + { + "epoch": 0.1756152125279642, + "grad_norm": 0.32546237111091614, + "learning_rate": 4.998553608613755e-06, + "loss": 0.6519, + "step": 157 + }, + { + "epoch": 0.1767337807606264, + "grad_norm": 0.3362942337989807, + "learning_rate": 4.998502417970242e-06, + "loss": 0.671, + "step": 158 + }, + { + "epoch": 0.17785234899328858, + "grad_norm": 0.33070167899131775, + "learning_rate": 4.998450337413331e-06, + "loss": 0.6624, + "step": 159 + }, + { + "epoch": 0.1789709172259508, + "grad_norm": 0.32430973649024963, + "learning_rate": 4.998397366961571e-06, + "loss": 0.6263, + "step": 160 + }, + { + "epoch": 0.18008948545861297, + "grad_norm": 0.32481464743614197, + "learning_rate": 4.998343506633831e-06, + "loss": 0.6683, + "step": 161 + }, + { + "epoch": 0.18120805369127516, + "grad_norm": 0.33035483956336975, + "learning_rate": 4.998288756449292e-06, + "loss": 0.6816, + "step": 162 + }, + { + "epoch": 0.18232662192393737, + "grad_norm": 0.33188679814338684, + "learning_rate": 4.998233116427458e-06, + "loss": 0.6693, + "step": 163 + }, + { + "epoch": 0.18344519015659955, + "grad_norm": 0.33667680621147156, + "learning_rate": 4.998176586588145e-06, + "loss": 0.6619, + "step": 164 + }, + { + "epoch": 0.18456375838926176, + "grad_norm": 0.33836281299591064, + "learning_rate": 4.998119166951488e-06, + "loss": 0.6697, + "step": 165 + }, + { + "epoch": 0.18568232662192394, + "grad_norm": 0.31710004806518555, + "learning_rate": 4.998060857537938e-06, + "loss": 0.6386, + "step": 166 + }, + { + "epoch": 0.18680089485458612, + "grad_norm": 0.3220674395561218, + "learning_rate": 4.9980016583682655e-06, + "loss": 0.6477, + "step": 167 + }, + { + "epoch": 0.18791946308724833, + "grad_norm": 0.31624945998191833, + "learning_rate": 4.997941569463554e-06, + "loss": 0.6771, + "step": 168 + }, + { + "epoch": 0.1890380313199105, + "grad_norm": 0.33520039916038513, + "learning_rate": 4.997880590845208e-06, + "loss": 0.6777, + "step": 169 + }, + { + "epoch": 0.19015659955257272, + "grad_norm": 0.33738774061203003, + "learning_rate": 4.997818722534944e-06, + "loss": 0.6603, + "step": 170 + }, + { + "epoch": 0.1912751677852349, + "grad_norm": 0.33408045768737793, + "learning_rate": 4.9977559645548e-06, + "loss": 0.6581, + "step": 171 + }, + { + "epoch": 0.19239373601789708, + "grad_norm": 0.3269501328468323, + "learning_rate": 4.997692316927129e-06, + "loss": 0.6623, + "step": 172 + }, + { + "epoch": 0.1935123042505593, + "grad_norm": 0.33073386549949646, + "learning_rate": 4.997627779674601e-06, + "loss": 0.6465, + "step": 173 + }, + { + "epoch": 0.19463087248322147, + "grad_norm": 0.33027294278144836, + "learning_rate": 4.997562352820201e-06, + "loss": 0.6795, + "step": 174 + }, + { + "epoch": 0.19574944071588368, + "grad_norm": 0.3329165577888489, + "learning_rate": 4.997496036387235e-06, + "loss": 0.6717, + "step": 175 + }, + { + "epoch": 0.19686800894854586, + "grad_norm": 0.3321872353553772, + "learning_rate": 4.997428830399322e-06, + "loss": 0.6415, + "step": 176 + }, + { + "epoch": 0.19798657718120805, + "grad_norm": 0.3222750723361969, + "learning_rate": 4.997360734880401e-06, + "loss": 0.657, + "step": 177 + }, + { + "epoch": 0.19910514541387025, + "grad_norm": 0.33294835686683655, + "learning_rate": 4.997291749854725e-06, + "loss": 0.6931, + "step": 178 + }, + { + "epoch": 0.20022371364653244, + "grad_norm": 0.3413322865962982, + "learning_rate": 4.997221875346863e-06, + "loss": 0.6761, + "step": 179 + }, + { + "epoch": 0.20134228187919462, + "grad_norm": 0.3300095796585083, + "learning_rate": 4.997151111381707e-06, + "loss": 0.6626, + "step": 180 + }, + { + "epoch": 0.20246085011185683, + "grad_norm": 0.337289035320282, + "learning_rate": 4.997079457984459e-06, + "loss": 0.6861, + "step": 181 + }, + { + "epoch": 0.203579418344519, + "grad_norm": 0.3266119658946991, + "learning_rate": 4.997006915180642e-06, + "loss": 0.6687, + "step": 182 + }, + { + "epoch": 0.20469798657718122, + "grad_norm": 0.33044853806495667, + "learning_rate": 4.996933482996092e-06, + "loss": 0.6637, + "step": 183 + }, + { + "epoch": 0.2058165548098434, + "grad_norm": 0.33716171979904175, + "learning_rate": 4.996859161456965e-06, + "loss": 0.6644, + "step": 184 + }, + { + "epoch": 0.20693512304250558, + "grad_norm": 0.32554203271865845, + "learning_rate": 4.996783950589733e-06, + "loss": 0.6524, + "step": 185 + }, + { + "epoch": 0.2080536912751678, + "grad_norm": 0.3271404504776001, + "learning_rate": 4.996707850421184e-06, + "loss": 0.6581, + "step": 186 + }, + { + "epoch": 0.20917225950782997, + "grad_norm": 0.34464138746261597, + "learning_rate": 4.996630860978424e-06, + "loss": 0.6768, + "step": 187 + }, + { + "epoch": 0.21029082774049218, + "grad_norm": 0.3408767282962799, + "learning_rate": 4.996552982288875e-06, + "loss": 0.6556, + "step": 188 + }, + { + "epoch": 0.21140939597315436, + "grad_norm": 0.3375307023525238, + "learning_rate": 4.996474214380276e-06, + "loss": 0.6819, + "step": 189 + }, + { + "epoch": 0.21252796420581654, + "grad_norm": 0.3313542902469635, + "learning_rate": 4.99639455728068e-06, + "loss": 0.6483, + "step": 190 + }, + { + "epoch": 0.21364653243847875, + "grad_norm": 0.3327822685241699, + "learning_rate": 4.996314011018462e-06, + "loss": 0.6669, + "step": 191 + }, + { + "epoch": 0.21476510067114093, + "grad_norm": 0.33021339774131775, + "learning_rate": 4.99623257562231e-06, + "loss": 0.6734, + "step": 192 + }, + { + "epoch": 0.21588366890380314, + "grad_norm": 0.32687169313430786, + "learning_rate": 4.996150251121229e-06, + "loss": 0.6387, + "step": 193 + }, + { + "epoch": 0.21700223713646533, + "grad_norm": 0.3394392728805542, + "learning_rate": 4.996067037544542e-06, + "loss": 0.6623, + "step": 194 + }, + { + "epoch": 0.2181208053691275, + "grad_norm": 0.33284223079681396, + "learning_rate": 4.995982934921887e-06, + "loss": 0.6405, + "step": 195 + }, + { + "epoch": 0.21923937360178972, + "grad_norm": 0.344235360622406, + "learning_rate": 4.995897943283221e-06, + "loss": 0.6741, + "step": 196 + }, + { + "epoch": 0.2203579418344519, + "grad_norm": 0.33437255024909973, + "learning_rate": 4.995812062658815e-06, + "loss": 0.6718, + "step": 197 + }, + { + "epoch": 0.2214765100671141, + "grad_norm": 0.3216111361980438, + "learning_rate": 4.995725293079257e-06, + "loss": 0.6709, + "step": 198 + }, + { + "epoch": 0.2225950782997763, + "grad_norm": 0.3448997139930725, + "learning_rate": 4.9956376345754556e-06, + "loss": 0.6458, + "step": 199 + }, + { + "epoch": 0.22371364653243847, + "grad_norm": 0.34354478120803833, + "learning_rate": 4.99554908717863e-06, + "loss": 0.6615, + "step": 200 + }, + { + "epoch": 0.22483221476510068, + "grad_norm": 0.3417740762233734, + "learning_rate": 4.99545965092032e-06, + "loss": 0.6847, + "step": 201 + }, + { + "epoch": 0.22595078299776286, + "grad_norm": 0.3366676867008209, + "learning_rate": 4.99536932583238e-06, + "loss": 0.649, + "step": 202 + }, + { + "epoch": 0.22706935123042504, + "grad_norm": 0.3610089421272278, + "learning_rate": 4.995278111946983e-06, + "loss": 0.6616, + "step": 203 + }, + { + "epoch": 0.22818791946308725, + "grad_norm": 0.3359774053096771, + "learning_rate": 4.995186009296618e-06, + "loss": 0.6519, + "step": 204 + }, + { + "epoch": 0.22930648769574943, + "grad_norm": 0.34075963497161865, + "learning_rate": 4.9950930179140885e-06, + "loss": 0.6762, + "step": 205 + }, + { + "epoch": 0.23042505592841164, + "grad_norm": 0.32416507601737976, + "learning_rate": 4.994999137832517e-06, + "loss": 0.6499, + "step": 206 + }, + { + "epoch": 0.23154362416107382, + "grad_norm": 0.32749176025390625, + "learning_rate": 4.99490436908534e-06, + "loss": 0.645, + "step": 207 + }, + { + "epoch": 0.232662192393736, + "grad_norm": 0.3349708318710327, + "learning_rate": 4.994808711706314e-06, + "loss": 0.6676, + "step": 208 + }, + { + "epoch": 0.23378076062639822, + "grad_norm": 0.3491227328777313, + "learning_rate": 4.9947121657295094e-06, + "loss": 0.6287, + "step": 209 + }, + { + "epoch": 0.2348993288590604, + "grad_norm": 0.3400874733924866, + "learning_rate": 4.994614731189314e-06, + "loss": 0.6473, + "step": 210 + }, + { + "epoch": 0.2360178970917226, + "grad_norm": 0.3388952612876892, + "learning_rate": 4.994516408120432e-06, + "loss": 0.6821, + "step": 211 + }, + { + "epoch": 0.2371364653243848, + "grad_norm": 0.33224812150001526, + "learning_rate": 4.994417196557884e-06, + "loss": 0.649, + "step": 212 + }, + { + "epoch": 0.23825503355704697, + "grad_norm": 0.3307199478149414, + "learning_rate": 4.994317096537006e-06, + "loss": 0.6581, + "step": 213 + }, + { + "epoch": 0.23937360178970918, + "grad_norm": 0.3505164682865143, + "learning_rate": 4.994216108093452e-06, + "loss": 0.6498, + "step": 214 + }, + { + "epoch": 0.24049217002237136, + "grad_norm": 0.3284938335418701, + "learning_rate": 4.994114231263193e-06, + "loss": 0.6503, + "step": 215 + }, + { + "epoch": 0.24161073825503357, + "grad_norm": 0.3475011885166168, + "learning_rate": 4.994011466082514e-06, + "loss": 0.6724, + "step": 216 + }, + { + "epoch": 0.24272930648769575, + "grad_norm": 0.34667858481407166, + "learning_rate": 4.993907812588019e-06, + "loss": 0.6373, + "step": 217 + }, + { + "epoch": 0.24384787472035793, + "grad_norm": 0.34276899695396423, + "learning_rate": 4.993803270816627e-06, + "loss": 0.6513, + "step": 218 + }, + { + "epoch": 0.24496644295302014, + "grad_norm": 0.34308409690856934, + "learning_rate": 4.993697840805572e-06, + "loss": 0.6596, + "step": 219 + }, + { + "epoch": 0.24608501118568232, + "grad_norm": 0.33108261227607727, + "learning_rate": 4.9935915225924075e-06, + "loss": 0.6623, + "step": 220 + }, + { + "epoch": 0.24720357941834453, + "grad_norm": 0.3529888093471527, + "learning_rate": 4.9934843162150015e-06, + "loss": 0.658, + "step": 221 + }, + { + "epoch": 0.2483221476510067, + "grad_norm": 0.3457166850566864, + "learning_rate": 4.993376221711538e-06, + "loss": 0.6342, + "step": 222 + }, + { + "epoch": 0.2494407158836689, + "grad_norm": 0.35108813643455505, + "learning_rate": 4.993267239120519e-06, + "loss": 0.6325, + "step": 223 + }, + { + "epoch": 0.2505592841163311, + "grad_norm": 0.3448682129383087, + "learning_rate": 4.993157368480761e-06, + "loss": 0.6746, + "step": 224 + }, + { + "epoch": 0.2516778523489933, + "grad_norm": 0.34094589948654175, + "learning_rate": 4.993046609831397e-06, + "loss": 0.6313, + "step": 225 + }, + { + "epoch": 0.25279642058165547, + "grad_norm": 0.33934327960014343, + "learning_rate": 4.9929349632118785e-06, + "loss": 0.6371, + "step": 226 + }, + { + "epoch": 0.2539149888143177, + "grad_norm": 0.3517382740974426, + "learning_rate": 4.99282242866197e-06, + "loss": 0.6411, + "step": 227 + }, + { + "epoch": 0.2550335570469799, + "grad_norm": 0.34098172187805176, + "learning_rate": 4.992709006221755e-06, + "loss": 0.6648, + "step": 228 + }, + { + "epoch": 0.25615212527964204, + "grad_norm": 0.3397183120250702, + "learning_rate": 4.992594695931632e-06, + "loss": 0.6038, + "step": 229 + }, + { + "epoch": 0.25727069351230425, + "grad_norm": 0.35994404554367065, + "learning_rate": 4.992479497832316e-06, + "loss": 0.6832, + "step": 230 + }, + { + "epoch": 0.25838926174496646, + "grad_norm": 0.3505656123161316, + "learning_rate": 4.992363411964838e-06, + "loss": 0.682, + "step": 231 + }, + { + "epoch": 0.2595078299776286, + "grad_norm": 0.343159556388855, + "learning_rate": 4.992246438370545e-06, + "loss": 0.6597, + "step": 232 + }, + { + "epoch": 0.2606263982102908, + "grad_norm": 0.36491280794143677, + "learning_rate": 4.9921285770911e-06, + "loss": 0.6422, + "step": 233 + }, + { + "epoch": 0.26174496644295303, + "grad_norm": 0.3656606078147888, + "learning_rate": 4.992009828168484e-06, + "loss": 0.6988, + "step": 234 + }, + { + "epoch": 0.26286353467561524, + "grad_norm": 0.3348519206047058, + "learning_rate": 4.991890191644993e-06, + "loss": 0.6281, + "step": 235 + }, + { + "epoch": 0.2639821029082774, + "grad_norm": 0.3407367467880249, + "learning_rate": 4.991769667563237e-06, + "loss": 0.6487, + "step": 236 + }, + { + "epoch": 0.2651006711409396, + "grad_norm": 0.3644556999206543, + "learning_rate": 4.991648255966145e-06, + "loss": 0.6443, + "step": 237 + }, + { + "epoch": 0.2662192393736018, + "grad_norm": 0.3346659243106842, + "learning_rate": 4.991525956896962e-06, + "loss": 0.632, + "step": 238 + }, + { + "epoch": 0.26733780760626397, + "grad_norm": 0.36535120010375977, + "learning_rate": 4.991402770399249e-06, + "loss": 0.6347, + "step": 239 + }, + { + "epoch": 0.2684563758389262, + "grad_norm": 0.34253549575805664, + "learning_rate": 4.991278696516879e-06, + "loss": 0.6946, + "step": 240 + }, + { + "epoch": 0.2695749440715884, + "grad_norm": 0.35125476121902466, + "learning_rate": 4.9911537352940485e-06, + "loss": 0.6669, + "step": 241 + }, + { + "epoch": 0.27069351230425054, + "grad_norm": 0.35836276412010193, + "learning_rate": 4.991027886775264e-06, + "loss": 0.6534, + "step": 242 + }, + { + "epoch": 0.27181208053691275, + "grad_norm": 0.34344252943992615, + "learning_rate": 4.990901151005349e-06, + "loss": 0.6595, + "step": 243 + }, + { + "epoch": 0.27293064876957496, + "grad_norm": 0.35524797439575195, + "learning_rate": 4.9907735280294465e-06, + "loss": 0.6612, + "step": 244 + }, + { + "epoch": 0.2740492170022371, + "grad_norm": 0.3483973741531372, + "learning_rate": 4.990645017893013e-06, + "loss": 0.6694, + "step": 245 + }, + { + "epoch": 0.2751677852348993, + "grad_norm": 0.34605199098587036, + "learning_rate": 4.990515620641819e-06, + "loss": 0.6453, + "step": 246 + }, + { + "epoch": 0.27628635346756153, + "grad_norm": 0.3441944122314453, + "learning_rate": 4.990385336321954e-06, + "loss": 0.6356, + "step": 247 + }, + { + "epoch": 0.27740492170022374, + "grad_norm": 0.36233291029930115, + "learning_rate": 4.990254164979823e-06, + "loss": 0.673, + "step": 248 + }, + { + "epoch": 0.2785234899328859, + "grad_norm": 0.3624320328235626, + "learning_rate": 4.990122106662145e-06, + "loss": 0.6459, + "step": 249 + }, + { + "epoch": 0.2796420581655481, + "grad_norm": 0.3635922372341156, + "learning_rate": 4.989989161415959e-06, + "loss": 0.6552, + "step": 250 + }, + { + "epoch": 0.2807606263982103, + "grad_norm": 0.3370678424835205, + "learning_rate": 4.989855329288615e-06, + "loss": 0.6098, + "step": 251 + }, + { + "epoch": 0.28187919463087246, + "grad_norm": 0.35488393902778625, + "learning_rate": 4.989720610327782e-06, + "loss": 0.6554, + "step": 252 + }, + { + "epoch": 0.2829977628635347, + "grad_norm": 0.3495752513408661, + "learning_rate": 4.989585004581444e-06, + "loss": 0.6339, + "step": 253 + }, + { + "epoch": 0.2841163310961969, + "grad_norm": 0.3451905846595764, + "learning_rate": 4.989448512097901e-06, + "loss": 0.6954, + "step": 254 + }, + { + "epoch": 0.28523489932885904, + "grad_norm": 0.3532280921936035, + "learning_rate": 4.989311132925768e-06, + "loss": 0.6198, + "step": 255 + }, + { + "epoch": 0.28635346756152125, + "grad_norm": 0.3570882976055145, + "learning_rate": 4.989172867113976e-06, + "loss": 0.6492, + "step": 256 + }, + { + "epoch": 0.28747203579418346, + "grad_norm": 0.33201339840888977, + "learning_rate": 4.9890337147117755e-06, + "loss": 0.6324, + "step": 257 + }, + { + "epoch": 0.28859060402684567, + "grad_norm": 0.3354071080684662, + "learning_rate": 4.988893675768726e-06, + "loss": 0.628, + "step": 258 + }, + { + "epoch": 0.2897091722595078, + "grad_norm": 0.341133713722229, + "learning_rate": 4.988752750334708e-06, + "loss": 0.6316, + "step": 259 + }, + { + "epoch": 0.29082774049217003, + "grad_norm": 0.33613571524620056, + "learning_rate": 4.9886109384599165e-06, + "loss": 0.6401, + "step": 260 + }, + { + "epoch": 0.29194630872483224, + "grad_norm": 0.3657302260398865, + "learning_rate": 4.988468240194861e-06, + "loss": 0.6743, + "step": 261 + }, + { + "epoch": 0.2930648769574944, + "grad_norm": 0.3349529504776001, + "learning_rate": 4.988324655590369e-06, + "loss": 0.6121, + "step": 262 + }, + { + "epoch": 0.2941834451901566, + "grad_norm": 0.34364601969718933, + "learning_rate": 4.98818018469758e-06, + "loss": 0.6427, + "step": 263 + }, + { + "epoch": 0.2953020134228188, + "grad_norm": 0.35895341634750366, + "learning_rate": 4.988034827567953e-06, + "loss": 0.6913, + "step": 264 + }, + { + "epoch": 0.29642058165548096, + "grad_norm": 0.3571792244911194, + "learning_rate": 4.987888584253262e-06, + "loss": 0.6286, + "step": 265 + }, + { + "epoch": 0.2975391498881432, + "grad_norm": 0.3684253394603729, + "learning_rate": 4.987741454805594e-06, + "loss": 0.6365, + "step": 266 + }, + { + "epoch": 0.2986577181208054, + "grad_norm": 0.3521265983581543, + "learning_rate": 4.987593439277353e-06, + "loss": 0.6172, + "step": 267 + }, + { + "epoch": 0.29977628635346754, + "grad_norm": 0.34943872690200806, + "learning_rate": 4.98744453772126e-06, + "loss": 0.6382, + "step": 268 + }, + { + "epoch": 0.30089485458612975, + "grad_norm": 0.35075268149375916, + "learning_rate": 4.9872947501903515e-06, + "loss": 0.6497, + "step": 269 + }, + { + "epoch": 0.30201342281879195, + "grad_norm": 0.3558429479598999, + "learning_rate": 4.987144076737978e-06, + "loss": 0.6561, + "step": 270 + }, + { + "epoch": 0.30313199105145416, + "grad_norm": 0.35898861289024353, + "learning_rate": 4.986992517417805e-06, + "loss": 0.6613, + "step": 271 + }, + { + "epoch": 0.3042505592841163, + "grad_norm": 0.35135966539382935, + "learning_rate": 4.986840072283815e-06, + "loss": 0.6507, + "step": 272 + }, + { + "epoch": 0.3053691275167785, + "grad_norm": 0.3530326783657074, + "learning_rate": 4.986686741390308e-06, + "loss": 0.6459, + "step": 273 + }, + { + "epoch": 0.30648769574944074, + "grad_norm": 0.34631597995758057, + "learning_rate": 4.986532524791894e-06, + "loss": 0.6074, + "step": 274 + }, + { + "epoch": 0.3076062639821029, + "grad_norm": 0.37036123871803284, + "learning_rate": 4.986377422543503e-06, + "loss": 0.6416, + "step": 275 + }, + { + "epoch": 0.3087248322147651, + "grad_norm": 0.3576701283454895, + "learning_rate": 4.98622143470038e-06, + "loss": 0.5939, + "step": 276 + }, + { + "epoch": 0.3098434004474273, + "grad_norm": 0.35285863280296326, + "learning_rate": 4.986064561318083e-06, + "loss": 0.6405, + "step": 277 + }, + { + "epoch": 0.31096196868008946, + "grad_norm": 0.35576099157333374, + "learning_rate": 4.985906802452488e-06, + "loss": 0.6348, + "step": 278 + }, + { + "epoch": 0.31208053691275167, + "grad_norm": 0.35739952325820923, + "learning_rate": 4.985748158159785e-06, + "loss": 0.65, + "step": 279 + }, + { + "epoch": 0.3131991051454139, + "grad_norm": 0.35885411500930786, + "learning_rate": 4.985588628496481e-06, + "loss": 0.6575, + "step": 280 + }, + { + "epoch": 0.3143176733780761, + "grad_norm": 0.36760038137435913, + "learning_rate": 4.985428213519396e-06, + "loss": 0.6606, + "step": 281 + }, + { + "epoch": 0.31543624161073824, + "grad_norm": 0.35371875762939453, + "learning_rate": 4.9852669132856645e-06, + "loss": 0.6495, + "step": 282 + }, + { + "epoch": 0.31655480984340045, + "grad_norm": 0.3489198684692383, + "learning_rate": 4.985104727852741e-06, + "loss": 0.6402, + "step": 283 + }, + { + "epoch": 0.31767337807606266, + "grad_norm": 0.3595716953277588, + "learning_rate": 4.984941657278392e-06, + "loss": 0.6495, + "step": 284 + }, + { + "epoch": 0.3187919463087248, + "grad_norm": 0.3551527261734009, + "learning_rate": 4.984777701620698e-06, + "loss": 0.6555, + "step": 285 + }, + { + "epoch": 0.319910514541387, + "grad_norm": 0.35428524017333984, + "learning_rate": 4.984612860938059e-06, + "loss": 0.6435, + "step": 286 + }, + { + "epoch": 0.32102908277404923, + "grad_norm": 0.36495980620384216, + "learning_rate": 4.984447135289185e-06, + "loss": 0.6375, + "step": 287 + }, + { + "epoch": 0.3221476510067114, + "grad_norm": 0.36956459283828735, + "learning_rate": 4.984280524733107e-06, + "loss": 0.654, + "step": 288 + }, + { + "epoch": 0.3232662192393736, + "grad_norm": 0.34770330786705017, + "learning_rate": 4.984113029329166e-06, + "loss": 0.6313, + "step": 289 + }, + { + "epoch": 0.3243847874720358, + "grad_norm": 0.3676060140132904, + "learning_rate": 4.9839446491370215e-06, + "loss": 0.6697, + "step": 290 + }, + { + "epoch": 0.32550335570469796, + "grad_norm": 0.3490992486476898, + "learning_rate": 4.983775384216646e-06, + "loss": 0.6343, + "step": 291 + }, + { + "epoch": 0.32662192393736017, + "grad_norm": 0.36720773577690125, + "learning_rate": 4.983605234628328e-06, + "loss": 0.6609, + "step": 292 + }, + { + "epoch": 0.3277404921700224, + "grad_norm": 0.3619595766067505, + "learning_rate": 4.983434200432672e-06, + "loss": 0.6635, + "step": 293 + }, + { + "epoch": 0.3288590604026846, + "grad_norm": 0.35261741280555725, + "learning_rate": 4.983262281690596e-06, + "loss": 0.6273, + "step": 294 + }, + { + "epoch": 0.32997762863534674, + "grad_norm": 0.34182801842689514, + "learning_rate": 4.983089478463335e-06, + "loss": 0.6271, + "step": 295 + }, + { + "epoch": 0.33109619686800895, + "grad_norm": 0.3623373806476593, + "learning_rate": 4.982915790812436e-06, + "loss": 0.6491, + "step": 296 + }, + { + "epoch": 0.33221476510067116, + "grad_norm": 0.3656613826751709, + "learning_rate": 4.982741218799763e-06, + "loss": 0.6672, + "step": 297 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.3533206880092621, + "learning_rate": 4.982565762487498e-06, + "loss": 0.6257, + "step": 298 + }, + { + "epoch": 0.3344519015659955, + "grad_norm": 0.35345301032066345, + "learning_rate": 4.982389421938131e-06, + "loss": 0.6486, + "step": 299 + }, + { + "epoch": 0.33557046979865773, + "grad_norm": 0.36566492915153503, + "learning_rate": 4.982212197214472e-06, + "loss": 0.6411, + "step": 300 + }, + { + "epoch": 0.3366890380313199, + "grad_norm": 0.3681536018848419, + "learning_rate": 4.982034088379646e-06, + "loss": 0.6208, + "step": 301 + }, + { + "epoch": 0.3378076062639821, + "grad_norm": 0.3649962246417999, + "learning_rate": 4.98185509549709e-06, + "loss": 0.6557, + "step": 302 + }, + { + "epoch": 0.3389261744966443, + "grad_norm": 0.350563108921051, + "learning_rate": 4.981675218630557e-06, + "loss": 0.6361, + "step": 303 + }, + { + "epoch": 0.3400447427293065, + "grad_norm": 0.35432034730911255, + "learning_rate": 4.981494457844117e-06, + "loss": 0.6294, + "step": 304 + }, + { + "epoch": 0.34116331096196867, + "grad_norm": 0.3590239882469177, + "learning_rate": 4.981312813202153e-06, + "loss": 0.6318, + "step": 305 + }, + { + "epoch": 0.3422818791946309, + "grad_norm": 0.3664059042930603, + "learning_rate": 4.981130284769361e-06, + "loss": 0.648, + "step": 306 + }, + { + "epoch": 0.3434004474272931, + "grad_norm": 0.35878944396972656, + "learning_rate": 4.9809468726107555e-06, + "loss": 0.619, + "step": 307 + }, + { + "epoch": 0.34451901565995524, + "grad_norm": 0.35763007402420044, + "learning_rate": 4.980762576791664e-06, + "loss": 0.6567, + "step": 308 + }, + { + "epoch": 0.34563758389261745, + "grad_norm": 0.35878288745880127, + "learning_rate": 4.980577397377728e-06, + "loss": 0.6421, + "step": 309 + }, + { + "epoch": 0.34675615212527966, + "grad_norm": 0.3597519099712372, + "learning_rate": 4.980391334434906e-06, + "loss": 0.6352, + "step": 310 + }, + { + "epoch": 0.3478747203579418, + "grad_norm": 0.3498169481754303, + "learning_rate": 4.980204388029466e-06, + "loss": 0.626, + "step": 311 + }, + { + "epoch": 0.348993288590604, + "grad_norm": 0.36353105306625366, + "learning_rate": 4.980016558227998e-06, + "loss": 0.6513, + "step": 312 + }, + { + "epoch": 0.35011185682326623, + "grad_norm": 0.355794221162796, + "learning_rate": 4.979827845097402e-06, + "loss": 0.6385, + "step": 313 + }, + { + "epoch": 0.3512304250559284, + "grad_norm": 0.3594406545162201, + "learning_rate": 4.979638248704894e-06, + "loss": 0.6134, + "step": 314 + }, + { + "epoch": 0.3523489932885906, + "grad_norm": 0.3639025092124939, + "learning_rate": 4.979447769118002e-06, + "loss": 0.6386, + "step": 315 + }, + { + "epoch": 0.3534675615212528, + "grad_norm": 0.3681359887123108, + "learning_rate": 4.979256406404574e-06, + "loss": 0.6213, + "step": 316 + }, + { + "epoch": 0.354586129753915, + "grad_norm": 0.35021546483039856, + "learning_rate": 4.979064160632766e-06, + "loss": 0.5933, + "step": 317 + }, + { + "epoch": 0.35570469798657717, + "grad_norm": 0.35783469676971436, + "learning_rate": 4.978871031871054e-06, + "loss": 0.6054, + "step": 318 + }, + { + "epoch": 0.3568232662192394, + "grad_norm": 0.3768575191497803, + "learning_rate": 4.978677020188226e-06, + "loss": 0.651, + "step": 319 + }, + { + "epoch": 0.3579418344519016, + "grad_norm": 0.3841581642627716, + "learning_rate": 4.978482125653385e-06, + "loss": 0.6447, + "step": 320 + }, + { + "epoch": 0.35906040268456374, + "grad_norm": 0.3749678134918213, + "learning_rate": 4.978286348335949e-06, + "loss": 0.6403, + "step": 321 + }, + { + "epoch": 0.36017897091722595, + "grad_norm": 0.35757121443748474, + "learning_rate": 4.978089688305647e-06, + "loss": 0.6297, + "step": 322 + }, + { + "epoch": 0.36129753914988816, + "grad_norm": 0.36303797364234924, + "learning_rate": 4.977892145632528e-06, + "loss": 0.6438, + "step": 323 + }, + { + "epoch": 0.3624161073825503, + "grad_norm": 0.3670295476913452, + "learning_rate": 4.977693720386951e-06, + "loss": 0.6055, + "step": 324 + }, + { + "epoch": 0.3635346756152125, + "grad_norm": 0.3521486818790436, + "learning_rate": 4.977494412639591e-06, + "loss": 0.6072, + "step": 325 + }, + { + "epoch": 0.36465324384787473, + "grad_norm": 0.35688498616218567, + "learning_rate": 4.9772942224614375e-06, + "loss": 0.6252, + "step": 326 + }, + { + "epoch": 0.36577181208053694, + "grad_norm": 0.3542022109031677, + "learning_rate": 4.9770931499237925e-06, + "loss": 0.6407, + "step": 327 + }, + { + "epoch": 0.3668903803131991, + "grad_norm": 0.3773319125175476, + "learning_rate": 4.976891195098277e-06, + "loss": 0.6524, + "step": 328 + }, + { + "epoch": 0.3680089485458613, + "grad_norm": 0.3697628080844879, + "learning_rate": 4.97668835805682e-06, + "loss": 0.6503, + "step": 329 + }, + { + "epoch": 0.3691275167785235, + "grad_norm": 0.36458712816238403, + "learning_rate": 4.976484638871669e-06, + "loss": 0.6722, + "step": 330 + }, + { + "epoch": 0.37024608501118567, + "grad_norm": 0.3523670434951782, + "learning_rate": 4.976280037615385e-06, + "loss": 0.6273, + "step": 331 + }, + { + "epoch": 0.3713646532438479, + "grad_norm": 0.35473543405532837, + "learning_rate": 4.9760745543608414e-06, + "loss": 0.6243, + "step": 332 + }, + { + "epoch": 0.3724832214765101, + "grad_norm": 0.36719077825546265, + "learning_rate": 4.9758681891812276e-06, + "loss": 0.6476, + "step": 333 + }, + { + "epoch": 0.37360178970917224, + "grad_norm": 0.3712293207645416, + "learning_rate": 4.9756609421500464e-06, + "loss": 0.6475, + "step": 334 + }, + { + "epoch": 0.37472035794183445, + "grad_norm": 0.36749109625816345, + "learning_rate": 4.9754528133411144e-06, + "loss": 0.6428, + "step": 335 + }, + { + "epoch": 0.37583892617449666, + "grad_norm": 0.35749316215515137, + "learning_rate": 4.975243802828563e-06, + "loss": 0.6123, + "step": 336 + }, + { + "epoch": 0.3769574944071588, + "grad_norm": 0.37466734647750854, + "learning_rate": 4.975033910686837e-06, + "loss": 0.6393, + "step": 337 + }, + { + "epoch": 0.378076062639821, + "grad_norm": 0.36750441789627075, + "learning_rate": 4.974823136990697e-06, + "loss": 0.6405, + "step": 338 + }, + { + "epoch": 0.37919463087248323, + "grad_norm": 0.3831922113895416, + "learning_rate": 4.9746114818152135e-06, + "loss": 0.6633, + "step": 339 + }, + { + "epoch": 0.38031319910514544, + "grad_norm": 0.35679274797439575, + "learning_rate": 4.974398945235776e-06, + "loss": 0.6431, + "step": 340 + }, + { + "epoch": 0.3814317673378076, + "grad_norm": 0.36524152755737305, + "learning_rate": 4.974185527328084e-06, + "loss": 0.6419, + "step": 341 + }, + { + "epoch": 0.3825503355704698, + "grad_norm": 0.3668903410434723, + "learning_rate": 4.9739712281681525e-06, + "loss": 0.6418, + "step": 342 + }, + { + "epoch": 0.383668903803132, + "grad_norm": 0.37841862440109253, + "learning_rate": 4.973756047832312e-06, + "loss": 0.6585, + "step": 343 + }, + { + "epoch": 0.38478747203579416, + "grad_norm": 0.37758868932724, + "learning_rate": 4.9735399863972024e-06, + "loss": 0.6493, + "step": 344 + }, + { + "epoch": 0.3859060402684564, + "grad_norm": 0.3663494288921356, + "learning_rate": 4.973323043939783e-06, + "loss": 0.6728, + "step": 345 + }, + { + "epoch": 0.3870246085011186, + "grad_norm": 0.3930216431617737, + "learning_rate": 4.973105220537322e-06, + "loss": 0.6608, + "step": 346 + }, + { + "epoch": 0.38814317673378074, + "grad_norm": 0.390828400850296, + "learning_rate": 4.972886516267404e-06, + "loss": 0.6497, + "step": 347 + }, + { + "epoch": 0.38926174496644295, + "grad_norm": 0.37600764632225037, + "learning_rate": 4.972666931207927e-06, + "loss": 0.6426, + "step": 348 + }, + { + "epoch": 0.39038031319910516, + "grad_norm": 0.36520275473594666, + "learning_rate": 4.972446465437103e-06, + "loss": 0.645, + "step": 349 + }, + { + "epoch": 0.39149888143176736, + "grad_norm": 0.3984422981739044, + "learning_rate": 4.972225119033457e-06, + "loss": 0.6368, + "step": 350 + }, + { + "epoch": 0.3926174496644295, + "grad_norm": 0.3725559711456299, + "learning_rate": 4.972002892075827e-06, + "loss": 0.625, + "step": 351 + }, + { + "epoch": 0.39373601789709173, + "grad_norm": 0.3889387547969818, + "learning_rate": 4.9717797846433655e-06, + "loss": 0.6258, + "step": 352 + }, + { + "epoch": 0.39485458612975394, + "grad_norm": 0.37537676095962524, + "learning_rate": 4.97155579681554e-06, + "loss": 0.64, + "step": 353 + }, + { + "epoch": 0.3959731543624161, + "grad_norm": 0.3795606791973114, + "learning_rate": 4.97133092867213e-06, + "loss": 0.6849, + "step": 354 + }, + { + "epoch": 0.3970917225950783, + "grad_norm": 0.38519197702407837, + "learning_rate": 4.971105180293228e-06, + "loss": 0.6493, + "step": 355 + }, + { + "epoch": 0.3982102908277405, + "grad_norm": 0.3789883553981781, + "learning_rate": 4.97087855175924e-06, + "loss": 0.6281, + "step": 356 + }, + { + "epoch": 0.39932885906040266, + "grad_norm": 0.3687867820262909, + "learning_rate": 4.970651043150887e-06, + "loss": 0.6278, + "step": 357 + }, + { + "epoch": 0.4004474272930649, + "grad_norm": 0.365581214427948, + "learning_rate": 4.970422654549204e-06, + "loss": 0.647, + "step": 358 + }, + { + "epoch": 0.4015659955257271, + "grad_norm": 0.37871256470680237, + "learning_rate": 4.970193386035537e-06, + "loss": 0.6349, + "step": 359 + }, + { + "epoch": 0.40268456375838924, + "grad_norm": 0.37639445066452026, + "learning_rate": 4.969963237691547e-06, + "loss": 0.6544, + "step": 360 + }, + { + "epoch": 0.40380313199105144, + "grad_norm": 0.38033226132392883, + "learning_rate": 4.9697322095992075e-06, + "loss": 0.6216, + "step": 361 + }, + { + "epoch": 0.40492170022371365, + "grad_norm": 0.3785533010959625, + "learning_rate": 4.969500301840805e-06, + "loss": 0.6379, + "step": 362 + }, + { + "epoch": 0.40604026845637586, + "grad_norm": 0.36831173300743103, + "learning_rate": 4.969267514498942e-06, + "loss": 0.6305, + "step": 363 + }, + { + "epoch": 0.407158836689038, + "grad_norm": 0.3860856592655182, + "learning_rate": 4.969033847656531e-06, + "loss": 0.6428, + "step": 364 + }, + { + "epoch": 0.4082774049217002, + "grad_norm": 0.37992534041404724, + "learning_rate": 4.9687993013968e-06, + "loss": 0.629, + "step": 365 + }, + { + "epoch": 0.40939597315436244, + "grad_norm": 0.37179872393608093, + "learning_rate": 4.9685638758032885e-06, + "loss": 0.6146, + "step": 366 + }, + { + "epoch": 0.4105145413870246, + "grad_norm": 0.3762771487236023, + "learning_rate": 4.96832757095985e-06, + "loss": 0.6294, + "step": 367 + }, + { + "epoch": 0.4116331096196868, + "grad_norm": 0.37078356742858887, + "learning_rate": 4.968090386950653e-06, + "loss": 0.6438, + "step": 368 + }, + { + "epoch": 0.412751677852349, + "grad_norm": 0.3619535565376282, + "learning_rate": 4.967852323860176e-06, + "loss": 0.6229, + "step": 369 + }, + { + "epoch": 0.41387024608501116, + "grad_norm": 0.3610592782497406, + "learning_rate": 4.967613381773211e-06, + "loss": 0.6332, + "step": 370 + }, + { + "epoch": 0.41498881431767337, + "grad_norm": 0.36372244358062744, + "learning_rate": 4.9673735607748665e-06, + "loss": 0.6379, + "step": 371 + }, + { + "epoch": 0.4161073825503356, + "grad_norm": 0.3713506758213043, + "learning_rate": 4.96713286095056e-06, + "loss": 0.6051, + "step": 372 + }, + { + "epoch": 0.4172259507829978, + "grad_norm": 0.37290191650390625, + "learning_rate": 4.9668912823860244e-06, + "loss": 0.6431, + "step": 373 + }, + { + "epoch": 0.41834451901565994, + "grad_norm": 0.3736407458782196, + "learning_rate": 4.966648825167305e-06, + "loss": 0.6296, + "step": 374 + }, + { + "epoch": 0.41946308724832215, + "grad_norm": 0.38261404633522034, + "learning_rate": 4.9664054893807586e-06, + "loss": 0.6559, + "step": 375 + }, + { + "epoch": 0.42058165548098436, + "grad_norm": 0.36865612864494324, + "learning_rate": 4.966161275113057e-06, + "loss": 0.6372, + "step": 376 + }, + { + "epoch": 0.4217002237136465, + "grad_norm": 0.3745094835758209, + "learning_rate": 4.965916182451185e-06, + "loss": 0.6526, + "step": 377 + }, + { + "epoch": 0.4228187919463087, + "grad_norm": 0.3758225440979004, + "learning_rate": 4.965670211482437e-06, + "loss": 0.6423, + "step": 378 + }, + { + "epoch": 0.42393736017897093, + "grad_norm": 0.37716934084892273, + "learning_rate": 4.965423362294426e-06, + "loss": 0.6431, + "step": 379 + }, + { + "epoch": 0.4250559284116331, + "grad_norm": 0.3757461905479431, + "learning_rate": 4.965175634975072e-06, + "loss": 0.6335, + "step": 380 + }, + { + "epoch": 0.4261744966442953, + "grad_norm": 0.3701077401638031, + "learning_rate": 4.964927029612611e-06, + "loss": 0.6182, + "step": 381 + }, + { + "epoch": 0.4272930648769575, + "grad_norm": 0.38263121247291565, + "learning_rate": 4.96467754629559e-06, + "loss": 0.6371, + "step": 382 + }, + { + "epoch": 0.42841163310961966, + "grad_norm": 0.3740926682949066, + "learning_rate": 4.9644271851128715e-06, + "loss": 0.6272, + "step": 383 + }, + { + "epoch": 0.42953020134228187, + "grad_norm": 0.39056089520454407, + "learning_rate": 4.964175946153627e-06, + "loss": 0.624, + "step": 384 + }, + { + "epoch": 0.4306487695749441, + "grad_norm": 0.3867873549461365, + "learning_rate": 4.963923829507343e-06, + "loss": 0.6714, + "step": 385 + }, + { + "epoch": 0.4317673378076063, + "grad_norm": 0.3808860182762146, + "learning_rate": 4.963670835263819e-06, + "loss": 0.6412, + "step": 386 + }, + { + "epoch": 0.43288590604026844, + "grad_norm": 0.3839844763278961, + "learning_rate": 4.963416963513166e-06, + "loss": 0.6288, + "step": 387 + }, + { + "epoch": 0.43400447427293065, + "grad_norm": 0.37187114357948303, + "learning_rate": 4.963162214345806e-06, + "loss": 0.6307, + "step": 388 + }, + { + "epoch": 0.43512304250559286, + "grad_norm": 0.36723873019218445, + "learning_rate": 4.962906587852477e-06, + "loss": 0.6285, + "step": 389 + }, + { + "epoch": 0.436241610738255, + "grad_norm": 0.3754160404205322, + "learning_rate": 4.962650084124226e-06, + "loss": 0.6227, + "step": 390 + }, + { + "epoch": 0.4373601789709172, + "grad_norm": 0.374239981174469, + "learning_rate": 4.962392703252417e-06, + "loss": 0.6612, + "step": 391 + }, + { + "epoch": 0.43847874720357943, + "grad_norm": 0.3758632242679596, + "learning_rate": 4.9621344453287214e-06, + "loss": 0.6408, + "step": 392 + }, + { + "epoch": 0.4395973154362416, + "grad_norm": 0.3839190602302551, + "learning_rate": 4.9618753104451254e-06, + "loss": 0.6524, + "step": 393 + }, + { + "epoch": 0.4407158836689038, + "grad_norm": 0.36766374111175537, + "learning_rate": 4.961615298693928e-06, + "loss": 0.6232, + "step": 394 + }, + { + "epoch": 0.441834451901566, + "grad_norm": 0.3692423701286316, + "learning_rate": 4.961354410167739e-06, + "loss": 0.6436, + "step": 395 + }, + { + "epoch": 0.4429530201342282, + "grad_norm": 0.3720521926879883, + "learning_rate": 4.961092644959482e-06, + "loss": 0.6346, + "step": 396 + }, + { + "epoch": 0.44407158836689037, + "grad_norm": 0.373910129070282, + "learning_rate": 4.960830003162392e-06, + "loss": 0.6211, + "step": 397 + }, + { + "epoch": 0.4451901565995526, + "grad_norm": 0.37455853819847107, + "learning_rate": 4.960566484870017e-06, + "loss": 0.6366, + "step": 398 + }, + { + "epoch": 0.4463087248322148, + "grad_norm": 0.387390673160553, + "learning_rate": 4.960302090176215e-06, + "loss": 0.6543, + "step": 399 + }, + { + "epoch": 0.44742729306487694, + "grad_norm": 0.3862502872943878, + "learning_rate": 4.960036819175159e-06, + "loss": 0.6351, + "step": 400 + }, + { + "epoch": 0.44854586129753915, + "grad_norm": 0.38686901330947876, + "learning_rate": 4.959770671961334e-06, + "loss": 0.6247, + "step": 401 + }, + { + "epoch": 0.44966442953020136, + "grad_norm": 0.38111770153045654, + "learning_rate": 4.959503648629534e-06, + "loss": 0.6624, + "step": 402 + }, + { + "epoch": 0.4507829977628635, + "grad_norm": 0.3962753713130951, + "learning_rate": 4.959235749274866e-06, + "loss": 0.6224, + "step": 403 + }, + { + "epoch": 0.4519015659955257, + "grad_norm": 0.36403393745422363, + "learning_rate": 4.958966973992754e-06, + "loss": 0.6215, + "step": 404 + }, + { + "epoch": 0.45302013422818793, + "grad_norm": 0.3858584463596344, + "learning_rate": 4.958697322878926e-06, + "loss": 0.6473, + "step": 405 + }, + { + "epoch": 0.4541387024608501, + "grad_norm": 0.39325979351997375, + "learning_rate": 4.958426796029429e-06, + "loss": 0.6664, + "step": 406 + }, + { + "epoch": 0.4552572706935123, + "grad_norm": 0.37423112988471985, + "learning_rate": 4.958155393540618e-06, + "loss": 0.6416, + "step": 407 + }, + { + "epoch": 0.4563758389261745, + "grad_norm": 0.3979191482067108, + "learning_rate": 4.9578831155091585e-06, + "loss": 0.6493, + "step": 408 + }, + { + "epoch": 0.4574944071588367, + "grad_norm": 0.375473290681839, + "learning_rate": 4.957609962032034e-06, + "loss": 0.6246, + "step": 409 + }, + { + "epoch": 0.45861297539149887, + "grad_norm": 0.37951260805130005, + "learning_rate": 4.957335933206533e-06, + "loss": 0.6374, + "step": 410 + }, + { + "epoch": 0.4597315436241611, + "grad_norm": 0.384162575006485, + "learning_rate": 4.9570610291302605e-06, + "loss": 0.6411, + "step": 411 + }, + { + "epoch": 0.4608501118568233, + "grad_norm": 0.37713801860809326, + "learning_rate": 4.95678524990113e-06, + "loss": 0.6384, + "step": 412 + }, + { + "epoch": 0.46196868008948544, + "grad_norm": 0.3779420554637909, + "learning_rate": 4.95650859561737e-06, + "loss": 0.6238, + "step": 413 + }, + { + "epoch": 0.46308724832214765, + "grad_norm": 0.3826324939727783, + "learning_rate": 4.956231066377517e-06, + "loss": 0.6373, + "step": 414 + }, + { + "epoch": 0.46420581655480986, + "grad_norm": 0.3693124055862427, + "learning_rate": 4.955952662280422e-06, + "loss": 0.6264, + "step": 415 + }, + { + "epoch": 0.465324384787472, + "grad_norm": 0.3891177177429199, + "learning_rate": 4.9556733834252465e-06, + "loss": 0.6755, + "step": 416 + }, + { + "epoch": 0.4664429530201342, + "grad_norm": 0.3732079863548279, + "learning_rate": 4.955393229911465e-06, + "loss": 0.6163, + "step": 417 + }, + { + "epoch": 0.46756152125279643, + "grad_norm": 0.39267081022262573, + "learning_rate": 4.955112201838859e-06, + "loss": 0.653, + "step": 418 + }, + { + "epoch": 0.46868008948545864, + "grad_norm": 0.37127041816711426, + "learning_rate": 4.9548302993075275e-06, + "loss": 0.6024, + "step": 419 + }, + { + "epoch": 0.4697986577181208, + "grad_norm": 0.38274380564689636, + "learning_rate": 4.954547522417878e-06, + "loss": 0.6103, + "step": 420 + }, + { + "epoch": 0.470917225950783, + "grad_norm": 0.39440205693244934, + "learning_rate": 4.954263871270627e-06, + "loss": 0.6388, + "step": 421 + }, + { + "epoch": 0.4720357941834452, + "grad_norm": 0.38207298517227173, + "learning_rate": 4.953979345966808e-06, + "loss": 0.6157, + "step": 422 + }, + { + "epoch": 0.47315436241610737, + "grad_norm": 0.37390536069869995, + "learning_rate": 4.953693946607762e-06, + "loss": 0.612, + "step": 423 + }, + { + "epoch": 0.4742729306487696, + "grad_norm": 0.3679952621459961, + "learning_rate": 4.953407673295141e-06, + "loss": 0.5962, + "step": 424 + }, + { + "epoch": 0.4753914988814318, + "grad_norm": 0.36741313338279724, + "learning_rate": 4.953120526130911e-06, + "loss": 0.5802, + "step": 425 + }, + { + "epoch": 0.47651006711409394, + "grad_norm": 0.40101951360702515, + "learning_rate": 4.952832505217347e-06, + "loss": 0.631, + "step": 426 + }, + { + "epoch": 0.47762863534675615, + "grad_norm": 0.37646785378456116, + "learning_rate": 4.952543610657036e-06, + "loss": 0.6192, + "step": 427 + }, + { + "epoch": 0.47874720357941836, + "grad_norm": 0.3909439444541931, + "learning_rate": 4.952253842552876e-06, + "loss": 0.6288, + "step": 428 + }, + { + "epoch": 0.4798657718120805, + "grad_norm": 0.379685640335083, + "learning_rate": 4.9519632010080765e-06, + "loss": 0.6296, + "step": 429 + }, + { + "epoch": 0.4809843400447427, + "grad_norm": 0.3872782588005066, + "learning_rate": 4.9516716861261575e-06, + "loss": 0.6307, + "step": 430 + }, + { + "epoch": 0.48210290827740493, + "grad_norm": 0.4066009223461151, + "learning_rate": 4.951379298010951e-06, + "loss": 0.6454, + "step": 431 + }, + { + "epoch": 0.48322147651006714, + "grad_norm": 0.38412487506866455, + "learning_rate": 4.951086036766599e-06, + "loss": 0.6254, + "step": 432 + }, + { + "epoch": 0.4843400447427293, + "grad_norm": 0.37819865345954895, + "learning_rate": 4.9507919024975545e-06, + "loss": 0.629, + "step": 433 + }, + { + "epoch": 0.4854586129753915, + "grad_norm": 0.38674691319465637, + "learning_rate": 4.950496895308582e-06, + "loss": 0.6357, + "step": 434 + }, + { + "epoch": 0.4865771812080537, + "grad_norm": 0.39304593205451965, + "learning_rate": 4.950201015304758e-06, + "loss": 0.6475, + "step": 435 + }, + { + "epoch": 0.48769574944071586, + "grad_norm": 0.381124347448349, + "learning_rate": 4.949904262591467e-06, + "loss": 0.6523, + "step": 436 + }, + { + "epoch": 0.4888143176733781, + "grad_norm": 0.4084749221801758, + "learning_rate": 4.949606637274408e-06, + "loss": 0.6773, + "step": 437 + }, + { + "epoch": 0.4899328859060403, + "grad_norm": 0.3967250883579254, + "learning_rate": 4.949308139459586e-06, + "loss": 0.6263, + "step": 438 + }, + { + "epoch": 0.49105145413870244, + "grad_norm": 0.39761948585510254, + "learning_rate": 4.949008769253322e-06, + "loss": 0.6273, + "step": 439 + }, + { + "epoch": 0.49217002237136465, + "grad_norm": 0.3865715265274048, + "learning_rate": 4.948708526762244e-06, + "loss": 0.6464, + "step": 440 + }, + { + "epoch": 0.49328859060402686, + "grad_norm": 0.3970697820186615, + "learning_rate": 4.948407412093292e-06, + "loss": 0.6229, + "step": 441 + }, + { + "epoch": 0.49440715883668906, + "grad_norm": 0.3817065954208374, + "learning_rate": 4.948105425353718e-06, + "loss": 0.6375, + "step": 442 + }, + { + "epoch": 0.4955257270693512, + "grad_norm": 0.3877985179424286, + "learning_rate": 4.947802566651082e-06, + "loss": 0.6389, + "step": 443 + }, + { + "epoch": 0.4966442953020134, + "grad_norm": 0.40800127387046814, + "learning_rate": 4.947498836093257e-06, + "loss": 0.6627, + "step": 444 + }, + { + "epoch": 0.49776286353467564, + "grad_norm": 0.40732380747795105, + "learning_rate": 4.947194233788423e-06, + "loss": 0.6156, + "step": 445 + }, + { + "epoch": 0.4988814317673378, + "grad_norm": 0.3948177695274353, + "learning_rate": 4.946888759845074e-06, + "loss": 0.6481, + "step": 446 + }, + { + "epoch": 0.5, + "grad_norm": 0.38517189025878906, + "learning_rate": 4.9465824143720145e-06, + "loss": 0.6224, + "step": 447 + }, + { + "epoch": 0.5011185682326622, + "grad_norm": 0.3713424503803253, + "learning_rate": 4.946275197478358e-06, + "loss": 0.626, + "step": 448 + }, + { + "epoch": 0.5022371364653244, + "grad_norm": 0.4172223210334778, + "learning_rate": 4.945967109273527e-06, + "loss": 0.6405, + "step": 449 + }, + { + "epoch": 0.5033557046979866, + "grad_norm": 0.4550599157810211, + "learning_rate": 4.945658149867257e-06, + "loss": 0.6103, + "step": 450 + }, + { + "epoch": 0.5044742729306487, + "grad_norm": 0.3938581347465515, + "learning_rate": 4.945348319369593e-06, + "loss": 0.6304, + "step": 451 + }, + { + "epoch": 0.5055928411633109, + "grad_norm": 0.3923262059688568, + "learning_rate": 4.94503761789089e-06, + "loss": 0.6603, + "step": 452 + }, + { + "epoch": 0.5067114093959731, + "grad_norm": 0.3978983163833618, + "learning_rate": 4.944726045541814e-06, + "loss": 0.6445, + "step": 453 + }, + { + "epoch": 0.5078299776286354, + "grad_norm": 0.4101882576942444, + "learning_rate": 4.9444136024333374e-06, + "loss": 0.6223, + "step": 454 + }, + { + "epoch": 0.5089485458612976, + "grad_norm": 0.4056575298309326, + "learning_rate": 4.944100288676749e-06, + "loss": 0.6343, + "step": 455 + }, + { + "epoch": 0.5100671140939598, + "grad_norm": 0.39720436930656433, + "learning_rate": 4.943786104383644e-06, + "loss": 0.6246, + "step": 456 + }, + { + "epoch": 0.5111856823266219, + "grad_norm": 0.3909725248813629, + "learning_rate": 4.943471049665925e-06, + "loss": 0.6339, + "step": 457 + }, + { + "epoch": 0.5123042505592841, + "grad_norm": 0.3773731291294098, + "learning_rate": 4.943155124635812e-06, + "loss": 0.6215, + "step": 458 + }, + { + "epoch": 0.5134228187919463, + "grad_norm": 0.4020001292228699, + "learning_rate": 4.9428383294058295e-06, + "loss": 0.6269, + "step": 459 + }, + { + "epoch": 0.5145413870246085, + "grad_norm": 0.3916706144809723, + "learning_rate": 4.942520664088812e-06, + "loss": 0.6233, + "step": 460 + }, + { + "epoch": 0.5156599552572707, + "grad_norm": 0.38717713952064514, + "learning_rate": 4.9422021287979076e-06, + "loss": 0.6216, + "step": 461 + }, + { + "epoch": 0.5167785234899329, + "grad_norm": 0.38645485043525696, + "learning_rate": 4.941882723646568e-06, + "loss": 0.6092, + "step": 462 + }, + { + "epoch": 0.5178970917225951, + "grad_norm": 0.38496407866477966, + "learning_rate": 4.9415624487485615e-06, + "loss": 0.6368, + "step": 463 + }, + { + "epoch": 0.5190156599552572, + "grad_norm": 0.3946744501590729, + "learning_rate": 4.941241304217962e-06, + "loss": 0.6525, + "step": 464 + }, + { + "epoch": 0.5201342281879194, + "grad_norm": 0.3994438648223877, + "learning_rate": 4.940919290169155e-06, + "loss": 0.6314, + "step": 465 + }, + { + "epoch": 0.5212527964205816, + "grad_norm": 0.3929794728755951, + "learning_rate": 4.940596406716834e-06, + "loss": 0.6148, + "step": 466 + }, + { + "epoch": 0.5223713646532439, + "grad_norm": 0.42620542645454407, + "learning_rate": 4.940272653976005e-06, + "loss": 0.6468, + "step": 467 + }, + { + "epoch": 0.5234899328859061, + "grad_norm": 0.4014374613761902, + "learning_rate": 4.9399480320619805e-06, + "loss": 0.6451, + "step": 468 + }, + { + "epoch": 0.5246085011185683, + "grad_norm": 0.39342424273490906, + "learning_rate": 4.939622541090384e-06, + "loss": 0.6696, + "step": 469 + }, + { + "epoch": 0.5257270693512305, + "grad_norm": 0.3870956301689148, + "learning_rate": 4.939296181177149e-06, + "loss": 0.6451, + "step": 470 + }, + { + "epoch": 0.5268456375838926, + "grad_norm": 0.39973214268684387, + "learning_rate": 4.938968952438518e-06, + "loss": 0.6254, + "step": 471 + }, + { + "epoch": 0.5279642058165548, + "grad_norm": 0.3956799805164337, + "learning_rate": 4.938640854991041e-06, + "loss": 0.6169, + "step": 472 + }, + { + "epoch": 0.529082774049217, + "grad_norm": 0.38881829380989075, + "learning_rate": 4.938311888951583e-06, + "loss": 0.5989, + "step": 473 + }, + { + "epoch": 0.5302013422818792, + "grad_norm": 0.392107218503952, + "learning_rate": 4.93798205443731e-06, + "loss": 0.6284, + "step": 474 + }, + { + "epoch": 0.5313199105145414, + "grad_norm": 0.4042797088623047, + "learning_rate": 4.937651351565707e-06, + "loss": 0.6235, + "step": 475 + }, + { + "epoch": 0.5324384787472036, + "grad_norm": 0.380206435918808, + "learning_rate": 4.937319780454559e-06, + "loss": 0.5894, + "step": 476 + }, + { + "epoch": 0.5335570469798657, + "grad_norm": 0.3989536166191101, + "learning_rate": 4.936987341221968e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.5346756152125279, + "grad_norm": 0.38699498772621155, + "learning_rate": 4.9366540339863395e-06, + "loss": 0.6202, + "step": 478 + }, + { + "epoch": 0.5357941834451901, + "grad_norm": 0.4171985387802124, + "learning_rate": 4.936319858866391e-06, + "loss": 0.624, + "step": 479 + }, + { + "epoch": 0.5369127516778524, + "grad_norm": 0.3932148218154907, + "learning_rate": 4.93598481598115e-06, + "loss": 0.6215, + "step": 480 + }, + { + "epoch": 0.5380313199105146, + "grad_norm": 0.3934101462364197, + "learning_rate": 4.935648905449949e-06, + "loss": 0.6402, + "step": 481 + }, + { + "epoch": 0.5391498881431768, + "grad_norm": 0.3917444348335266, + "learning_rate": 4.935312127392434e-06, + "loss": 0.641, + "step": 482 + }, + { + "epoch": 0.540268456375839, + "grad_norm": 0.4036387503147125, + "learning_rate": 4.9349744819285584e-06, + "loss": 0.6405, + "step": 483 + }, + { + "epoch": 0.5413870246085011, + "grad_norm": 0.38611260056495667, + "learning_rate": 4.934635969178584e-06, + "loss": 0.6231, + "step": 484 + }, + { + "epoch": 0.5425055928411633, + "grad_norm": 0.39185649156570435, + "learning_rate": 4.9342965892630805e-06, + "loss": 0.6214, + "step": 485 + }, + { + "epoch": 0.5436241610738255, + "grad_norm": 0.3736090362071991, + "learning_rate": 4.933956342302929e-06, + "loss": 0.6053, + "step": 486 + }, + { + "epoch": 0.5447427293064877, + "grad_norm": 0.39649662375450134, + "learning_rate": 4.93361522841932e-06, + "loss": 0.6408, + "step": 487 + }, + { + "epoch": 0.5458612975391499, + "grad_norm": 0.3990592658519745, + "learning_rate": 4.933273247733746e-06, + "loss": 0.6081, + "step": 488 + }, + { + "epoch": 0.5469798657718121, + "grad_norm": 0.39177680015563965, + "learning_rate": 4.932930400368019e-06, + "loss": 0.6114, + "step": 489 + }, + { + "epoch": 0.5480984340044742, + "grad_norm": 0.3953116536140442, + "learning_rate": 4.9325866864442495e-06, + "loss": 0.6339, + "step": 490 + }, + { + "epoch": 0.5492170022371364, + "grad_norm": 0.38563409447669983, + "learning_rate": 4.932242106084864e-06, + "loss": 0.6331, + "step": 491 + }, + { + "epoch": 0.5503355704697986, + "grad_norm": 0.40618443489074707, + "learning_rate": 4.931896659412593e-06, + "loss": 0.6441, + "step": 492 + }, + { + "epoch": 0.5514541387024608, + "grad_norm": 0.4008066654205322, + "learning_rate": 4.931550346550479e-06, + "loss": 0.6243, + "step": 493 + }, + { + "epoch": 0.5525727069351231, + "grad_norm": 0.39776620268821716, + "learning_rate": 4.931203167621868e-06, + "loss": 0.6152, + "step": 494 + }, + { + "epoch": 0.5536912751677853, + "grad_norm": 0.38687410950660706, + "learning_rate": 4.930855122750421e-06, + "loss": 0.5969, + "step": 495 + }, + { + "epoch": 0.5548098434004475, + "grad_norm": 0.3877246081829071, + "learning_rate": 4.9305062120601035e-06, + "loss": 0.6016, + "step": 496 + }, + { + "epoch": 0.5559284116331096, + "grad_norm": 0.40948086977005005, + "learning_rate": 4.930156435675189e-06, + "loss": 0.6168, + "step": 497 + }, + { + "epoch": 0.5570469798657718, + "grad_norm": 0.404021292924881, + "learning_rate": 4.929805793720262e-06, + "loss": 0.6092, + "step": 498 + }, + { + "epoch": 0.558165548098434, + "grad_norm": 0.39509114623069763, + "learning_rate": 4.929454286320211e-06, + "loss": 0.6346, + "step": 499 + }, + { + "epoch": 0.5592841163310962, + "grad_norm": 0.39687812328338623, + "learning_rate": 4.9291019136002385e-06, + "loss": 0.639, + "step": 500 + }, + { + "epoch": 0.5604026845637584, + "grad_norm": 0.39210405945777893, + "learning_rate": 4.92874867568585e-06, + "loss": 0.6083, + "step": 501 + }, + { + "epoch": 0.5615212527964206, + "grad_norm": 0.4022452235221863, + "learning_rate": 4.928394572702862e-06, + "loss": 0.6252, + "step": 502 + }, + { + "epoch": 0.5626398210290827, + "grad_norm": 0.40317103266716003, + "learning_rate": 4.928039604777399e-06, + "loss": 0.614, + "step": 503 + }, + { + "epoch": 0.5637583892617449, + "grad_norm": 0.4097250998020172, + "learning_rate": 4.9276837720358924e-06, + "loss": 0.6218, + "step": 504 + }, + { + "epoch": 0.5648769574944071, + "grad_norm": 0.3927348554134369, + "learning_rate": 4.927327074605083e-06, + "loss": 0.6079, + "step": 505 + }, + { + "epoch": 0.5659955257270693, + "grad_norm": 0.3961605131626129, + "learning_rate": 4.9269695126120185e-06, + "loss": 0.612, + "step": 506 + }, + { + "epoch": 0.5671140939597316, + "grad_norm": 0.3945924639701843, + "learning_rate": 4.926611086184054e-06, + "loss": 0.6268, + "step": 507 + }, + { + "epoch": 0.5682326621923938, + "grad_norm": 0.39072591066360474, + "learning_rate": 4.926251795448854e-06, + "loss": 0.6176, + "step": 508 + }, + { + "epoch": 0.569351230425056, + "grad_norm": 0.39760643243789673, + "learning_rate": 4.9258916405343904e-06, + "loss": 0.6437, + "step": 509 + }, + { + "epoch": 0.5704697986577181, + "grad_norm": 0.39866000413894653, + "learning_rate": 4.925530621568942e-06, + "loss": 0.6383, + "step": 510 + }, + { + "epoch": 0.5715883668903803, + "grad_norm": 0.3932257294654846, + "learning_rate": 4.925168738681097e-06, + "loss": 0.6156, + "step": 511 + }, + { + "epoch": 0.5727069351230425, + "grad_norm": 0.39929500222206116, + "learning_rate": 4.924805991999751e-06, + "loss": 0.6069, + "step": 512 + }, + { + "epoch": 0.5738255033557047, + "grad_norm": 0.41192054748535156, + "learning_rate": 4.924442381654105e-06, + "loss": 0.6451, + "step": 513 + }, + { + "epoch": 0.5749440715883669, + "grad_norm": 0.41147273778915405, + "learning_rate": 4.92407790777367e-06, + "loss": 0.647, + "step": 514 + }, + { + "epoch": 0.5760626398210291, + "grad_norm": 0.4128178358078003, + "learning_rate": 4.923712570488264e-06, + "loss": 0.5909, + "step": 515 + }, + { + "epoch": 0.5771812080536913, + "grad_norm": 0.4081036150455475, + "learning_rate": 4.923346369928012e-06, + "loss": 0.6248, + "step": 516 + }, + { + "epoch": 0.5782997762863534, + "grad_norm": 0.3965778350830078, + "learning_rate": 4.922979306223347e-06, + "loss": 0.6019, + "step": 517 + }, + { + "epoch": 0.5794183445190156, + "grad_norm": 0.3979526162147522, + "learning_rate": 4.922611379505009e-06, + "loss": 0.6368, + "step": 518 + }, + { + "epoch": 0.5805369127516778, + "grad_norm": 0.38306665420532227, + "learning_rate": 4.922242589904046e-06, + "loss": 0.62, + "step": 519 + }, + { + "epoch": 0.5816554809843401, + "grad_norm": 0.3833399713039398, + "learning_rate": 4.921872937551814e-06, + "loss": 0.6064, + "step": 520 + }, + { + "epoch": 0.5827740492170023, + "grad_norm": 0.39361608028411865, + "learning_rate": 4.921502422579973e-06, + "loss": 0.6236, + "step": 521 + }, + { + "epoch": 0.5838926174496645, + "grad_norm": 0.39250272512435913, + "learning_rate": 4.921131045120494e-06, + "loss": 0.624, + "step": 522 + }, + { + "epoch": 0.5850111856823266, + "grad_norm": 0.40747684240341187, + "learning_rate": 4.920758805305654e-06, + "loss": 0.6096, + "step": 523 + }, + { + "epoch": 0.5861297539149888, + "grad_norm": 0.39987003803253174, + "learning_rate": 4.920385703268037e-06, + "loss": 0.6282, + "step": 524 + }, + { + "epoch": 0.587248322147651, + "grad_norm": 0.39122274518013, + "learning_rate": 4.920011739140532e-06, + "loss": 0.6479, + "step": 525 + }, + { + "epoch": 0.5883668903803132, + "grad_norm": 0.39809542894363403, + "learning_rate": 4.919636913056339e-06, + "loss": 0.6213, + "step": 526 + }, + { + "epoch": 0.5894854586129754, + "grad_norm": 0.39921343326568604, + "learning_rate": 4.919261225148963e-06, + "loss": 0.6118, + "step": 527 + }, + { + "epoch": 0.5906040268456376, + "grad_norm": 0.4086368680000305, + "learning_rate": 4.9188846755522155e-06, + "loss": 0.6214, + "step": 528 + }, + { + "epoch": 0.5917225950782998, + "grad_norm": 0.4066048264503479, + "learning_rate": 4.918507264400216e-06, + "loss": 0.6316, + "step": 529 + }, + { + "epoch": 0.5928411633109619, + "grad_norm": 0.41961807012557983, + "learning_rate": 4.91812899182739e-06, + "loss": 0.5948, + "step": 530 + }, + { + "epoch": 0.5939597315436241, + "grad_norm": 0.39992618560791016, + "learning_rate": 4.917749857968469e-06, + "loss": 0.6113, + "step": 531 + }, + { + "epoch": 0.5950782997762863, + "grad_norm": 0.41020235419273376, + "learning_rate": 4.917369862958494e-06, + "loss": 0.622, + "step": 532 + }, + { + "epoch": 0.5961968680089486, + "grad_norm": 0.40504705905914307, + "learning_rate": 4.916989006932811e-06, + "loss": 0.621, + "step": 533 + }, + { + "epoch": 0.5973154362416108, + "grad_norm": 0.3829837441444397, + "learning_rate": 4.9166072900270725e-06, + "loss": 0.5942, + "step": 534 + }, + { + "epoch": 0.598434004474273, + "grad_norm": 0.4082834720611572, + "learning_rate": 4.9162247123772375e-06, + "loss": 0.5923, + "step": 535 + }, + { + "epoch": 0.5995525727069351, + "grad_norm": 0.40038296580314636, + "learning_rate": 4.915841274119572e-06, + "loss": 0.6057, + "step": 536 + }, + { + "epoch": 0.6006711409395973, + "grad_norm": 0.40687569975852966, + "learning_rate": 4.91545697539065e-06, + "loss": 0.6343, + "step": 537 + }, + { + "epoch": 0.6017897091722595, + "grad_norm": 0.386262983083725, + "learning_rate": 4.9150718163273494e-06, + "loss": 0.6372, + "step": 538 + }, + { + "epoch": 0.6029082774049217, + "grad_norm": 0.39570850133895874, + "learning_rate": 4.914685797066855e-06, + "loss": 0.6157, + "step": 539 + }, + { + "epoch": 0.6040268456375839, + "grad_norm": 0.40055716037750244, + "learning_rate": 4.9142989177466594e-06, + "loss": 0.6141, + "step": 540 + }, + { + "epoch": 0.6051454138702461, + "grad_norm": 0.4038466811180115, + "learning_rate": 4.913911178504562e-06, + "loss": 0.6286, + "step": 541 + }, + { + "epoch": 0.6062639821029083, + "grad_norm": 0.38774847984313965, + "learning_rate": 4.913522579478664e-06, + "loss": 0.6343, + "step": 542 + }, + { + "epoch": 0.6073825503355704, + "grad_norm": 0.39426755905151367, + "learning_rate": 4.913133120807379e-06, + "loss": 0.6121, + "step": 543 + }, + { + "epoch": 0.6085011185682326, + "grad_norm": 0.4076898396015167, + "learning_rate": 4.912742802629423e-06, + "loss": 0.6273, + "step": 544 + }, + { + "epoch": 0.6096196868008948, + "grad_norm": 0.3859540820121765, + "learning_rate": 4.91235162508382e-06, + "loss": 0.6314, + "step": 545 + }, + { + "epoch": 0.610738255033557, + "grad_norm": 0.3914327621459961, + "learning_rate": 4.911959588309897e-06, + "loss": 0.6027, + "step": 546 + }, + { + "epoch": 0.6118568232662193, + "grad_norm": 0.3892766833305359, + "learning_rate": 4.9115666924472906e-06, + "loss": 0.5922, + "step": 547 + }, + { + "epoch": 0.6129753914988815, + "grad_norm": 0.3921322226524353, + "learning_rate": 4.911172937635942e-06, + "loss": 0.6066, + "step": 548 + }, + { + "epoch": 0.6140939597315436, + "grad_norm": 0.3972843885421753, + "learning_rate": 4.910778324016098e-06, + "loss": 0.614, + "step": 549 + }, + { + "epoch": 0.6152125279642058, + "grad_norm": 0.4027954638004303, + "learning_rate": 4.9103828517283105e-06, + "loss": 0.6174, + "step": 550 + }, + { + "epoch": 0.616331096196868, + "grad_norm": 0.40479257702827454, + "learning_rate": 4.909986520913441e-06, + "loss": 0.6114, + "step": 551 + }, + { + "epoch": 0.6174496644295302, + "grad_norm": 0.4246085584163666, + "learning_rate": 4.909589331712651e-06, + "loss": 0.6145, + "step": 552 + }, + { + "epoch": 0.6185682326621924, + "grad_norm": 0.4173775017261505, + "learning_rate": 4.909191284267413e-06, + "loss": 0.6375, + "step": 553 + }, + { + "epoch": 0.6196868008948546, + "grad_norm": 0.4135677218437195, + "learning_rate": 4.908792378719502e-06, + "loss": 0.6444, + "step": 554 + }, + { + "epoch": 0.6208053691275168, + "grad_norm": 0.40163061022758484, + "learning_rate": 4.9083926152110004e-06, + "loss": 0.6128, + "step": 555 + }, + { + "epoch": 0.6219239373601789, + "grad_norm": 0.41246625781059265, + "learning_rate": 4.907991993884295e-06, + "loss": 0.6229, + "step": 556 + }, + { + "epoch": 0.6230425055928411, + "grad_norm": 0.4114304780960083, + "learning_rate": 4.907590514882079e-06, + "loss": 0.6028, + "step": 557 + }, + { + "epoch": 0.6241610738255033, + "grad_norm": 0.40224742889404297, + "learning_rate": 4.90718817834735e-06, + "loss": 0.5896, + "step": 558 + }, + { + "epoch": 0.6252796420581656, + "grad_norm": 0.397650808095932, + "learning_rate": 4.906784984423411e-06, + "loss": 0.6309, + "step": 559 + }, + { + "epoch": 0.6263982102908278, + "grad_norm": 0.4087318480014801, + "learning_rate": 4.906380933253874e-06, + "loss": 0.6002, + "step": 560 + }, + { + "epoch": 0.62751677852349, + "grad_norm": 0.3988543450832367, + "learning_rate": 4.90597602498265e-06, + "loss": 0.6415, + "step": 561 + }, + { + "epoch": 0.6286353467561522, + "grad_norm": 0.38457274436950684, + "learning_rate": 4.905570259753961e-06, + "loss": 0.6105, + "step": 562 + }, + { + "epoch": 0.6297539149888143, + "grad_norm": 0.38756313920021057, + "learning_rate": 4.905163637712331e-06, + "loss": 0.5953, + "step": 563 + }, + { + "epoch": 0.6308724832214765, + "grad_norm": 0.4071662127971649, + "learning_rate": 4.90475615900259e-06, + "loss": 0.635, + "step": 564 + }, + { + "epoch": 0.6319910514541387, + "grad_norm": 0.4213521182537079, + "learning_rate": 4.904347823769875e-06, + "loss": 0.6141, + "step": 565 + }, + { + "epoch": 0.6331096196868009, + "grad_norm": 0.4104982018470764, + "learning_rate": 4.9039386321596235e-06, + "loss": 0.6235, + "step": 566 + }, + { + "epoch": 0.6342281879194631, + "grad_norm": 0.41318175196647644, + "learning_rate": 4.903528584317583e-06, + "loss": 0.6315, + "step": 567 + }, + { + "epoch": 0.6353467561521253, + "grad_norm": 0.39332863688468933, + "learning_rate": 4.903117680389802e-06, + "loss": 0.5807, + "step": 568 + }, + { + "epoch": 0.6364653243847874, + "grad_norm": 0.4188497066497803, + "learning_rate": 4.902705920522638e-06, + "loss": 0.6176, + "step": 569 + }, + { + "epoch": 0.6375838926174496, + "grad_norm": 0.41399574279785156, + "learning_rate": 4.9022933048627496e-06, + "loss": 0.6067, + "step": 570 + }, + { + "epoch": 0.6387024608501118, + "grad_norm": 0.4197136461734772, + "learning_rate": 4.901879833557102e-06, + "loss": 0.6182, + "step": 571 + }, + { + "epoch": 0.639821029082774, + "grad_norm": 0.41715213656425476, + "learning_rate": 4.9014655067529645e-06, + "loss": 0.6088, + "step": 572 + }, + { + "epoch": 0.6409395973154363, + "grad_norm": 0.40957003831863403, + "learning_rate": 4.901050324597912e-06, + "loss": 0.5942, + "step": 573 + }, + { + "epoch": 0.6420581655480985, + "grad_norm": 0.4082324206829071, + "learning_rate": 4.9006342872398235e-06, + "loss": 0.6389, + "step": 574 + }, + { + "epoch": 0.6431767337807607, + "grad_norm": 0.41336655616760254, + "learning_rate": 4.900217394826882e-06, + "loss": 0.6122, + "step": 575 + }, + { + "epoch": 0.6442953020134228, + "grad_norm": 0.40878477692604065, + "learning_rate": 4.899799647507577e-06, + "loss": 0.6372, + "step": 576 + }, + { + "epoch": 0.645413870246085, + "grad_norm": 0.4028140604496002, + "learning_rate": 4.899381045430701e-06, + "loss": 0.5949, + "step": 577 + }, + { + "epoch": 0.6465324384787472, + "grad_norm": 0.42413756251335144, + "learning_rate": 4.89896158874535e-06, + "loss": 0.6217, + "step": 578 + }, + { + "epoch": 0.6476510067114094, + "grad_norm": 0.4108542501926422, + "learning_rate": 4.898541277600927e-06, + "loss": 0.6283, + "step": 579 + }, + { + "epoch": 0.6487695749440716, + "grad_norm": 0.4101778566837311, + "learning_rate": 4.898120112147135e-06, + "loss": 0.6028, + "step": 580 + }, + { + "epoch": 0.6498881431767338, + "grad_norm": 0.4104720652103424, + "learning_rate": 4.897698092533988e-06, + "loss": 0.6481, + "step": 581 + }, + { + "epoch": 0.6510067114093959, + "grad_norm": 0.4004007577896118, + "learning_rate": 4.897275218911799e-06, + "loss": 0.6042, + "step": 582 + }, + { + "epoch": 0.6521252796420581, + "grad_norm": 0.4042847752571106, + "learning_rate": 4.896851491431185e-06, + "loss": 0.6076, + "step": 583 + }, + { + "epoch": 0.6532438478747203, + "grad_norm": 0.40685543417930603, + "learning_rate": 4.89642691024307e-06, + "loss": 0.6066, + "step": 584 + }, + { + "epoch": 0.6543624161073825, + "grad_norm": 0.41699886322021484, + "learning_rate": 4.896001475498682e-06, + "loss": 0.6091, + "step": 585 + }, + { + "epoch": 0.6554809843400448, + "grad_norm": 0.3905144929885864, + "learning_rate": 4.89557518734955e-06, + "loss": 0.6272, + "step": 586 + }, + { + "epoch": 0.656599552572707, + "grad_norm": 0.40033814311027527, + "learning_rate": 4.895148045947509e-06, + "loss": 0.6183, + "step": 587 + }, + { + "epoch": 0.6577181208053692, + "grad_norm": 0.39996397495269775, + "learning_rate": 4.894720051444698e-06, + "loss": 0.5996, + "step": 588 + }, + { + "epoch": 0.6588366890380313, + "grad_norm": 0.42592981457710266, + "learning_rate": 4.894291203993561e-06, + "loss": 0.6506, + "step": 589 + }, + { + "epoch": 0.6599552572706935, + "grad_norm": 0.40710797905921936, + "learning_rate": 4.8938615037468405e-06, + "loss": 0.6044, + "step": 590 + }, + { + "epoch": 0.6610738255033557, + "grad_norm": 0.427405446767807, + "learning_rate": 4.893430950857591e-06, + "loss": 0.6236, + "step": 591 + }, + { + "epoch": 0.6621923937360179, + "grad_norm": 0.40190666913986206, + "learning_rate": 4.892999545479163e-06, + "loss": 0.6031, + "step": 592 + }, + { + "epoch": 0.6633109619686801, + "grad_norm": 0.4019568860530853, + "learning_rate": 4.8925672877652155e-06, + "loss": 0.6232, + "step": 593 + }, + { + "epoch": 0.6644295302013423, + "grad_norm": 0.40001606941223145, + "learning_rate": 4.892134177869709e-06, + "loss": 0.6141, + "step": 594 + }, + { + "epoch": 0.6655480984340044, + "grad_norm": 0.40650853514671326, + "learning_rate": 4.891700215946909e-06, + "loss": 0.6011, + "step": 595 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.41019386053085327, + "learning_rate": 4.8912654021513815e-06, + "loss": 0.6262, + "step": 596 + }, + { + "epoch": 0.6677852348993288, + "grad_norm": 0.4064581096172333, + "learning_rate": 4.890829736638e-06, + "loss": 0.6329, + "step": 597 + }, + { + "epoch": 0.668903803131991, + "grad_norm": 0.4091740846633911, + "learning_rate": 4.890393219561938e-06, + "loss": 0.6193, + "step": 598 + }, + { + "epoch": 0.6700223713646533, + "grad_norm": 0.407552570104599, + "learning_rate": 4.889955851078674e-06, + "loss": 0.6535, + "step": 599 + }, + { + "epoch": 0.6711409395973155, + "grad_norm": 0.3950585126876831, + "learning_rate": 4.889517631343988e-06, + "loss": 0.6033, + "step": 600 + }, + { + "epoch": 0.6722595078299777, + "grad_norm": 0.39515334367752075, + "learning_rate": 4.889078560513968e-06, + "loss": 0.6006, + "step": 601 + }, + { + "epoch": 0.6733780760626398, + "grad_norm": 0.4188336431980133, + "learning_rate": 4.888638638744999e-06, + "loss": 0.6333, + "step": 602 + }, + { + "epoch": 0.674496644295302, + "grad_norm": 0.4050522446632385, + "learning_rate": 4.888197866193772e-06, + "loss": 0.6329, + "step": 603 + }, + { + "epoch": 0.6756152125279642, + "grad_norm": 0.39428868889808655, + "learning_rate": 4.887756243017282e-06, + "loss": 0.6007, + "step": 604 + }, + { + "epoch": 0.6767337807606264, + "grad_norm": 0.402410626411438, + "learning_rate": 4.887313769372823e-06, + "loss": 0.5885, + "step": 605 + }, + { + "epoch": 0.6778523489932886, + "grad_norm": 0.4062318205833435, + "learning_rate": 4.886870445417998e-06, + "loss": 0.6312, + "step": 606 + }, + { + "epoch": 0.6789709172259508, + "grad_norm": 0.4123631715774536, + "learning_rate": 4.886426271310708e-06, + "loss": 0.619, + "step": 607 + }, + { + "epoch": 0.680089485458613, + "grad_norm": 0.40353336930274963, + "learning_rate": 4.885981247209159e-06, + "loss": 0.6324, + "step": 608 + }, + { + "epoch": 0.6812080536912751, + "grad_norm": 0.40726402401924133, + "learning_rate": 4.885535373271858e-06, + "loss": 0.5819, + "step": 609 + }, + { + "epoch": 0.6823266219239373, + "grad_norm": 0.4142349660396576, + "learning_rate": 4.885088649657618e-06, + "loss": 0.6175, + "step": 610 + }, + { + "epoch": 0.6834451901565995, + "grad_norm": 0.4165160059928894, + "learning_rate": 4.884641076525549e-06, + "loss": 0.597, + "step": 611 + }, + { + "epoch": 0.6845637583892618, + "grad_norm": 0.4037843942642212, + "learning_rate": 4.884192654035069e-06, + "loss": 0.6183, + "step": 612 + }, + { + "epoch": 0.685682326621924, + "grad_norm": 0.41777777671813965, + "learning_rate": 4.883743382345898e-06, + "loss": 0.6063, + "step": 613 + }, + { + "epoch": 0.6868008948545862, + "grad_norm": 0.41021421551704407, + "learning_rate": 4.883293261618054e-06, + "loss": 0.6134, + "step": 614 + }, + { + "epoch": 0.6879194630872483, + "grad_norm": 0.4215847849845886, + "learning_rate": 4.882842292011863e-06, + "loss": 0.6458, + "step": 615 + }, + { + "epoch": 0.6890380313199105, + "grad_norm": 0.42801633477211, + "learning_rate": 4.882390473687949e-06, + "loss": 0.6259, + "step": 616 + }, + { + "epoch": 0.6901565995525727, + "grad_norm": 0.40879589319229126, + "learning_rate": 4.881937806807241e-06, + "loss": 0.6208, + "step": 617 + }, + { + "epoch": 0.6912751677852349, + "grad_norm": 0.39453622698783875, + "learning_rate": 4.881484291530969e-06, + "loss": 0.5966, + "step": 618 + }, + { + "epoch": 0.6923937360178971, + "grad_norm": 0.3992539048194885, + "learning_rate": 4.881029928020666e-06, + "loss": 0.5976, + "step": 619 + }, + { + "epoch": 0.6935123042505593, + "grad_norm": 0.4175397753715515, + "learning_rate": 4.880574716438166e-06, + "loss": 0.6261, + "step": 620 + }, + { + "epoch": 0.6946308724832215, + "grad_norm": 0.40533408522605896, + "learning_rate": 4.880118656945606e-06, + "loss": 0.5945, + "step": 621 + }, + { + "epoch": 0.6957494407158836, + "grad_norm": 0.4089728593826294, + "learning_rate": 4.879661749705424e-06, + "loss": 0.6226, + "step": 622 + }, + { + "epoch": 0.6968680089485458, + "grad_norm": 0.4341566562652588, + "learning_rate": 4.879203994880362e-06, + "loss": 0.6463, + "step": 623 + }, + { + "epoch": 0.697986577181208, + "grad_norm": 0.44256189465522766, + "learning_rate": 4.878745392633462e-06, + "loss": 0.653, + "step": 624 + }, + { + "epoch": 0.6991051454138703, + "grad_norm": 0.4098159372806549, + "learning_rate": 4.878285943128067e-06, + "loss": 0.5808, + "step": 625 + }, + { + "epoch": 0.7002237136465325, + "grad_norm": 0.43130752444267273, + "learning_rate": 4.8778256465278245e-06, + "loss": 0.6261, + "step": 626 + }, + { + "epoch": 0.7013422818791947, + "grad_norm": 0.4110218286514282, + "learning_rate": 4.877364502996682e-06, + "loss": 0.5954, + "step": 627 + }, + { + "epoch": 0.7024608501118568, + "grad_norm": 0.42106136679649353, + "learning_rate": 4.87690251269889e-06, + "loss": 0.6026, + "step": 628 + }, + { + "epoch": 0.703579418344519, + "grad_norm": 0.4233524203300476, + "learning_rate": 4.876439675798997e-06, + "loss": 0.6432, + "step": 629 + }, + { + "epoch": 0.7046979865771812, + "grad_norm": 0.42107197642326355, + "learning_rate": 4.87597599246186e-06, + "loss": 0.6198, + "step": 630 + }, + { + "epoch": 0.7058165548098434, + "grad_norm": 0.43851831555366516, + "learning_rate": 4.875511462852628e-06, + "loss": 0.6293, + "step": 631 + }, + { + "epoch": 0.7069351230425056, + "grad_norm": 0.41345685720443726, + "learning_rate": 4.87504608713676e-06, + "loss": 0.6178, + "step": 632 + }, + { + "epoch": 0.7080536912751678, + "grad_norm": 0.41011178493499756, + "learning_rate": 4.874579865480013e-06, + "loss": 0.6441, + "step": 633 + }, + { + "epoch": 0.70917225950783, + "grad_norm": 0.41372135281562805, + "learning_rate": 4.874112798048442e-06, + "loss": 0.6142, + "step": 634 + }, + { + "epoch": 0.7102908277404921, + "grad_norm": 0.41231900453567505, + "learning_rate": 4.8736448850084105e-06, + "loss": 0.6277, + "step": 635 + }, + { + "epoch": 0.7114093959731543, + "grad_norm": 0.4147928059101105, + "learning_rate": 4.873176126526578e-06, + "loss": 0.6197, + "step": 636 + }, + { + "epoch": 0.7125279642058165, + "grad_norm": 0.4046717882156372, + "learning_rate": 4.8727065227699035e-06, + "loss": 0.6138, + "step": 637 + }, + { + "epoch": 0.7136465324384788, + "grad_norm": 0.4150887727737427, + "learning_rate": 4.872236073905654e-06, + "loss": 0.616, + "step": 638 + }, + { + "epoch": 0.714765100671141, + "grad_norm": 0.41429632902145386, + "learning_rate": 4.87176478010139e-06, + "loss": 0.6153, + "step": 639 + }, + { + "epoch": 0.7158836689038032, + "grad_norm": 0.41153407096862793, + "learning_rate": 4.8712926415249785e-06, + "loss": 0.6171, + "step": 640 + }, + { + "epoch": 0.7170022371364653, + "grad_norm": 0.4178698658943176, + "learning_rate": 4.870819658344584e-06, + "loss": 0.6417, + "step": 641 + }, + { + "epoch": 0.7181208053691275, + "grad_norm": 0.40587952733039856, + "learning_rate": 4.870345830728675e-06, + "loss": 0.6206, + "step": 642 + }, + { + "epoch": 0.7192393736017897, + "grad_norm": 0.42633864283561707, + "learning_rate": 4.869871158846016e-06, + "loss": 0.6246, + "step": 643 + }, + { + "epoch": 0.7203579418344519, + "grad_norm": 0.41023534536361694, + "learning_rate": 4.8693956428656766e-06, + "loss": 0.601, + "step": 644 + }, + { + "epoch": 0.7214765100671141, + "grad_norm": 0.40645042061805725, + "learning_rate": 4.868919282957024e-06, + "loss": 0.6193, + "step": 645 + }, + { + "epoch": 0.7225950782997763, + "grad_norm": 0.40088531374931335, + "learning_rate": 4.86844207928973e-06, + "loss": 0.5869, + "step": 646 + }, + { + "epoch": 0.7237136465324385, + "grad_norm": 0.4136696755886078, + "learning_rate": 4.8679640320337625e-06, + "loss": 0.6413, + "step": 647 + }, + { + "epoch": 0.7248322147651006, + "grad_norm": 0.40026187896728516, + "learning_rate": 4.867485141359394e-06, + "loss": 0.6075, + "step": 648 + }, + { + "epoch": 0.7259507829977628, + "grad_norm": 0.40911242365837097, + "learning_rate": 4.867005407437192e-06, + "loss": 0.6411, + "step": 649 + }, + { + "epoch": 0.727069351230425, + "grad_norm": 0.42306697368621826, + "learning_rate": 4.866524830438029e-06, + "loss": 0.6376, + "step": 650 + }, + { + "epoch": 0.7281879194630873, + "grad_norm": 0.40857061743736267, + "learning_rate": 4.866043410533077e-06, + "loss": 0.6071, + "step": 651 + }, + { + "epoch": 0.7293064876957495, + "grad_norm": 0.41601142287254333, + "learning_rate": 4.8655611478938055e-06, + "loss": 0.6079, + "step": 652 + }, + { + "epoch": 0.7304250559284117, + "grad_norm": 0.40857282280921936, + "learning_rate": 4.8650780426919895e-06, + "loss": 0.6246, + "step": 653 + }, + { + "epoch": 0.7315436241610739, + "grad_norm": 0.4063502252101898, + "learning_rate": 4.864594095099697e-06, + "loss": 0.6105, + "step": 654 + }, + { + "epoch": 0.732662192393736, + "grad_norm": 0.40278729796409607, + "learning_rate": 4.864109305289303e-06, + "loss": 0.5936, + "step": 655 + }, + { + "epoch": 0.7337807606263982, + "grad_norm": 0.4201098382472992, + "learning_rate": 4.863623673433478e-06, + "loss": 0.6081, + "step": 656 + }, + { + "epoch": 0.7348993288590604, + "grad_norm": 0.40003877878189087, + "learning_rate": 4.863137199705192e-06, + "loss": 0.6085, + "step": 657 + }, + { + "epoch": 0.7360178970917226, + "grad_norm": 0.41234898567199707, + "learning_rate": 4.86264988427772e-06, + "loss": 0.6252, + "step": 658 + }, + { + "epoch": 0.7371364653243848, + "grad_norm": 0.4233507513999939, + "learning_rate": 4.862161727324632e-06, + "loss": 0.5987, + "step": 659 + }, + { + "epoch": 0.738255033557047, + "grad_norm": 0.4099391996860504, + "learning_rate": 4.861672729019798e-06, + "loss": 0.6293, + "step": 660 + }, + { + "epoch": 0.7393736017897091, + "grad_norm": 0.4255772829055786, + "learning_rate": 4.861182889537389e-06, + "loss": 0.6268, + "step": 661 + }, + { + "epoch": 0.7404921700223713, + "grad_norm": 0.4317517578601837, + "learning_rate": 4.860692209051877e-06, + "loss": 0.6444, + "step": 662 + }, + { + "epoch": 0.7416107382550335, + "grad_norm": 0.4352816939353943, + "learning_rate": 4.86020068773803e-06, + "loss": 0.6304, + "step": 663 + }, + { + "epoch": 0.7427293064876958, + "grad_norm": 0.3987254500389099, + "learning_rate": 4.859708325770919e-06, + "loss": 0.611, + "step": 664 + }, + { + "epoch": 0.743847874720358, + "grad_norm": 0.4213384985923767, + "learning_rate": 4.859215123325912e-06, + "loss": 0.6292, + "step": 665 + }, + { + "epoch": 0.7449664429530202, + "grad_norm": 0.4062172472476959, + "learning_rate": 4.8587210805786785e-06, + "loss": 0.6197, + "step": 666 + }, + { + "epoch": 0.7460850111856824, + "grad_norm": 0.41443362832069397, + "learning_rate": 4.858226197705183e-06, + "loss": 0.6414, + "step": 667 + }, + { + "epoch": 0.7472035794183445, + "grad_norm": 0.4183506667613983, + "learning_rate": 4.857730474881696e-06, + "loss": 0.6294, + "step": 668 + }, + { + "epoch": 0.7483221476510067, + "grad_norm": 0.42685073614120483, + "learning_rate": 4.857233912284781e-06, + "loss": 0.6264, + "step": 669 + }, + { + "epoch": 0.7494407158836689, + "grad_norm": 0.4143792390823364, + "learning_rate": 4.856736510091304e-06, + "loss": 0.6575, + "step": 670 + }, + { + "epoch": 0.7505592841163311, + "grad_norm": 0.4124217629432678, + "learning_rate": 4.8562382684784284e-06, + "loss": 0.6295, + "step": 671 + }, + { + "epoch": 0.7516778523489933, + "grad_norm": 0.4060792624950409, + "learning_rate": 4.855739187623619e-06, + "loss": 0.5983, + "step": 672 + }, + { + "epoch": 0.7527964205816555, + "grad_norm": 0.4100533723831177, + "learning_rate": 4.855239267704635e-06, + "loss": 0.6271, + "step": 673 + }, + { + "epoch": 0.7539149888143176, + "grad_norm": 0.4047471582889557, + "learning_rate": 4.854738508899538e-06, + "loss": 0.5843, + "step": 674 + }, + { + "epoch": 0.7550335570469798, + "grad_norm": 0.41550201177597046, + "learning_rate": 4.854236911386689e-06, + "loss": 0.6015, + "step": 675 + }, + { + "epoch": 0.756152125279642, + "grad_norm": 0.4035356044769287, + "learning_rate": 4.853734475344745e-06, + "loss": 0.6085, + "step": 676 + }, + { + "epoch": 0.7572706935123042, + "grad_norm": 0.4054676294326782, + "learning_rate": 4.853231200952665e-06, + "loss": 0.5879, + "step": 677 + }, + { + "epoch": 0.7583892617449665, + "grad_norm": 0.4165349304676056, + "learning_rate": 4.852727088389702e-06, + "loss": 0.6065, + "step": 678 + }, + { + "epoch": 0.7595078299776287, + "grad_norm": 0.41854768991470337, + "learning_rate": 4.8522221378354125e-06, + "loss": 0.6115, + "step": 679 + }, + { + "epoch": 0.7606263982102909, + "grad_norm": 0.4189227223396301, + "learning_rate": 4.851716349469647e-06, + "loss": 0.6174, + "step": 680 + }, + { + "epoch": 0.761744966442953, + "grad_norm": 0.44432833790779114, + "learning_rate": 4.851209723472559e-06, + "loss": 0.6382, + "step": 681 + }, + { + "epoch": 0.7628635346756152, + "grad_norm": 0.4199828803539276, + "learning_rate": 4.8507022600245954e-06, + "loss": 0.6125, + "step": 682 + }, + { + "epoch": 0.7639821029082774, + "grad_norm": 0.44079893827438354, + "learning_rate": 4.850193959306506e-06, + "loss": 0.6263, + "step": 683 + }, + { + "epoch": 0.7651006711409396, + "grad_norm": 0.41406047344207764, + "learning_rate": 4.8496848214993355e-06, + "loss": 0.5979, + "step": 684 + }, + { + "epoch": 0.7662192393736018, + "grad_norm": 0.43209850788116455, + "learning_rate": 4.849174846784428e-06, + "loss": 0.6451, + "step": 685 + }, + { + "epoch": 0.767337807606264, + "grad_norm": 0.4180072844028473, + "learning_rate": 4.848664035343425e-06, + "loss": 0.6009, + "step": 686 + }, + { + "epoch": 0.7684563758389261, + "grad_norm": 0.4092356860637665, + "learning_rate": 4.8481523873582685e-06, + "loss": 0.6431, + "step": 687 + }, + { + "epoch": 0.7695749440715883, + "grad_norm": 0.41440829634666443, + "learning_rate": 4.847639903011196e-06, + "loss": 0.6001, + "step": 688 + }, + { + "epoch": 0.7706935123042505, + "grad_norm": 0.4246008098125458, + "learning_rate": 4.8471265824847415e-06, + "loss": 0.6137, + "step": 689 + }, + { + "epoch": 0.7718120805369127, + "grad_norm": 0.4177666902542114, + "learning_rate": 4.846612425961742e-06, + "loss": 0.6026, + "step": 690 + }, + { + "epoch": 0.772930648769575, + "grad_norm": 0.4130840003490448, + "learning_rate": 4.846097433625327e-06, + "loss": 0.6183, + "step": 691 + }, + { + "epoch": 0.7740492170022372, + "grad_norm": 0.406780868768692, + "learning_rate": 4.845581605658926e-06, + "loss": 0.5992, + "step": 692 + }, + { + "epoch": 0.7751677852348994, + "grad_norm": 0.42086103558540344, + "learning_rate": 4.845064942246267e-06, + "loss": 0.6057, + "step": 693 + }, + { + "epoch": 0.7762863534675615, + "grad_norm": 0.4122505486011505, + "learning_rate": 4.844547443571374e-06, + "loss": 0.6134, + "step": 694 + }, + { + "epoch": 0.7774049217002237, + "grad_norm": 0.43634387850761414, + "learning_rate": 4.8440291098185686e-06, + "loss": 0.6044, + "step": 695 + }, + { + "epoch": 0.7785234899328859, + "grad_norm": 0.4160690903663635, + "learning_rate": 4.843509941172471e-06, + "loss": 0.6046, + "step": 696 + }, + { + "epoch": 0.7796420581655481, + "grad_norm": 0.41897231340408325, + "learning_rate": 4.842989937817997e-06, + "loss": 0.6186, + "step": 697 + }, + { + "epoch": 0.7807606263982103, + "grad_norm": 0.4187341034412384, + "learning_rate": 4.842469099940361e-06, + "loss": 0.6266, + "step": 698 + }, + { + "epoch": 0.7818791946308725, + "grad_norm": 0.4075968563556671, + "learning_rate": 4.841947427725076e-06, + "loss": 0.5772, + "step": 699 + }, + { + "epoch": 0.7829977628635347, + "grad_norm": 0.4157114028930664, + "learning_rate": 4.841424921357948e-06, + "loss": 0.5999, + "step": 700 + }, + { + "epoch": 0.7841163310961968, + "grad_norm": 0.4198933243751526, + "learning_rate": 4.840901581025083e-06, + "loss": 0.6273, + "step": 701 + }, + { + "epoch": 0.785234899328859, + "grad_norm": 0.42646607756614685, + "learning_rate": 4.840377406912887e-06, + "loss": 0.6074, + "step": 702 + }, + { + "epoch": 0.7863534675615212, + "grad_norm": 0.42644554376602173, + "learning_rate": 4.839852399208056e-06, + "loss": 0.5872, + "step": 703 + }, + { + "epoch": 0.7874720357941835, + "grad_norm": 0.43172845244407654, + "learning_rate": 4.839326558097587e-06, + "loss": 0.633, + "step": 704 + }, + { + "epoch": 0.7885906040268457, + "grad_norm": 0.4165332317352295, + "learning_rate": 4.838799883768775e-06, + "loss": 0.6206, + "step": 705 + }, + { + "epoch": 0.7897091722595079, + "grad_norm": 0.4209877550601959, + "learning_rate": 4.83827237640921e-06, + "loss": 0.6015, + "step": 706 + }, + { + "epoch": 0.79082774049217, + "grad_norm": 0.4267021715641022, + "learning_rate": 4.837744036206777e-06, + "loss": 0.5975, + "step": 707 + }, + { + "epoch": 0.7919463087248322, + "grad_norm": 0.4415457546710968, + "learning_rate": 4.837214863349662e-06, + "loss": 0.6251, + "step": 708 + }, + { + "epoch": 0.7930648769574944, + "grad_norm": 0.43104031682014465, + "learning_rate": 4.836684858026343e-06, + "loss": 0.6048, + "step": 709 + }, + { + "epoch": 0.7941834451901566, + "grad_norm": 0.41736820340156555, + "learning_rate": 4.8361540204255985e-06, + "loss": 0.5948, + "step": 710 + }, + { + "epoch": 0.7953020134228188, + "grad_norm": 0.4202009439468384, + "learning_rate": 4.835622350736499e-06, + "loss": 0.6099, + "step": 711 + }, + { + "epoch": 0.796420581655481, + "grad_norm": 0.42279568314552307, + "learning_rate": 4.8350898491484175e-06, + "loss": 0.6247, + "step": 712 + }, + { + "epoch": 0.7975391498881432, + "grad_norm": 0.4266239404678345, + "learning_rate": 4.8345565158510176e-06, + "loss": 0.6136, + "step": 713 + }, + { + "epoch": 0.7986577181208053, + "grad_norm": 0.42605841159820557, + "learning_rate": 4.83402235103426e-06, + "loss": 0.6001, + "step": 714 + }, + { + "epoch": 0.7997762863534675, + "grad_norm": 0.42846307158470154, + "learning_rate": 4.8334873548884055e-06, + "loss": 0.5941, + "step": 715 + }, + { + "epoch": 0.8008948545861297, + "grad_norm": 0.44009047746658325, + "learning_rate": 4.832951527604007e-06, + "loss": 0.622, + "step": 716 + }, + { + "epoch": 0.802013422818792, + "grad_norm": 0.44512951374053955, + "learning_rate": 4.8324148693719145e-06, + "loss": 0.6507, + "step": 717 + }, + { + "epoch": 0.8031319910514542, + "grad_norm": 0.455010324716568, + "learning_rate": 4.831877380383276e-06, + "loss": 0.6201, + "step": 718 + }, + { + "epoch": 0.8042505592841164, + "grad_norm": 0.43456459045410156, + "learning_rate": 4.83133906082953e-06, + "loss": 0.623, + "step": 719 + }, + { + "epoch": 0.8053691275167785, + "grad_norm": 0.42063653469085693, + "learning_rate": 4.830799910902418e-06, + "loss": 0.5841, + "step": 720 + }, + { + "epoch": 0.8064876957494407, + "grad_norm": 0.41323843598365784, + "learning_rate": 4.8302599307939725e-06, + "loss": 0.6127, + "step": 721 + }, + { + "epoch": 0.8076062639821029, + "grad_norm": 0.41982001066207886, + "learning_rate": 4.829719120696523e-06, + "loss": 0.6274, + "step": 722 + }, + { + "epoch": 0.8087248322147651, + "grad_norm": 0.43330860137939453, + "learning_rate": 4.829177480802694e-06, + "loss": 0.6416, + "step": 723 + }, + { + "epoch": 0.8098434004474273, + "grad_norm": 0.4351330101490021, + "learning_rate": 4.828635011305407e-06, + "loss": 0.6399, + "step": 724 + }, + { + "epoch": 0.8109619686800895, + "grad_norm": 0.4017598032951355, + "learning_rate": 4.828091712397878e-06, + "loss": 0.5817, + "step": 725 + }, + { + "epoch": 0.8120805369127517, + "grad_norm": 0.42594751715660095, + "learning_rate": 4.827547584273618e-06, + "loss": 0.6438, + "step": 726 + }, + { + "epoch": 0.8131991051454138, + "grad_norm": 0.409135639667511, + "learning_rate": 4.827002627126433e-06, + "loss": 0.5797, + "step": 727 + }, + { + "epoch": 0.814317673378076, + "grad_norm": 0.4304857850074768, + "learning_rate": 4.826456841150428e-06, + "loss": 0.6173, + "step": 728 + }, + { + "epoch": 0.8154362416107382, + "grad_norm": 0.446872740983963, + "learning_rate": 4.825910226539997e-06, + "loss": 0.6059, + "step": 729 + }, + { + "epoch": 0.8165548098434005, + "grad_norm": 0.42625290155410767, + "learning_rate": 4.8253627834898355e-06, + "loss": 0.5994, + "step": 730 + }, + { + "epoch": 0.8176733780760627, + "grad_norm": 0.42183029651641846, + "learning_rate": 4.824814512194929e-06, + "loss": 0.6202, + "step": 731 + }, + { + "epoch": 0.8187919463087249, + "grad_norm": 0.4235664904117584, + "learning_rate": 4.824265412850559e-06, + "loss": 0.6263, + "step": 732 + }, + { + "epoch": 0.819910514541387, + "grad_norm": 0.4118615686893463, + "learning_rate": 4.823715485652307e-06, + "loss": 0.6058, + "step": 733 + }, + { + "epoch": 0.8210290827740492, + "grad_norm": 0.43514224886894226, + "learning_rate": 4.823164730796042e-06, + "loss": 0.6092, + "step": 734 + }, + { + "epoch": 0.8221476510067114, + "grad_norm": 0.41756734251976013, + "learning_rate": 4.8226131484779325e-06, + "loss": 0.6281, + "step": 735 + }, + { + "epoch": 0.8232662192393736, + "grad_norm": 0.438475638628006, + "learning_rate": 4.822060738894439e-06, + "loss": 0.6122, + "step": 736 + }, + { + "epoch": 0.8243847874720358, + "grad_norm": 0.426792174577713, + "learning_rate": 4.821507502242321e-06, + "loss": 0.6407, + "step": 737 + }, + { + "epoch": 0.825503355704698, + "grad_norm": 0.42697012424468994, + "learning_rate": 4.820953438718626e-06, + "loss": 0.5996, + "step": 738 + }, + { + "epoch": 0.8266219239373602, + "grad_norm": 0.42373016476631165, + "learning_rate": 4.820398548520702e-06, + "loss": 0.6075, + "step": 739 + }, + { + "epoch": 0.8277404921700223, + "grad_norm": 0.42235615849494934, + "learning_rate": 4.81984283184619e-06, + "loss": 0.608, + "step": 740 + }, + { + "epoch": 0.8288590604026845, + "grad_norm": 0.41180866956710815, + "learning_rate": 4.819286288893022e-06, + "loss": 0.6127, + "step": 741 + }, + { + "epoch": 0.8299776286353467, + "grad_norm": 0.4207548499107361, + "learning_rate": 4.818728919859426e-06, + "loss": 0.6131, + "step": 742 + }, + { + "epoch": 0.831096196868009, + "grad_norm": 0.4295390546321869, + "learning_rate": 4.818170724943928e-06, + "loss": 0.629, + "step": 743 + }, + { + "epoch": 0.8322147651006712, + "grad_norm": 0.4099291265010834, + "learning_rate": 4.817611704345344e-06, + "loss": 0.6055, + "step": 744 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.4142029583454132, + "learning_rate": 4.817051858262785e-06, + "loss": 0.6127, + "step": 745 + }, + { + "epoch": 0.8344519015659956, + "grad_norm": 0.41662877798080444, + "learning_rate": 4.816491186895656e-06, + "loss": 0.6171, + "step": 746 + }, + { + "epoch": 0.8355704697986577, + "grad_norm": 0.4345078766345978, + "learning_rate": 4.815929690443657e-06, + "loss": 0.6091, + "step": 747 + }, + { + "epoch": 0.8366890380313199, + "grad_norm": 0.43010810017585754, + "learning_rate": 4.8153673691067806e-06, + "loss": 0.626, + "step": 748 + }, + { + "epoch": 0.8378076062639821, + "grad_norm": 0.4256346821784973, + "learning_rate": 4.814804223085313e-06, + "loss": 0.6216, + "step": 749 + }, + { + "epoch": 0.8389261744966443, + "grad_norm": 0.42812031507492065, + "learning_rate": 4.814240252579836e-06, + "loss": 0.6138, + "step": 750 + }, + { + "epoch": 0.8400447427293065, + "grad_norm": 0.41682377457618713, + "learning_rate": 4.813675457791224e-06, + "loss": 0.5783, + "step": 751 + }, + { + "epoch": 0.8411633109619687, + "grad_norm": 0.4257197380065918, + "learning_rate": 4.8131098389206435e-06, + "loss": 0.6006, + "step": 752 + }, + { + "epoch": 0.8422818791946308, + "grad_norm": 0.42872053384780884, + "learning_rate": 4.812543396169557e-06, + "loss": 0.6272, + "step": 753 + }, + { + "epoch": 0.843400447427293, + "grad_norm": 0.4263060986995697, + "learning_rate": 4.81197612973972e-06, + "loss": 0.6093, + "step": 754 + }, + { + "epoch": 0.8445190156599552, + "grad_norm": 0.4151028096675873, + "learning_rate": 4.811408039833178e-06, + "loss": 0.5773, + "step": 755 + }, + { + "epoch": 0.8456375838926175, + "grad_norm": 0.4382091164588928, + "learning_rate": 4.810839126652275e-06, + "loss": 0.596, + "step": 756 + }, + { + "epoch": 0.8467561521252797, + "grad_norm": 0.4290934205055237, + "learning_rate": 4.810269390399646e-06, + "loss": 0.5904, + "step": 757 + }, + { + "epoch": 0.8478747203579419, + "grad_norm": 0.4299798011779785, + "learning_rate": 4.809698831278217e-06, + "loss": 0.6449, + "step": 758 + }, + { + "epoch": 0.8489932885906041, + "grad_norm": 0.4188365042209625, + "learning_rate": 4.809127449491211e-06, + "loss": 0.6007, + "step": 759 + }, + { + "epoch": 0.8501118568232662, + "grad_norm": 0.41594186425209045, + "learning_rate": 4.808555245242141e-06, + "loss": 0.5888, + "step": 760 + }, + { + "epoch": 0.8512304250559284, + "grad_norm": 0.4184630513191223, + "learning_rate": 4.807982218734814e-06, + "loss": 0.6495, + "step": 761 + }, + { + "epoch": 0.8523489932885906, + "grad_norm": 0.418849378824234, + "learning_rate": 4.80740837017333e-06, + "loss": 0.6128, + "step": 762 + }, + { + "epoch": 0.8534675615212528, + "grad_norm": 0.42030590772628784, + "learning_rate": 4.8068336997620804e-06, + "loss": 0.6294, + "step": 763 + }, + { + "epoch": 0.854586129753915, + "grad_norm": 0.4204208552837372, + "learning_rate": 4.806258207705753e-06, + "loss": 0.6279, + "step": 764 + }, + { + "epoch": 0.8557046979865772, + "grad_norm": 0.41544729471206665, + "learning_rate": 4.805681894209324e-06, + "loss": 0.6235, + "step": 765 + }, + { + "epoch": 0.8568232662192393, + "grad_norm": 0.430301308631897, + "learning_rate": 4.805104759478065e-06, + "loss": 0.5876, + "step": 766 + }, + { + "epoch": 0.8579418344519015, + "grad_norm": 0.4640876054763794, + "learning_rate": 4.804526803717539e-06, + "loss": 0.6264, + "step": 767 + }, + { + "epoch": 0.8590604026845637, + "grad_norm": 0.43245360255241394, + "learning_rate": 4.8039480271336005e-06, + "loss": 0.5871, + "step": 768 + }, + { + "epoch": 0.860178970917226, + "grad_norm": 0.41617804765701294, + "learning_rate": 4.803368429932399e-06, + "loss": 0.6218, + "step": 769 + }, + { + "epoch": 0.8612975391498882, + "grad_norm": 0.4325237274169922, + "learning_rate": 4.8027880123203726e-06, + "loss": 0.5874, + "step": 770 + }, + { + "epoch": 0.8624161073825504, + "grad_norm": 0.4480580687522888, + "learning_rate": 4.802206774504255e-06, + "loss": 0.6093, + "step": 771 + }, + { + "epoch": 0.8635346756152126, + "grad_norm": 0.44137299060821533, + "learning_rate": 4.801624716691072e-06, + "loss": 0.6031, + "step": 772 + }, + { + "epoch": 0.8646532438478747, + "grad_norm": 0.4269614815711975, + "learning_rate": 4.801041839088139e-06, + "loss": 0.5963, + "step": 773 + }, + { + "epoch": 0.8657718120805369, + "grad_norm": 0.4120911657810211, + "learning_rate": 4.800458141903064e-06, + "loss": 0.5959, + "step": 774 + }, + { + "epoch": 0.8668903803131991, + "grad_norm": 0.4332381784915924, + "learning_rate": 4.799873625343747e-06, + "loss": 0.6007, + "step": 775 + }, + { + "epoch": 0.8680089485458613, + "grad_norm": 0.4361085295677185, + "learning_rate": 4.7992882896183825e-06, + "loss": 0.6012, + "step": 776 + }, + { + "epoch": 0.8691275167785235, + "grad_norm": 0.43251463770866394, + "learning_rate": 4.798702134935454e-06, + "loss": 0.5799, + "step": 777 + }, + { + "epoch": 0.8702460850111857, + "grad_norm": 0.4305305778980255, + "learning_rate": 4.798115161503735e-06, + "loss": 0.6068, + "step": 778 + }, + { + "epoch": 0.8713646532438478, + "grad_norm": 0.4410499930381775, + "learning_rate": 4.797527369532296e-06, + "loss": 0.6486, + "step": 779 + }, + { + "epoch": 0.87248322147651, + "grad_norm": 0.43271416425704956, + "learning_rate": 4.796938759230494e-06, + "loss": 0.6367, + "step": 780 + }, + { + "epoch": 0.8736017897091722, + "grad_norm": 0.430905282497406, + "learning_rate": 4.7963493308079815e-06, + "loss": 0.5753, + "step": 781 + }, + { + "epoch": 0.8747203579418344, + "grad_norm": 0.4054125249385834, + "learning_rate": 4.7957590844746986e-06, + "loss": 0.5806, + "step": 782 + }, + { + "epoch": 0.8758389261744967, + "grad_norm": 0.4322145879268646, + "learning_rate": 4.795168020440878e-06, + "loss": 0.5989, + "step": 783 + }, + { + "epoch": 0.8769574944071589, + "grad_norm": 0.433716744184494, + "learning_rate": 4.7945761389170464e-06, + "loss": 0.6284, + "step": 784 + }, + { + "epoch": 0.8780760626398211, + "grad_norm": 0.4301673471927643, + "learning_rate": 4.793983440114018e-06, + "loss": 0.6469, + "step": 785 + }, + { + "epoch": 0.8791946308724832, + "grad_norm": 0.4279182553291321, + "learning_rate": 4.7933899242428986e-06, + "loss": 0.6032, + "step": 786 + }, + { + "epoch": 0.8803131991051454, + "grad_norm": 0.4133903682231903, + "learning_rate": 4.792795591515087e-06, + "loss": 0.5745, + "step": 787 + }, + { + "epoch": 0.8814317673378076, + "grad_norm": 0.45094674825668335, + "learning_rate": 4.792200442142273e-06, + "loss": 0.6212, + "step": 788 + }, + { + "epoch": 0.8825503355704698, + "grad_norm": 0.4304371774196625, + "learning_rate": 4.7916044763364344e-06, + "loss": 0.61, + "step": 789 + }, + { + "epoch": 0.883668903803132, + "grad_norm": 0.4232103228569031, + "learning_rate": 4.791007694309842e-06, + "loss": 0.5942, + "step": 790 + }, + { + "epoch": 0.8847874720357942, + "grad_norm": 0.41822549700737, + "learning_rate": 4.790410096275057e-06, + "loss": 0.5829, + "step": 791 + }, + { + "epoch": 0.8859060402684564, + "grad_norm": 0.44473201036453247, + "learning_rate": 4.789811682444931e-06, + "loss": 0.6359, + "step": 792 + }, + { + "epoch": 0.8870246085011185, + "grad_norm": 0.42226001620292664, + "learning_rate": 4.7892124530326065e-06, + "loss": 0.5966, + "step": 793 + }, + { + "epoch": 0.8881431767337807, + "grad_norm": 0.4254050552845001, + "learning_rate": 4.788612408251517e-06, + "loss": 0.6211, + "step": 794 + }, + { + "epoch": 0.889261744966443, + "grad_norm": 0.42909836769104004, + "learning_rate": 4.788011548315383e-06, + "loss": 0.6039, + "step": 795 + }, + { + "epoch": 0.8903803131991052, + "grad_norm": 0.4296148419380188, + "learning_rate": 4.78740987343822e-06, + "loss": 0.6055, + "step": 796 + }, + { + "epoch": 0.8914988814317674, + "grad_norm": 0.4225505292415619, + "learning_rate": 4.786807383834332e-06, + "loss": 0.5947, + "step": 797 + }, + { + "epoch": 0.8926174496644296, + "grad_norm": 0.4271566867828369, + "learning_rate": 4.786204079718314e-06, + "loss": 0.6002, + "step": 798 + }, + { + "epoch": 0.8937360178970917, + "grad_norm": 0.42522063851356506, + "learning_rate": 4.785599961305048e-06, + "loss": 0.6231, + "step": 799 + }, + { + "epoch": 0.8948545861297539, + "grad_norm": 0.4384607970714569, + "learning_rate": 4.784995028809707e-06, + "loss": 0.6072, + "step": 800 + }, + { + "epoch": 0.8959731543624161, + "grad_norm": 0.4194418787956238, + "learning_rate": 4.784389282447759e-06, + "loss": 0.5979, + "step": 801 + }, + { + "epoch": 0.8970917225950783, + "grad_norm": 0.43825826048851013, + "learning_rate": 4.7837827224349544e-06, + "loss": 0.6256, + "step": 802 + }, + { + "epoch": 0.8982102908277405, + "grad_norm": 0.43381622433662415, + "learning_rate": 4.783175348987339e-06, + "loss": 0.5932, + "step": 803 + }, + { + "epoch": 0.8993288590604027, + "grad_norm": 0.4468502700328827, + "learning_rate": 4.7825671623212456e-06, + "loss": 0.618, + "step": 804 + }, + { + "epoch": 0.9004474272930649, + "grad_norm": 0.4352877140045166, + "learning_rate": 4.781958162653298e-06, + "loss": 0.5898, + "step": 805 + }, + { + "epoch": 0.901565995525727, + "grad_norm": 0.4242689907550812, + "learning_rate": 4.781348350200408e-06, + "loss": 0.5856, + "step": 806 + }, + { + "epoch": 0.9026845637583892, + "grad_norm": 0.4262087941169739, + "learning_rate": 4.780737725179778e-06, + "loss": 0.5994, + "step": 807 + }, + { + "epoch": 0.9038031319910514, + "grad_norm": 0.42303264141082764, + "learning_rate": 4.780126287808899e-06, + "loss": 0.6106, + "step": 808 + }, + { + "epoch": 0.9049217002237137, + "grad_norm": 0.43589121103286743, + "learning_rate": 4.779514038305555e-06, + "loss": 0.6251, + "step": 809 + }, + { + "epoch": 0.9060402684563759, + "grad_norm": 0.43768516182899475, + "learning_rate": 4.778900976887813e-06, + "loss": 0.6124, + "step": 810 + }, + { + "epoch": 0.9071588366890381, + "grad_norm": 0.4439849257469177, + "learning_rate": 4.778287103774033e-06, + "loss": 0.6397, + "step": 811 + }, + { + "epoch": 0.9082774049217002, + "grad_norm": 0.44813254475593567, + "learning_rate": 4.777672419182863e-06, + "loss": 0.6213, + "step": 812 + }, + { + "epoch": 0.9093959731543624, + "grad_norm": 0.4133831858634949, + "learning_rate": 4.777056923333244e-06, + "loss": 0.6138, + "step": 813 + }, + { + "epoch": 0.9105145413870246, + "grad_norm": 0.4255264699459076, + "learning_rate": 4.7764406164444e-06, + "loss": 0.6143, + "step": 814 + }, + { + "epoch": 0.9116331096196868, + "grad_norm": 0.42810630798339844, + "learning_rate": 4.775823498735845e-06, + "loss": 0.6253, + "step": 815 + }, + { + "epoch": 0.912751677852349, + "grad_norm": 0.42162856459617615, + "learning_rate": 4.775205570427386e-06, + "loss": 0.602, + "step": 816 + }, + { + "epoch": 0.9138702460850112, + "grad_norm": 0.4342280328273773, + "learning_rate": 4.7745868317391135e-06, + "loss": 0.6088, + "step": 817 + }, + { + "epoch": 0.9149888143176734, + "grad_norm": 0.42438629269599915, + "learning_rate": 4.773967282891411e-06, + "loss": 0.5788, + "step": 818 + }, + { + "epoch": 0.9161073825503355, + "grad_norm": 0.437950074672699, + "learning_rate": 4.7733469241049475e-06, + "loss": 0.6277, + "step": 819 + }, + { + "epoch": 0.9172259507829977, + "grad_norm": 0.4286377429962158, + "learning_rate": 4.772725755600682e-06, + "loss": 0.6024, + "step": 820 + }, + { + "epoch": 0.9183445190156599, + "grad_norm": 0.4317566156387329, + "learning_rate": 4.772103777599861e-06, + "loss": 0.6048, + "step": 821 + }, + { + "epoch": 0.9194630872483222, + "grad_norm": 0.4509202837944031, + "learning_rate": 4.771480990324021e-06, + "loss": 0.6219, + "step": 822 + }, + { + "epoch": 0.9205816554809844, + "grad_norm": 0.4387308955192566, + "learning_rate": 4.7708573939949845e-06, + "loss": 0.6082, + "step": 823 + }, + { + "epoch": 0.9217002237136466, + "grad_norm": 0.457883358001709, + "learning_rate": 4.770232988834864e-06, + "loss": 0.6112, + "step": 824 + }, + { + "epoch": 0.9228187919463087, + "grad_norm": 0.44200408458709717, + "learning_rate": 4.769607775066058e-06, + "loss": 0.6146, + "step": 825 + }, + { + "epoch": 0.9239373601789709, + "grad_norm": 0.44704675674438477, + "learning_rate": 4.768981752911256e-06, + "loss": 0.5921, + "step": 826 + }, + { + "epoch": 0.9250559284116331, + "grad_norm": 0.4367467164993286, + "learning_rate": 4.768354922593433e-06, + "loss": 0.6075, + "step": 827 + }, + { + "epoch": 0.9261744966442953, + "grad_norm": 0.4321734309196472, + "learning_rate": 4.767727284335852e-06, + "loss": 0.6041, + "step": 828 + }, + { + "epoch": 0.9272930648769575, + "grad_norm": 0.42991605401039124, + "learning_rate": 4.767098838362065e-06, + "loss": 0.5804, + "step": 829 + }, + { + "epoch": 0.9284116331096197, + "grad_norm": 0.43791651725769043, + "learning_rate": 4.766469584895912e-06, + "loss": 0.6005, + "step": 830 + }, + { + "epoch": 0.9295302013422819, + "grad_norm": 0.41972237825393677, + "learning_rate": 4.765839524161518e-06, + "loss": 0.582, + "step": 831 + }, + { + "epoch": 0.930648769574944, + "grad_norm": 0.4424271881580353, + "learning_rate": 4.765208656383299e-06, + "loss": 0.5978, + "step": 832 + }, + { + "epoch": 0.9317673378076062, + "grad_norm": 0.45667219161987305, + "learning_rate": 4.7645769817859554e-06, + "loss": 0.6208, + "step": 833 + }, + { + "epoch": 0.9328859060402684, + "grad_norm": 0.4423377811908722, + "learning_rate": 4.763944500594476e-06, + "loss": 0.6061, + "step": 834 + }, + { + "epoch": 0.9340044742729307, + "grad_norm": 0.4316536784172058, + "learning_rate": 4.7633112130341385e-06, + "loss": 0.6116, + "step": 835 + }, + { + "epoch": 0.9351230425055929, + "grad_norm": 0.4591672718524933, + "learning_rate": 4.762677119330505e-06, + "loss": 0.5729, + "step": 836 + }, + { + "epoch": 0.9362416107382551, + "grad_norm": 0.4469880759716034, + "learning_rate": 4.762042219709427e-06, + "loss": 0.6025, + "step": 837 + }, + { + "epoch": 0.9373601789709173, + "grad_norm": 0.4560692012310028, + "learning_rate": 4.761406514397042e-06, + "loss": 0.6103, + "step": 838 + }, + { + "epoch": 0.9384787472035794, + "grad_norm": 0.4428820013999939, + "learning_rate": 4.760770003619775e-06, + "loss": 0.6258, + "step": 839 + }, + { + "epoch": 0.9395973154362416, + "grad_norm": 0.44238874316215515, + "learning_rate": 4.760132687604338e-06, + "loss": 0.6032, + "step": 840 + }, + { + "epoch": 0.9407158836689038, + "grad_norm": 0.46432724595069885, + "learning_rate": 4.759494566577727e-06, + "loss": 0.6266, + "step": 841 + }, + { + "epoch": 0.941834451901566, + "grad_norm": 0.42941901087760925, + "learning_rate": 4.75885564076723e-06, + "loss": 0.5927, + "step": 842 + }, + { + "epoch": 0.9429530201342282, + "grad_norm": 0.43781232833862305, + "learning_rate": 4.758215910400418e-06, + "loss": 0.5967, + "step": 843 + }, + { + "epoch": 0.9440715883668904, + "grad_norm": 0.45641666650772095, + "learning_rate": 4.757575375705149e-06, + "loss": 0.6423, + "step": 844 + }, + { + "epoch": 0.9451901565995525, + "grad_norm": 0.43784114718437195, + "learning_rate": 4.756934036909567e-06, + "loss": 0.606, + "step": 845 + }, + { + "epoch": 0.9463087248322147, + "grad_norm": 0.4379528760910034, + "learning_rate": 4.756291894242106e-06, + "loss": 0.6201, + "step": 846 + }, + { + "epoch": 0.9474272930648769, + "grad_norm": 0.42831459641456604, + "learning_rate": 4.755648947931479e-06, + "loss": 0.6121, + "step": 847 + }, + { + "epoch": 0.9485458612975392, + "grad_norm": 0.43790462613105774, + "learning_rate": 4.7550051982066945e-06, + "loss": 0.5785, + "step": 848 + }, + { + "epoch": 0.9496644295302014, + "grad_norm": 0.4407269358634949, + "learning_rate": 4.75436064529704e-06, + "loss": 0.6127, + "step": 849 + }, + { + "epoch": 0.9507829977628636, + "grad_norm": 0.4252265393733978, + "learning_rate": 4.753715289432092e-06, + "loss": 0.6129, + "step": 850 + }, + { + "epoch": 0.9519015659955258, + "grad_norm": 0.4376990795135498, + "learning_rate": 4.753069130841712e-06, + "loss": 0.614, + "step": 851 + }, + { + "epoch": 0.9530201342281879, + "grad_norm": 0.43123552203178406, + "learning_rate": 4.752422169756048e-06, + "loss": 0.6169, + "step": 852 + }, + { + "epoch": 0.9541387024608501, + "grad_norm": 0.4513196349143982, + "learning_rate": 4.7517744064055345e-06, + "loss": 0.6381, + "step": 853 + }, + { + "epoch": 0.9552572706935123, + "grad_norm": 0.44663751125335693, + "learning_rate": 4.751125841020891e-06, + "loss": 0.605, + "step": 854 + }, + { + "epoch": 0.9563758389261745, + "grad_norm": 0.44196903705596924, + "learning_rate": 4.750476473833123e-06, + "loss": 0.6163, + "step": 855 + }, + { + "epoch": 0.9574944071588367, + "grad_norm": 0.40786847472190857, + "learning_rate": 4.74982630507352e-06, + "loss": 0.5624, + "step": 856 + }, + { + "epoch": 0.9586129753914989, + "grad_norm": 0.43438002467155457, + "learning_rate": 4.749175334973659e-06, + "loss": 0.6183, + "step": 857 + }, + { + "epoch": 0.959731543624161, + "grad_norm": 0.43120619654655457, + "learning_rate": 4.748523563765401e-06, + "loss": 0.6097, + "step": 858 + }, + { + "epoch": 0.9608501118568232, + "grad_norm": 0.4761989414691925, + "learning_rate": 4.747870991680895e-06, + "loss": 0.6029, + "step": 859 + }, + { + "epoch": 0.9619686800894854, + "grad_norm": 0.44484785199165344, + "learning_rate": 4.747217618952571e-06, + "loss": 0.5955, + "step": 860 + }, + { + "epoch": 0.9630872483221476, + "grad_norm": 0.4473284184932709, + "learning_rate": 4.746563445813148e-06, + "loss": 0.6367, + "step": 861 + }, + { + "epoch": 0.9642058165548099, + "grad_norm": 0.4486042857170105, + "learning_rate": 4.745908472495628e-06, + "loss": 0.5917, + "step": 862 + }, + { + "epoch": 0.9653243847874721, + "grad_norm": 0.45661088824272156, + "learning_rate": 4.745252699233298e-06, + "loss": 0.61, + "step": 863 + }, + { + "epoch": 0.9664429530201343, + "grad_norm": 0.4235589802265167, + "learning_rate": 4.744596126259731e-06, + "loss": 0.5887, + "step": 864 + }, + { + "epoch": 0.9675615212527964, + "grad_norm": 0.45463138818740845, + "learning_rate": 4.743938753808785e-06, + "loss": 0.6295, + "step": 865 + }, + { + "epoch": 0.9686800894854586, + "grad_norm": 0.4576405882835388, + "learning_rate": 4.743280582114601e-06, + "loss": 0.6301, + "step": 866 + }, + { + "epoch": 0.9697986577181208, + "grad_norm": 0.4380008280277252, + "learning_rate": 4.742621611411606e-06, + "loss": 0.619, + "step": 867 + }, + { + "epoch": 0.970917225950783, + "grad_norm": 0.4485333263874054, + "learning_rate": 4.7419618419345124e-06, + "loss": 0.6311, + "step": 868 + }, + { + "epoch": 0.9720357941834452, + "grad_norm": 0.44375064969062805, + "learning_rate": 4.741301273918314e-06, + "loss": 0.6095, + "step": 869 + }, + { + "epoch": 0.9731543624161074, + "grad_norm": 0.4440104365348816, + "learning_rate": 4.740639907598293e-06, + "loss": 0.6173, + "step": 870 + }, + { + "epoch": 0.9742729306487695, + "grad_norm": 0.4545641243457794, + "learning_rate": 4.739977743210014e-06, + "loss": 0.6046, + "step": 871 + }, + { + "epoch": 0.9753914988814317, + "grad_norm": 0.41214534640312195, + "learning_rate": 4.739314780989324e-06, + "loss": 0.6072, + "step": 872 + }, + { + "epoch": 0.9765100671140939, + "grad_norm": 0.4223095178604126, + "learning_rate": 4.738651021172357e-06, + "loss": 0.5878, + "step": 873 + }, + { + "epoch": 0.9776286353467561, + "grad_norm": 0.42885127663612366, + "learning_rate": 4.7379864639955304e-06, + "loss": 0.577, + "step": 874 + }, + { + "epoch": 0.9787472035794184, + "grad_norm": 0.42921069264411926, + "learning_rate": 4.737321109695546e-06, + "loss": 0.5844, + "step": 875 + }, + { + "epoch": 0.9798657718120806, + "grad_norm": 0.43462637066841125, + "learning_rate": 4.736654958509387e-06, + "loss": 0.6135, + "step": 876 + }, + { + "epoch": 0.9809843400447428, + "grad_norm": 0.43558555841445923, + "learning_rate": 4.735988010674324e-06, + "loss": 0.6255, + "step": 877 + }, + { + "epoch": 0.9821029082774049, + "grad_norm": 0.44332823157310486, + "learning_rate": 4.735320266427909e-06, + "loss": 0.6266, + "step": 878 + }, + { + "epoch": 0.9832214765100671, + "grad_norm": 0.4158824682235718, + "learning_rate": 4.734651726007978e-06, + "loss": 0.585, + "step": 879 + }, + { + "epoch": 0.9843400447427293, + "grad_norm": 0.4264974296092987, + "learning_rate": 4.733982389652652e-06, + "loss": 0.5871, + "step": 880 + }, + { + "epoch": 0.9854586129753915, + "grad_norm": 0.44846397638320923, + "learning_rate": 4.733312257600332e-06, + "loss": 0.6441, + "step": 881 + }, + { + "epoch": 0.9865771812080537, + "grad_norm": 0.46592265367507935, + "learning_rate": 4.732641330089707e-06, + "loss": 0.6326, + "step": 882 + }, + { + "epoch": 0.9876957494407159, + "grad_norm": 0.423447847366333, + "learning_rate": 4.731969607359747e-06, + "loss": 0.5922, + "step": 883 + }, + { + "epoch": 0.9888143176733781, + "grad_norm": 0.43406492471694946, + "learning_rate": 4.731297089649704e-06, + "loss": 0.6234, + "step": 884 + }, + { + "epoch": 0.9899328859060402, + "grad_norm": 0.443352073431015, + "learning_rate": 4.730623777199115e-06, + "loss": 0.6397, + "step": 885 + }, + { + "epoch": 0.9910514541387024, + "grad_norm": 0.4311400353908539, + "learning_rate": 4.7299496702478e-06, + "loss": 0.6073, + "step": 886 + }, + { + "epoch": 0.9921700223713646, + "grad_norm": 0.4217356741428375, + "learning_rate": 4.729274769035861e-06, + "loss": 0.6177, + "step": 887 + }, + { + "epoch": 0.9932885906040269, + "grad_norm": 0.45120054483413696, + "learning_rate": 4.728599073803685e-06, + "loss": 0.6181, + "step": 888 + }, + { + "epoch": 0.9944071588366891, + "grad_norm": 0.4567187428474426, + "learning_rate": 4.7279225847919375e-06, + "loss": 0.5839, + "step": 889 + }, + { + "epoch": 0.9955257270693513, + "grad_norm": 0.4429774880409241, + "learning_rate": 4.727245302241572e-06, + "loss": 0.6033, + "step": 890 + }, + { + "epoch": 0.9966442953020134, + "grad_norm": 0.44755569100379944, + "learning_rate": 4.726567226393821e-06, + "loss": 0.5877, + "step": 891 + }, + { + "epoch": 0.9977628635346756, + "grad_norm": 0.45908474922180176, + "learning_rate": 4.725888357490201e-06, + "loss": 0.6017, + "step": 892 + }, + { + "epoch": 0.9988814317673378, + "grad_norm": 0.4454707205295563, + "learning_rate": 4.725208695772511e-06, + "loss": 0.6007, + "step": 893 + }, + { + "epoch": 1.0, + "grad_norm": 0.43688732385635376, + "learning_rate": 4.7245282414828305e-06, + "loss": 0.6202, + "step": 894 + }, + { + "epoch": 1.0011185682326622, + "grad_norm": 0.43844303488731384, + "learning_rate": 4.723846994863524e-06, + "loss": 0.5886, + "step": 895 + }, + { + "epoch": 1.0022371364653244, + "grad_norm": 0.4296126067638397, + "learning_rate": 4.7231649561572376e-06, + "loss": 0.5759, + "step": 896 + }, + { + "epoch": 1.0033557046979866, + "grad_norm": 0.4716373682022095, + "learning_rate": 4.722482125606898e-06, + "loss": 0.6078, + "step": 897 + }, + { + "epoch": 1.0044742729306488, + "grad_norm": 0.45032379031181335, + "learning_rate": 4.721798503455716e-06, + "loss": 0.6145, + "step": 898 + }, + { + "epoch": 1.005592841163311, + "grad_norm": 0.42808797955513, + "learning_rate": 4.721114089947181e-06, + "loss": 0.5446, + "step": 899 + }, + { + "epoch": 1.0067114093959733, + "grad_norm": 0.4501192271709442, + "learning_rate": 4.7204288853250694e-06, + "loss": 0.5951, + "step": 900 + }, + { + "epoch": 1.0078299776286352, + "grad_norm": 0.43108487129211426, + "learning_rate": 4.719742889833434e-06, + "loss": 0.5826, + "step": 901 + }, + { + "epoch": 1.0089485458612975, + "grad_norm": 0.44014376401901245, + "learning_rate": 4.7190561037166135e-06, + "loss": 0.6138, + "step": 902 + }, + { + "epoch": 1.0100671140939597, + "grad_norm": 0.4504956305027008, + "learning_rate": 4.718368527219226e-06, + "loss": 0.6125, + "step": 903 + }, + { + "epoch": 1.0111856823266219, + "grad_norm": 0.4215417504310608, + "learning_rate": 4.717680160586172e-06, + "loss": 0.5761, + "step": 904 + }, + { + "epoch": 1.012304250559284, + "grad_norm": 0.44780582189559937, + "learning_rate": 4.716991004062632e-06, + "loss": 0.6056, + "step": 905 + }, + { + "epoch": 1.0134228187919463, + "grad_norm": 0.462046355009079, + "learning_rate": 4.7163010578940695e-06, + "loss": 0.6188, + "step": 906 + }, + { + "epoch": 1.0145413870246085, + "grad_norm": 0.42324090003967285, + "learning_rate": 4.715610322326229e-06, + "loss": 0.6086, + "step": 907 + }, + { + "epoch": 1.0156599552572707, + "grad_norm": 0.4330340623855591, + "learning_rate": 4.714918797605135e-06, + "loss": 0.5894, + "step": 908 + }, + { + "epoch": 1.016778523489933, + "grad_norm": 0.4506567418575287, + "learning_rate": 4.714226483977095e-06, + "loss": 0.6048, + "step": 909 + }, + { + "epoch": 1.0178970917225951, + "grad_norm": 0.4513271152973175, + "learning_rate": 4.713533381688695e-06, + "loss": 0.6139, + "step": 910 + }, + { + "epoch": 1.0190156599552573, + "grad_norm": 0.46496498584747314, + "learning_rate": 4.712839490986804e-06, + "loss": 0.607, + "step": 911 + }, + { + "epoch": 1.0201342281879195, + "grad_norm": 0.4421592354774475, + "learning_rate": 4.7121448121185716e-06, + "loss": 0.6044, + "step": 912 + }, + { + "epoch": 1.0212527964205818, + "grad_norm": 0.4436202943325043, + "learning_rate": 4.711449345331427e-06, + "loss": 0.6216, + "step": 913 + }, + { + "epoch": 1.0223713646532437, + "grad_norm": 0.43512076139450073, + "learning_rate": 4.7107530908730815e-06, + "loss": 0.5738, + "step": 914 + }, + { + "epoch": 1.023489932885906, + "grad_norm": 0.4467792510986328, + "learning_rate": 4.710056048991525e-06, + "loss": 0.6205, + "step": 915 + }, + { + "epoch": 1.0246085011185682, + "grad_norm": 0.4396659731864929, + "learning_rate": 4.709358219935028e-06, + "loss": 0.5902, + "step": 916 + }, + { + "epoch": 1.0257270693512304, + "grad_norm": 0.4392754137516022, + "learning_rate": 4.708659603952146e-06, + "loss": 0.6032, + "step": 917 + }, + { + "epoch": 1.0268456375838926, + "grad_norm": 0.4519026577472687, + "learning_rate": 4.707960201291708e-06, + "loss": 0.6024, + "step": 918 + }, + { + "epoch": 1.0279642058165548, + "grad_norm": 0.45630815625190735, + "learning_rate": 4.707260012202826e-06, + "loss": 0.5928, + "step": 919 + }, + { + "epoch": 1.029082774049217, + "grad_norm": 0.43156692385673523, + "learning_rate": 4.706559036934896e-06, + "loss": 0.5819, + "step": 920 + }, + { + "epoch": 1.0302013422818792, + "grad_norm": 0.4461636543273926, + "learning_rate": 4.705857275737587e-06, + "loss": 0.5727, + "step": 921 + }, + { + "epoch": 1.0313199105145414, + "grad_norm": 0.43878501653671265, + "learning_rate": 4.705154728860853e-06, + "loss": 0.5784, + "step": 922 + }, + { + "epoch": 1.0324384787472036, + "grad_norm": 0.42566847801208496, + "learning_rate": 4.704451396554925e-06, + "loss": 0.5607, + "step": 923 + }, + { + "epoch": 1.0335570469798658, + "grad_norm": 0.43960490822792053, + "learning_rate": 4.703747279070318e-06, + "loss": 0.6085, + "step": 924 + }, + { + "epoch": 1.034675615212528, + "grad_norm": 0.4411300718784332, + "learning_rate": 4.7030423766578194e-06, + "loss": 0.5821, + "step": 925 + }, + { + "epoch": 1.0357941834451903, + "grad_norm": 0.43742841482162476, + "learning_rate": 4.702336689568503e-06, + "loss": 0.5791, + "step": 926 + }, + { + "epoch": 1.0369127516778525, + "grad_norm": 0.4498448669910431, + "learning_rate": 4.70163021805372e-06, + "loss": 0.5803, + "step": 927 + }, + { + "epoch": 1.0380313199105144, + "grad_norm": 0.44952791929244995, + "learning_rate": 4.7009229623650986e-06, + "loss": 0.5954, + "step": 928 + }, + { + "epoch": 1.0391498881431767, + "grad_norm": 0.44882333278656006, + "learning_rate": 4.7002149227545505e-06, + "loss": 0.5866, + "step": 929 + }, + { + "epoch": 1.0402684563758389, + "grad_norm": 0.43540847301483154, + "learning_rate": 4.699506099474261e-06, + "loss": 0.5742, + "step": 930 + }, + { + "epoch": 1.041387024608501, + "grad_norm": 0.44182834029197693, + "learning_rate": 4.6987964927767015e-06, + "loss": 0.5712, + "step": 931 + }, + { + "epoch": 1.0425055928411633, + "grad_norm": 0.43853750824928284, + "learning_rate": 4.6980861029146174e-06, + "loss": 0.5847, + "step": 932 + }, + { + "epoch": 1.0436241610738255, + "grad_norm": 0.4608674943447113, + "learning_rate": 4.697374930141034e-06, + "loss": 0.6076, + "step": 933 + }, + { + "epoch": 1.0447427293064877, + "grad_norm": 0.45616960525512695, + "learning_rate": 4.696662974709256e-06, + "loss": 0.6088, + "step": 934 + }, + { + "epoch": 1.04586129753915, + "grad_norm": 0.44517964124679565, + "learning_rate": 4.695950236872867e-06, + "loss": 0.5795, + "step": 935 + }, + { + "epoch": 1.0469798657718121, + "grad_norm": 0.44067806005477905, + "learning_rate": 4.69523671688573e-06, + "loss": 0.6035, + "step": 936 + }, + { + "epoch": 1.0480984340044743, + "grad_norm": 0.44141000509262085, + "learning_rate": 4.694522415001984e-06, + "loss": 0.5676, + "step": 937 + }, + { + "epoch": 1.0492170022371365, + "grad_norm": 0.47550198435783386, + "learning_rate": 4.693807331476049e-06, + "loss": 0.5774, + "step": 938 + }, + { + "epoch": 1.0503355704697988, + "grad_norm": 0.4472981095314026, + "learning_rate": 4.6930914665626215e-06, + "loss": 0.5884, + "step": 939 + }, + { + "epoch": 1.0514541387024607, + "grad_norm": 0.4382828176021576, + "learning_rate": 4.692374820516679e-06, + "loss": 0.5941, + "step": 940 + }, + { + "epoch": 1.052572706935123, + "grad_norm": 0.4618227481842041, + "learning_rate": 4.691657393593475e-06, + "loss": 0.6139, + "step": 941 + }, + { + "epoch": 1.0536912751677852, + "grad_norm": 0.44602257013320923, + "learning_rate": 4.690939186048541e-06, + "loss": 0.5824, + "step": 942 + }, + { + "epoch": 1.0548098434004474, + "grad_norm": 0.4528485834598541, + "learning_rate": 4.690220198137688e-06, + "loss": 0.6059, + "step": 943 + }, + { + "epoch": 1.0559284116331096, + "grad_norm": 0.4452129304409027, + "learning_rate": 4.689500430117005e-06, + "loss": 0.5792, + "step": 944 + }, + { + "epoch": 1.0570469798657718, + "grad_norm": 0.4420737028121948, + "learning_rate": 4.688779882242855e-06, + "loss": 0.5834, + "step": 945 + }, + { + "epoch": 1.058165548098434, + "grad_norm": 0.46812739968299866, + "learning_rate": 4.6880585547718845e-06, + "loss": 0.6169, + "step": 946 + }, + { + "epoch": 1.0592841163310962, + "grad_norm": 0.4635341465473175, + "learning_rate": 4.687336447961015e-06, + "loss": 0.5954, + "step": 947 + }, + { + "epoch": 1.0604026845637584, + "grad_norm": 0.45534729957580566, + "learning_rate": 4.686613562067444e-06, + "loss": 0.5906, + "step": 948 + }, + { + "epoch": 1.0615212527964206, + "grad_norm": 0.4358477294445038, + "learning_rate": 4.685889897348649e-06, + "loss": 0.5701, + "step": 949 + }, + { + "epoch": 1.0626398210290828, + "grad_norm": 0.4336836338043213, + "learning_rate": 4.685165454062385e-06, + "loss": 0.5552, + "step": 950 + }, + { + "epoch": 1.063758389261745, + "grad_norm": 0.4392858147621155, + "learning_rate": 4.684440232466682e-06, + "loss": 0.572, + "step": 951 + }, + { + "epoch": 1.0648769574944073, + "grad_norm": 0.4643237292766571, + "learning_rate": 4.683714232819848e-06, + "loss": 0.6037, + "step": 952 + }, + { + "epoch": 1.0659955257270695, + "grad_norm": 0.45478659868240356, + "learning_rate": 4.682987455380469e-06, + "loss": 0.6188, + "step": 953 + }, + { + "epoch": 1.0671140939597314, + "grad_norm": 0.4337981939315796, + "learning_rate": 4.682259900407409e-06, + "loss": 0.5715, + "step": 954 + }, + { + "epoch": 1.0682326621923937, + "grad_norm": 0.4659992456436157, + "learning_rate": 4.6815315681598065e-06, + "loss": 0.5939, + "step": 955 + }, + { + "epoch": 1.0693512304250559, + "grad_norm": 0.4571925103664398, + "learning_rate": 4.680802458897078e-06, + "loss": 0.605, + "step": 956 + }, + { + "epoch": 1.070469798657718, + "grad_norm": 0.4513433575630188, + "learning_rate": 4.6800725728789164e-06, + "loss": 0.5856, + "step": 957 + }, + { + "epoch": 1.0715883668903803, + "grad_norm": 0.45815905928611755, + "learning_rate": 4.6793419103652914e-06, + "loss": 0.6006, + "step": 958 + }, + { + "epoch": 1.0727069351230425, + "grad_norm": 0.4600979685783386, + "learning_rate": 4.678610471616451e-06, + "loss": 0.5863, + "step": 959 + }, + { + "epoch": 1.0738255033557047, + "grad_norm": 0.4419516623020172, + "learning_rate": 4.677878256892917e-06, + "loss": 0.5796, + "step": 960 + }, + { + "epoch": 1.074944071588367, + "grad_norm": 0.458377867937088, + "learning_rate": 4.677145266455489e-06, + "loss": 0.5719, + "step": 961 + }, + { + "epoch": 1.0760626398210291, + "grad_norm": 0.445962131023407, + "learning_rate": 4.676411500565241e-06, + "loss": 0.585, + "step": 962 + }, + { + "epoch": 1.0771812080536913, + "grad_norm": 0.4639750123023987, + "learning_rate": 4.675676959483528e-06, + "loss": 0.6117, + "step": 963 + }, + { + "epoch": 1.0782997762863535, + "grad_norm": 0.4576922059059143, + "learning_rate": 4.6749416434719755e-06, + "loss": 0.592, + "step": 964 + }, + { + "epoch": 1.0794183445190157, + "grad_norm": 0.45511624217033386, + "learning_rate": 4.674205552792487e-06, + "loss": 0.5888, + "step": 965 + }, + { + "epoch": 1.0805369127516777, + "grad_norm": 0.4598134756088257, + "learning_rate": 4.673468687707244e-06, + "loss": 0.5705, + "step": 966 + }, + { + "epoch": 1.08165548098434, + "grad_norm": 0.4522278606891632, + "learning_rate": 4.672731048478702e-06, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 1.0827740492170022, + "grad_norm": 0.4419742822647095, + "learning_rate": 4.671992635369592e-06, + "loss": 0.5832, + "step": 968 + }, + { + "epoch": 1.0838926174496644, + "grad_norm": 0.4407685399055481, + "learning_rate": 4.67125344864292e-06, + "loss": 0.5978, + "step": 969 + }, + { + "epoch": 1.0850111856823266, + "grad_norm": 0.4517025053501129, + "learning_rate": 4.67051348856197e-06, + "loss": 0.6115, + "step": 970 + }, + { + "epoch": 1.0861297539149888, + "grad_norm": 0.4652605354785919, + "learning_rate": 4.6697727553903e-06, + "loss": 0.5838, + "step": 971 + }, + { + "epoch": 1.087248322147651, + "grad_norm": 0.45211100578308105, + "learning_rate": 4.6690312493917424e-06, + "loss": 0.5905, + "step": 972 + }, + { + "epoch": 1.0883668903803132, + "grad_norm": 0.45999446511268616, + "learning_rate": 4.668288970830407e-06, + "loss": 0.589, + "step": 973 + }, + { + "epoch": 1.0894854586129754, + "grad_norm": 0.4458955228328705, + "learning_rate": 4.667545919970676e-06, + "loss": 0.5927, + "step": 974 + }, + { + "epoch": 1.0906040268456376, + "grad_norm": 0.46779292821884155, + "learning_rate": 4.666802097077211e-06, + "loss": 0.5955, + "step": 975 + }, + { + "epoch": 1.0917225950782998, + "grad_norm": 0.4617951810359955, + "learning_rate": 4.666057502414942e-06, + "loss": 0.6117, + "step": 976 + }, + { + "epoch": 1.092841163310962, + "grad_norm": 0.45327237248420715, + "learning_rate": 4.665312136249082e-06, + "loss": 0.5863, + "step": 977 + }, + { + "epoch": 1.0939597315436242, + "grad_norm": 0.4517632722854614, + "learning_rate": 4.664565998845112e-06, + "loss": 0.5832, + "step": 978 + }, + { + "epoch": 1.0950782997762865, + "grad_norm": 0.43310609459877014, + "learning_rate": 4.663819090468791e-06, + "loss": 0.5647, + "step": 979 + }, + { + "epoch": 1.0961968680089484, + "grad_norm": 0.47837111353874207, + "learning_rate": 4.6630714113861505e-06, + "loss": 0.6195, + "step": 980 + }, + { + "epoch": 1.0973154362416107, + "grad_norm": 0.4558872878551483, + "learning_rate": 4.662322961863501e-06, + "loss": 0.5933, + "step": 981 + }, + { + "epoch": 1.0984340044742729, + "grad_norm": 0.45069193840026855, + "learning_rate": 4.661573742167421e-06, + "loss": 0.6066, + "step": 982 + }, + { + "epoch": 1.099552572706935, + "grad_norm": 0.44924792647361755, + "learning_rate": 4.660823752564769e-06, + "loss": 0.5773, + "step": 983 + }, + { + "epoch": 1.1006711409395973, + "grad_norm": 0.44701021909713745, + "learning_rate": 4.660072993322674e-06, + "loss": 0.5663, + "step": 984 + }, + { + "epoch": 1.1017897091722595, + "grad_norm": 0.4506109952926636, + "learning_rate": 4.659321464708541e-06, + "loss": 0.593, + "step": 985 + }, + { + "epoch": 1.1029082774049217, + "grad_norm": 0.4586380422115326, + "learning_rate": 4.658569166990048e-06, + "loss": 0.6014, + "step": 986 + }, + { + "epoch": 1.104026845637584, + "grad_norm": 0.45552054047584534, + "learning_rate": 4.657816100435147e-06, + "loss": 0.6043, + "step": 987 + }, + { + "epoch": 1.1051454138702461, + "grad_norm": 0.4566228985786438, + "learning_rate": 4.657062265312065e-06, + "loss": 0.6088, + "step": 988 + }, + { + "epoch": 1.1062639821029083, + "grad_norm": 0.45366841554641724, + "learning_rate": 4.6563076618893045e-06, + "loss": 0.5918, + "step": 989 + }, + { + "epoch": 1.1073825503355705, + "grad_norm": 0.46025100350379944, + "learning_rate": 4.6555522904356344e-06, + "loss": 0.5876, + "step": 990 + }, + { + "epoch": 1.1085011185682327, + "grad_norm": 0.46731919050216675, + "learning_rate": 4.654796151220106e-06, + "loss": 0.5992, + "step": 991 + }, + { + "epoch": 1.109619686800895, + "grad_norm": 0.45841652154922485, + "learning_rate": 4.654039244512036e-06, + "loss": 0.5833, + "step": 992 + }, + { + "epoch": 1.110738255033557, + "grad_norm": 0.46612823009490967, + "learning_rate": 4.653281570581023e-06, + "loss": 0.6137, + "step": 993 + }, + { + "epoch": 1.1118568232662192, + "grad_norm": 0.4462451934814453, + "learning_rate": 4.6525231296969305e-06, + "loss": 0.545, + "step": 994 + }, + { + "epoch": 1.1129753914988814, + "grad_norm": 0.4724692404270172, + "learning_rate": 4.651763922129901e-06, + "loss": 0.6144, + "step": 995 + }, + { + "epoch": 1.1140939597315436, + "grad_norm": 0.47240084409713745, + "learning_rate": 4.651003948150349e-06, + "loss": 0.6037, + "step": 996 + }, + { + "epoch": 1.1152125279642058, + "grad_norm": 0.44295522570610046, + "learning_rate": 4.650243208028958e-06, + "loss": 0.5571, + "step": 997 + }, + { + "epoch": 1.116331096196868, + "grad_norm": 0.4727005958557129, + "learning_rate": 4.649481702036691e-06, + "loss": 0.5851, + "step": 998 + }, + { + "epoch": 1.1174496644295302, + "grad_norm": 0.4520561397075653, + "learning_rate": 4.648719430444777e-06, + "loss": 0.6016, + "step": 999 + }, + { + "epoch": 1.1185682326621924, + "grad_norm": 0.5022072792053223, + "learning_rate": 4.647956393524723e-06, + "loss": 0.5993, + "step": 1000 + }, + { + "epoch": 1.1196868008948546, + "grad_norm": 0.45666182041168213, + "learning_rate": 4.647192591548305e-06, + "loss": 0.6092, + "step": 1001 + }, + { + "epoch": 1.1208053691275168, + "grad_norm": 0.4551488161087036, + "learning_rate": 4.646428024787575e-06, + "loss": 0.5938, + "step": 1002 + }, + { + "epoch": 1.121923937360179, + "grad_norm": 0.4763842821121216, + "learning_rate": 4.645662693514853e-06, + "loss": 0.6016, + "step": 1003 + }, + { + "epoch": 1.1230425055928412, + "grad_norm": 0.4607378840446472, + "learning_rate": 4.644896598002736e-06, + "loss": 0.5785, + "step": 1004 + }, + { + "epoch": 1.1241610738255035, + "grad_norm": 0.4715465009212494, + "learning_rate": 4.64412973852409e-06, + "loss": 0.588, + "step": 1005 + }, + { + "epoch": 1.1252796420581657, + "grad_norm": 0.47461339831352234, + "learning_rate": 4.643362115352053e-06, + "loss": 0.6296, + "step": 1006 + }, + { + "epoch": 1.1263982102908277, + "grad_norm": 0.4571302831172943, + "learning_rate": 4.642593728760038e-06, + "loss": 0.5493, + "step": 1007 + }, + { + "epoch": 1.1275167785234899, + "grad_norm": 0.46194958686828613, + "learning_rate": 4.641824579021726e-06, + "loss": 0.5884, + "step": 1008 + }, + { + "epoch": 1.128635346756152, + "grad_norm": 0.4678124487400055, + "learning_rate": 4.6410546664110736e-06, + "loss": 0.5967, + "step": 1009 + }, + { + "epoch": 1.1297539149888143, + "grad_norm": 0.45504289865493774, + "learning_rate": 4.640283991202306e-06, + "loss": 0.5696, + "step": 1010 + }, + { + "epoch": 1.1308724832214765, + "grad_norm": 0.46907755732536316, + "learning_rate": 4.639512553669921e-06, + "loss": 0.6261, + "step": 1011 + }, + { + "epoch": 1.1319910514541387, + "grad_norm": 0.46805256605148315, + "learning_rate": 4.63874035408869e-06, + "loss": 0.5696, + "step": 1012 + }, + { + "epoch": 1.133109619686801, + "grad_norm": 0.4621639549732208, + "learning_rate": 4.637967392733652e-06, + "loss": 0.563, + "step": 1013 + }, + { + "epoch": 1.1342281879194631, + "grad_norm": 0.43852928280830383, + "learning_rate": 4.6371936698801215e-06, + "loss": 0.5434, + "step": 1014 + }, + { + "epoch": 1.1353467561521253, + "grad_norm": 0.46553272008895874, + "learning_rate": 4.636419185803681e-06, + "loss": 0.5825, + "step": 1015 + }, + { + "epoch": 1.1364653243847875, + "grad_norm": 0.46866393089294434, + "learning_rate": 4.635643940780184e-06, + "loss": 0.5723, + "step": 1016 + }, + { + "epoch": 1.1375838926174497, + "grad_norm": 0.46932780742645264, + "learning_rate": 4.634867935085758e-06, + "loss": 0.5972, + "step": 1017 + }, + { + "epoch": 1.138702460850112, + "grad_norm": 0.45771515369415283, + "learning_rate": 4.634091168996801e-06, + "loss": 0.5871, + "step": 1018 + }, + { + "epoch": 1.139821029082774, + "grad_norm": 0.461588978767395, + "learning_rate": 4.633313642789976e-06, + "loss": 0.6246, + "step": 1019 + }, + { + "epoch": 1.1409395973154361, + "grad_norm": 0.47002744674682617, + "learning_rate": 4.632535356742226e-06, + "loss": 0.581, + "step": 1020 + }, + { + "epoch": 1.1420581655480984, + "grad_norm": 0.46632397174835205, + "learning_rate": 4.631756311130757e-06, + "loss": 0.6203, + "step": 1021 + }, + { + "epoch": 1.1431767337807606, + "grad_norm": 0.46423229575157166, + "learning_rate": 4.6309765062330504e-06, + "loss": 0.6074, + "step": 1022 + }, + { + "epoch": 1.1442953020134228, + "grad_norm": 0.47234588861465454, + "learning_rate": 4.630195942326855e-06, + "loss": 0.6076, + "step": 1023 + }, + { + "epoch": 1.145413870246085, + "grad_norm": 0.46395570039749146, + "learning_rate": 4.62941461969019e-06, + "loss": 0.5775, + "step": 1024 + }, + { + "epoch": 1.1465324384787472, + "grad_norm": 0.456720232963562, + "learning_rate": 4.628632538601349e-06, + "loss": 0.5924, + "step": 1025 + }, + { + "epoch": 1.1476510067114094, + "grad_norm": 0.4563044309616089, + "learning_rate": 4.62784969933889e-06, + "loss": 0.5883, + "step": 1026 + }, + { + "epoch": 1.1487695749440716, + "grad_norm": 0.46938541531562805, + "learning_rate": 4.627066102181645e-06, + "loss": 0.5602, + "step": 1027 + }, + { + "epoch": 1.1498881431767338, + "grad_norm": 0.48113343119621277, + "learning_rate": 4.626281747408713e-06, + "loss": 0.609, + "step": 1028 + }, + { + "epoch": 1.151006711409396, + "grad_norm": 0.46784207224845886, + "learning_rate": 4.6254966352994666e-06, + "loss": 0.6018, + "step": 1029 + }, + { + "epoch": 1.1521252796420582, + "grad_norm": 0.46613121032714844, + "learning_rate": 4.624710766133544e-06, + "loss": 0.5579, + "step": 1030 + }, + { + "epoch": 1.1532438478747205, + "grad_norm": 0.4715366065502167, + "learning_rate": 4.6239241401908575e-06, + "loss": 0.5767, + "step": 1031 + }, + { + "epoch": 1.1543624161073827, + "grad_norm": 0.48206454515457153, + "learning_rate": 4.623136757751584e-06, + "loss": 0.6118, + "step": 1032 + }, + { + "epoch": 1.1554809843400446, + "grad_norm": 0.46678969264030457, + "learning_rate": 4.622348619096174e-06, + "loss": 0.5876, + "step": 1033 + }, + { + "epoch": 1.1565995525727069, + "grad_norm": 0.4533562958240509, + "learning_rate": 4.621559724505346e-06, + "loss": 0.6038, + "step": 1034 + }, + { + "epoch": 1.157718120805369, + "grad_norm": 0.473532497882843, + "learning_rate": 4.620770074260084e-06, + "loss": 0.5802, + "step": 1035 + }, + { + "epoch": 1.1588366890380313, + "grad_norm": 0.4734630882740021, + "learning_rate": 4.6199796686416505e-06, + "loss": 0.5906, + "step": 1036 + }, + { + "epoch": 1.1599552572706935, + "grad_norm": 0.48316991329193115, + "learning_rate": 4.6191885079315665e-06, + "loss": 0.561, + "step": 1037 + }, + { + "epoch": 1.1610738255033557, + "grad_norm": 0.4646005630493164, + "learning_rate": 4.618396592411628e-06, + "loss": 0.5901, + "step": 1038 + }, + { + "epoch": 1.162192393736018, + "grad_norm": 0.4684715270996094, + "learning_rate": 4.617603922363899e-06, + "loss": 0.5915, + "step": 1039 + }, + { + "epoch": 1.1633109619686801, + "grad_norm": 0.46257463097572327, + "learning_rate": 4.6168104980707105e-06, + "loss": 0.577, + "step": 1040 + }, + { + "epoch": 1.1644295302013423, + "grad_norm": 0.4703442454338074, + "learning_rate": 4.616016319814664e-06, + "loss": 0.564, + "step": 1041 + }, + { + "epoch": 1.1655480984340045, + "grad_norm": 0.4818289577960968, + "learning_rate": 4.615221387878631e-06, + "loss": 0.5955, + "step": 1042 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.4658922553062439, + "learning_rate": 4.614425702545745e-06, + "loss": 0.5944, + "step": 1043 + }, + { + "epoch": 1.167785234899329, + "grad_norm": 0.4584915041923523, + "learning_rate": 4.613629264099416e-06, + "loss": 0.5625, + "step": 1044 + }, + { + "epoch": 1.168903803131991, + "grad_norm": 0.4565418064594269, + "learning_rate": 4.6128320728233164e-06, + "loss": 0.5942, + "step": 1045 + }, + { + "epoch": 1.1700223713646531, + "grad_norm": 0.48082879185676575, + "learning_rate": 4.61203412900139e-06, + "loss": 0.6366, + "step": 1046 + }, + { + "epoch": 1.1711409395973154, + "grad_norm": 0.47267577052116394, + "learning_rate": 4.611235432917845e-06, + "loss": 0.5804, + "step": 1047 + }, + { + "epoch": 1.1722595078299776, + "grad_norm": 0.48486819863319397, + "learning_rate": 4.610435984857163e-06, + "loss": 0.5755, + "step": 1048 + }, + { + "epoch": 1.1733780760626398, + "grad_norm": 0.48791539669036865, + "learning_rate": 4.609635785104089e-06, + "loss": 0.6168, + "step": 1049 + }, + { + "epoch": 1.174496644295302, + "grad_norm": 0.45250606536865234, + "learning_rate": 4.6088348339436365e-06, + "loss": 0.5833, + "step": 1050 + }, + { + "epoch": 1.1756152125279642, + "grad_norm": 0.4777594804763794, + "learning_rate": 4.608033131661089e-06, + "loss": 0.6048, + "step": 1051 + }, + { + "epoch": 1.1767337807606264, + "grad_norm": 0.47242894768714905, + "learning_rate": 4.607230678541993e-06, + "loss": 0.5945, + "step": 1052 + }, + { + "epoch": 1.1778523489932886, + "grad_norm": 0.47580021619796753, + "learning_rate": 4.606427474872169e-06, + "loss": 0.593, + "step": 1053 + }, + { + "epoch": 1.1789709172259508, + "grad_norm": 0.45570215582847595, + "learning_rate": 4.605623520937698e-06, + "loss": 0.5617, + "step": 1054 + }, + { + "epoch": 1.180089485458613, + "grad_norm": 0.4642491936683655, + "learning_rate": 4.604818817024932e-06, + "loss": 0.604, + "step": 1055 + }, + { + "epoch": 1.1812080536912752, + "grad_norm": 0.46808069944381714, + "learning_rate": 4.60401336342049e-06, + "loss": 0.5829, + "step": 1056 + }, + { + "epoch": 1.1823266219239374, + "grad_norm": 0.482730507850647, + "learning_rate": 4.603207160411257e-06, + "loss": 0.6246, + "step": 1057 + }, + { + "epoch": 1.1834451901565997, + "grad_norm": 0.4554171562194824, + "learning_rate": 4.6024002082843855e-06, + "loss": 0.5797, + "step": 1058 + }, + { + "epoch": 1.1845637583892619, + "grad_norm": 0.4731457233428955, + "learning_rate": 4.6015925073272945e-06, + "loss": 0.5883, + "step": 1059 + }, + { + "epoch": 1.1856823266219239, + "grad_norm": 0.45422542095184326, + "learning_rate": 4.600784057827672e-06, + "loss": 0.5914, + "step": 1060 + }, + { + "epoch": 1.186800894854586, + "grad_norm": 0.4871719181537628, + "learning_rate": 4.599974860073466e-06, + "loss": 0.6146, + "step": 1061 + }, + { + "epoch": 1.1879194630872483, + "grad_norm": 0.47315844893455505, + "learning_rate": 4.5991649143529e-06, + "loss": 0.6044, + "step": 1062 + }, + { + "epoch": 1.1890380313199105, + "grad_norm": 0.45965805649757385, + "learning_rate": 4.5983542209544565e-06, + "loss": 0.5628, + "step": 1063 + }, + { + "epoch": 1.1901565995525727, + "grad_norm": 0.4573105573654175, + "learning_rate": 4.5975427801668886e-06, + "loss": 0.581, + "step": 1064 + }, + { + "epoch": 1.191275167785235, + "grad_norm": 0.45358559489250183, + "learning_rate": 4.596730592279214e-06, + "loss": 0.5851, + "step": 1065 + }, + { + "epoch": 1.192393736017897, + "grad_norm": 0.45561692118644714, + "learning_rate": 4.595917657580716e-06, + "loss": 0.5636, + "step": 1066 + }, + { + "epoch": 1.1935123042505593, + "grad_norm": 0.45746201276779175, + "learning_rate": 4.5951039763609464e-06, + "loss": 0.5943, + "step": 1067 + }, + { + "epoch": 1.1946308724832215, + "grad_norm": 0.4627397656440735, + "learning_rate": 4.594289548909719e-06, + "loss": 0.598, + "step": 1068 + }, + { + "epoch": 1.1957494407158837, + "grad_norm": 0.4608605206012726, + "learning_rate": 4.593474375517118e-06, + "loss": 0.5861, + "step": 1069 + }, + { + "epoch": 1.196868008948546, + "grad_norm": 0.45715343952178955, + "learning_rate": 4.592658456473489e-06, + "loss": 0.6024, + "step": 1070 + }, + { + "epoch": 1.197986577181208, + "grad_norm": 0.4674588143825531, + "learning_rate": 4.591841792069445e-06, + "loss": 0.5779, + "step": 1071 + }, + { + "epoch": 1.1991051454138701, + "grad_norm": 0.4705957770347595, + "learning_rate": 4.591024382595864e-06, + "loss": 0.588, + "step": 1072 + }, + { + "epoch": 1.2002237136465324, + "grad_norm": 0.47805118560791016, + "learning_rate": 4.590206228343892e-06, + "loss": 0.5796, + "step": 1073 + }, + { + "epoch": 1.2013422818791946, + "grad_norm": 0.45588305592536926, + "learning_rate": 4.589387329604936e-06, + "loss": 0.5653, + "step": 1074 + }, + { + "epoch": 1.2024608501118568, + "grad_norm": 0.47811266779899597, + "learning_rate": 4.588567686670672e-06, + "loss": 0.6207, + "step": 1075 + }, + { + "epoch": 1.203579418344519, + "grad_norm": 0.46762150526046753, + "learning_rate": 4.587747299833039e-06, + "loss": 0.5697, + "step": 1076 + }, + { + "epoch": 1.2046979865771812, + "grad_norm": 0.46018633246421814, + "learning_rate": 4.586926169384239e-06, + "loss": 0.5772, + "step": 1077 + }, + { + "epoch": 1.2058165548098434, + "grad_norm": 0.46897298097610474, + "learning_rate": 4.586104295616744e-06, + "loss": 0.5733, + "step": 1078 + }, + { + "epoch": 1.2069351230425056, + "grad_norm": 0.466189980506897, + "learning_rate": 4.585281678823288e-06, + "loss": 0.6027, + "step": 1079 + }, + { + "epoch": 1.2080536912751678, + "grad_norm": 0.4684281647205353, + "learning_rate": 4.584458319296868e-06, + "loss": 0.5766, + "step": 1080 + }, + { + "epoch": 1.20917225950783, + "grad_norm": 0.4513910710811615, + "learning_rate": 4.583634217330747e-06, + "loss": 0.5891, + "step": 1081 + }, + { + "epoch": 1.2102908277404922, + "grad_norm": 0.476510226726532, + "learning_rate": 4.5828093732184545e-06, + "loss": 0.601, + "step": 1082 + }, + { + "epoch": 1.2114093959731544, + "grad_norm": 0.47996675968170166, + "learning_rate": 4.581983787253781e-06, + "loss": 0.598, + "step": 1083 + }, + { + "epoch": 1.2125279642058167, + "grad_norm": 0.4757002890110016, + "learning_rate": 4.581157459730783e-06, + "loss": 0.5561, + "step": 1084 + }, + { + "epoch": 1.2136465324384789, + "grad_norm": 0.45650506019592285, + "learning_rate": 4.5803303909437805e-06, + "loss": 0.5702, + "step": 1085 + }, + { + "epoch": 1.2147651006711409, + "grad_norm": 0.4679136872291565, + "learning_rate": 4.579502581187358e-06, + "loss": 0.5854, + "step": 1086 + }, + { + "epoch": 1.215883668903803, + "grad_norm": 0.46434950828552246, + "learning_rate": 4.578674030756364e-06, + "loss": 0.5623, + "step": 1087 + }, + { + "epoch": 1.2170022371364653, + "grad_norm": 0.4812433123588562, + "learning_rate": 4.57784473994591e-06, + "loss": 0.6006, + "step": 1088 + }, + { + "epoch": 1.2181208053691275, + "grad_norm": 0.4599030017852783, + "learning_rate": 4.577014709051372e-06, + "loss": 0.5614, + "step": 1089 + }, + { + "epoch": 1.2192393736017897, + "grad_norm": 0.48703038692474365, + "learning_rate": 4.576183938368389e-06, + "loss": 0.5987, + "step": 1090 + }, + { + "epoch": 1.220357941834452, + "grad_norm": 0.48404622077941895, + "learning_rate": 4.575352428192865e-06, + "loss": 0.592, + "step": 1091 + }, + { + "epoch": 1.221476510067114, + "grad_norm": 0.47787612676620483, + "learning_rate": 4.574520178820965e-06, + "loss": 0.5733, + "step": 1092 + }, + { + "epoch": 1.2225950782997763, + "grad_norm": 0.4672394394874573, + "learning_rate": 4.573687190549119e-06, + "loss": 0.5736, + "step": 1093 + }, + { + "epoch": 1.2237136465324385, + "grad_norm": 0.47233250737190247, + "learning_rate": 4.57285346367402e-06, + "loss": 0.6045, + "step": 1094 + }, + { + "epoch": 1.2248322147651007, + "grad_norm": 0.4758702516555786, + "learning_rate": 4.572018998492623e-06, + "loss": 0.5734, + "step": 1095 + }, + { + "epoch": 1.225950782997763, + "grad_norm": 0.4715832769870758, + "learning_rate": 4.571183795302147e-06, + "loss": 0.599, + "step": 1096 + }, + { + "epoch": 1.227069351230425, + "grad_norm": 0.4783090651035309, + "learning_rate": 4.570347854400074e-06, + "loss": 0.608, + "step": 1097 + }, + { + "epoch": 1.2281879194630871, + "grad_norm": 0.4721231162548065, + "learning_rate": 4.569511176084148e-06, + "loss": 0.5964, + "step": 1098 + }, + { + "epoch": 1.2293064876957494, + "grad_norm": 0.47731295228004456, + "learning_rate": 4.568673760652377e-06, + "loss": 0.5968, + "step": 1099 + }, + { + "epoch": 1.2304250559284116, + "grad_norm": 0.4623177945613861, + "learning_rate": 4.5678356084030286e-06, + "loss": 0.5669, + "step": 1100 + }, + { + "epoch": 1.2315436241610738, + "grad_norm": 0.46821466088294983, + "learning_rate": 4.566996719634636e-06, + "loss": 0.5895, + "step": 1101 + }, + { + "epoch": 1.232662192393736, + "grad_norm": 0.47642552852630615, + "learning_rate": 4.566157094645994e-06, + "loss": 0.5645, + "step": 1102 + }, + { + "epoch": 1.2337807606263982, + "grad_norm": 0.46859392523765564, + "learning_rate": 4.565316733736159e-06, + "loss": 0.6204, + "step": 1103 + }, + { + "epoch": 1.2348993288590604, + "grad_norm": 0.47033944725990295, + "learning_rate": 4.564475637204449e-06, + "loss": 0.5884, + "step": 1104 + }, + { + "epoch": 1.2360178970917226, + "grad_norm": 0.500777006149292, + "learning_rate": 4.563633805350443e-06, + "loss": 0.5922, + "step": 1105 + }, + { + "epoch": 1.2371364653243848, + "grad_norm": 0.49536949396133423, + "learning_rate": 4.562791238473988e-06, + "loss": 0.5953, + "step": 1106 + }, + { + "epoch": 1.238255033557047, + "grad_norm": 0.47423887252807617, + "learning_rate": 4.5619479368751855e-06, + "loss": 0.5772, + "step": 1107 + }, + { + "epoch": 1.2393736017897092, + "grad_norm": 0.5001300573348999, + "learning_rate": 4.561103900854401e-06, + "loss": 0.6128, + "step": 1108 + }, + { + "epoch": 1.2404921700223714, + "grad_norm": 0.4864799380302429, + "learning_rate": 4.560259130712264e-06, + "loss": 0.5821, + "step": 1109 + }, + { + "epoch": 1.2416107382550337, + "grad_norm": 0.5224222540855408, + "learning_rate": 4.559413626749662e-06, + "loss": 0.6192, + "step": 1110 + }, + { + "epoch": 1.2427293064876959, + "grad_norm": 0.4829416275024414, + "learning_rate": 4.558567389267748e-06, + "loss": 0.5825, + "step": 1111 + }, + { + "epoch": 1.2438478747203578, + "grad_norm": 0.46932196617126465, + "learning_rate": 4.557720418567931e-06, + "loss": 0.5887, + "step": 1112 + }, + { + "epoch": 1.24496644295302, + "grad_norm": 0.47189292311668396, + "learning_rate": 4.556872714951886e-06, + "loss": 0.5777, + "step": 1113 + }, + { + "epoch": 1.2460850111856823, + "grad_norm": 0.47725993394851685, + "learning_rate": 4.5560242787215444e-06, + "loss": 0.5801, + "step": 1114 + }, + { + "epoch": 1.2472035794183445, + "grad_norm": 0.4747818112373352, + "learning_rate": 4.555175110179104e-06, + "loss": 0.5833, + "step": 1115 + }, + { + "epoch": 1.2483221476510067, + "grad_norm": 0.48167556524276733, + "learning_rate": 4.554325209627019e-06, + "loss": 0.5917, + "step": 1116 + }, + { + "epoch": 1.249440715883669, + "grad_norm": 0.4757511615753174, + "learning_rate": 4.553474577368006e-06, + "loss": 0.5955, + "step": 1117 + }, + { + "epoch": 1.250559284116331, + "grad_norm": 0.4677585959434509, + "learning_rate": 4.552623213705043e-06, + "loss": 0.5864, + "step": 1118 + }, + { + "epoch": 1.2516778523489933, + "grad_norm": 0.46574166417121887, + "learning_rate": 4.551771118941367e-06, + "loss": 0.562, + "step": 1119 + }, + { + "epoch": 1.2527964205816555, + "grad_norm": 0.46505433320999146, + "learning_rate": 4.5509182933804754e-06, + "loss": 0.5829, + "step": 1120 + }, + { + "epoch": 1.2539149888143177, + "grad_norm": 0.47223830223083496, + "learning_rate": 4.550064737326127e-06, + "loss": 0.5833, + "step": 1121 + }, + { + "epoch": 1.25503355704698, + "grad_norm": 0.4903958737850189, + "learning_rate": 4.549210451082342e-06, + "loss": 0.5612, + "step": 1122 + }, + { + "epoch": 1.256152125279642, + "grad_norm": 0.4613335132598877, + "learning_rate": 4.548355434953397e-06, + "loss": 0.5826, + "step": 1123 + }, + { + "epoch": 1.2572706935123041, + "grad_norm": 0.47294437885284424, + "learning_rate": 4.5474996892438296e-06, + "loss": 0.5834, + "step": 1124 + }, + { + "epoch": 1.2583892617449663, + "grad_norm": 0.4580760598182678, + "learning_rate": 4.546643214258441e-06, + "loss": 0.6002, + "step": 1125 + }, + { + "epoch": 1.2595078299776286, + "grad_norm": 0.4971761703491211, + "learning_rate": 4.545786010302287e-06, + "loss": 0.6082, + "step": 1126 + }, + { + "epoch": 1.2606263982102908, + "grad_norm": 0.47834956645965576, + "learning_rate": 4.544928077680687e-06, + "loss": 0.5697, + "step": 1127 + }, + { + "epoch": 1.261744966442953, + "grad_norm": 0.47802045941352844, + "learning_rate": 4.5440694166992175e-06, + "loss": 0.5709, + "step": 1128 + }, + { + "epoch": 1.2628635346756152, + "grad_norm": 0.46790894865989685, + "learning_rate": 4.5432100276637156e-06, + "loss": 0.574, + "step": 1129 + }, + { + "epoch": 1.2639821029082774, + "grad_norm": 0.4805370271205902, + "learning_rate": 4.542349910880277e-06, + "loss": 0.5665, + "step": 1130 + }, + { + "epoch": 1.2651006711409396, + "grad_norm": 0.4909112751483917, + "learning_rate": 4.5414890666552575e-06, + "loss": 0.5853, + "step": 1131 + }, + { + "epoch": 1.2662192393736018, + "grad_norm": 0.4830676317214966, + "learning_rate": 4.540627495295271e-06, + "loss": 0.5977, + "step": 1132 + }, + { + "epoch": 1.267337807606264, + "grad_norm": 0.48109954595565796, + "learning_rate": 4.539765197107191e-06, + "loss": 0.6083, + "step": 1133 + }, + { + "epoch": 1.2684563758389262, + "grad_norm": 0.48772624135017395, + "learning_rate": 4.538902172398151e-06, + "loss": 0.5858, + "step": 1134 + }, + { + "epoch": 1.2695749440715884, + "grad_norm": 0.4889027774333954, + "learning_rate": 4.53803842147554e-06, + "loss": 0.6107, + "step": 1135 + }, + { + "epoch": 1.2706935123042506, + "grad_norm": 0.4655384123325348, + "learning_rate": 4.5371739446470085e-06, + "loss": 0.575, + "step": 1136 + }, + { + "epoch": 1.2718120805369129, + "grad_norm": 0.5289780497550964, + "learning_rate": 4.536308742220466e-06, + "loss": 0.5875, + "step": 1137 + }, + { + "epoch": 1.272930648769575, + "grad_norm": 0.44917261600494385, + "learning_rate": 4.535442814504077e-06, + "loss": 0.5555, + "step": 1138 + }, + { + "epoch": 1.274049217002237, + "grad_norm": 0.45869240164756775, + "learning_rate": 4.534576161806269e-06, + "loss": 0.5673, + "step": 1139 + }, + { + "epoch": 1.2751677852348993, + "grad_norm": 0.47333377599716187, + "learning_rate": 4.533708784435722e-06, + "loss": 0.5843, + "step": 1140 + }, + { + "epoch": 1.2762863534675615, + "grad_norm": 0.4835321307182312, + "learning_rate": 4.5328406827013806e-06, + "loss": 0.6049, + "step": 1141 + }, + { + "epoch": 1.2774049217002237, + "grad_norm": 0.4710775315761566, + "learning_rate": 4.531971856912443e-06, + "loss": 0.5723, + "step": 1142 + }, + { + "epoch": 1.278523489932886, + "grad_norm": 0.48879000544548035, + "learning_rate": 4.531102307378366e-06, + "loss": 0.5847, + "step": 1143 + }, + { + "epoch": 1.279642058165548, + "grad_norm": 0.476485013961792, + "learning_rate": 4.530232034408864e-06, + "loss": 0.583, + "step": 1144 + }, + { + "epoch": 1.2807606263982103, + "grad_norm": 0.48762884736061096, + "learning_rate": 4.529361038313912e-06, + "loss": 0.6017, + "step": 1145 + }, + { + "epoch": 1.2818791946308725, + "grad_norm": 0.4625467360019684, + "learning_rate": 4.528489319403737e-06, + "loss": 0.5835, + "step": 1146 + }, + { + "epoch": 1.2829977628635347, + "grad_norm": 0.47126954793930054, + "learning_rate": 4.52761687798883e-06, + "loss": 0.5503, + "step": 1147 + }, + { + "epoch": 1.284116331096197, + "grad_norm": 0.46396777033805847, + "learning_rate": 4.526743714379934e-06, + "loss": 0.5669, + "step": 1148 + }, + { + "epoch": 1.285234899328859, + "grad_norm": 0.4775722920894623, + "learning_rate": 4.5258698288880535e-06, + "loss": 0.5727, + "step": 1149 + }, + { + "epoch": 1.2863534675615211, + "grad_norm": 0.5126634240150452, + "learning_rate": 4.524995221824445e-06, + "loss": 0.6249, + "step": 1150 + }, + { + "epoch": 1.2874720357941833, + "grad_norm": 0.47814854979515076, + "learning_rate": 4.524119893500627e-06, + "loss": 0.5881, + "step": 1151 + }, + { + "epoch": 1.2885906040268456, + "grad_norm": 0.47940847277641296, + "learning_rate": 4.523243844228372e-06, + "loss": 0.562, + "step": 1152 + }, + { + "epoch": 1.2897091722595078, + "grad_norm": 0.48836034536361694, + "learning_rate": 4.52236707431971e-06, + "loss": 0.5812, + "step": 1153 + }, + { + "epoch": 1.29082774049217, + "grad_norm": 0.47201383113861084, + "learning_rate": 4.521489584086929e-06, + "loss": 0.5998, + "step": 1154 + }, + { + "epoch": 1.2919463087248322, + "grad_norm": 0.4711892604827881, + "learning_rate": 4.52061137384257e-06, + "loss": 0.5868, + "step": 1155 + }, + { + "epoch": 1.2930648769574944, + "grad_norm": 0.49769148230552673, + "learning_rate": 4.519732443899435e-06, + "loss": 0.6405, + "step": 1156 + }, + { + "epoch": 1.2941834451901566, + "grad_norm": 0.4788062870502472, + "learning_rate": 4.51885279457058e-06, + "loss": 0.5838, + "step": 1157 + }, + { + "epoch": 1.2953020134228188, + "grad_norm": 0.47234484553337097, + "learning_rate": 4.5179724261693154e-06, + "loss": 0.5794, + "step": 1158 + }, + { + "epoch": 1.296420581655481, + "grad_norm": 0.464893639087677, + "learning_rate": 4.517091339009212e-06, + "loss": 0.5741, + "step": 1159 + }, + { + "epoch": 1.2975391498881432, + "grad_norm": 0.47124040126800537, + "learning_rate": 4.516209533404092e-06, + "loss": 0.5974, + "step": 1160 + }, + { + "epoch": 1.2986577181208054, + "grad_norm": 0.470726877450943, + "learning_rate": 4.5153270096680395e-06, + "loss": 0.5686, + "step": 1161 + }, + { + "epoch": 1.2997762863534676, + "grad_norm": 0.4629981815814972, + "learning_rate": 4.514443768115386e-06, + "loss": 0.5794, + "step": 1162 + }, + { + "epoch": 1.3008948545861299, + "grad_norm": 0.4796271026134491, + "learning_rate": 4.513559809060727e-06, + "loss": 0.5741, + "step": 1163 + }, + { + "epoch": 1.302013422818792, + "grad_norm": 0.4912780523300171, + "learning_rate": 4.512675132818908e-06, + "loss": 0.5726, + "step": 1164 + }, + { + "epoch": 1.3031319910514543, + "grad_norm": 0.4748340845108032, + "learning_rate": 4.511789739705033e-06, + "loss": 0.5788, + "step": 1165 + }, + { + "epoch": 1.3042505592841163, + "grad_norm": 0.47445476055145264, + "learning_rate": 4.510903630034458e-06, + "loss": 0.5924, + "step": 1166 + }, + { + "epoch": 1.3053691275167785, + "grad_norm": 0.46978822350502014, + "learning_rate": 4.510016804122799e-06, + "loss": 0.5525, + "step": 1167 + }, + { + "epoch": 1.3064876957494407, + "grad_norm": 0.48169562220573425, + "learning_rate": 4.509129262285924e-06, + "loss": 0.5944, + "step": 1168 + }, + { + "epoch": 1.307606263982103, + "grad_norm": 0.4853328764438629, + "learning_rate": 4.5082410048399555e-06, + "loss": 0.5774, + "step": 1169 + }, + { + "epoch": 1.308724832214765, + "grad_norm": 0.4685922861099243, + "learning_rate": 4.507352032101273e-06, + "loss": 0.583, + "step": 1170 + }, + { + "epoch": 1.3098434004474273, + "grad_norm": 0.466190904378891, + "learning_rate": 4.5064623443865085e-06, + "loss": 0.5676, + "step": 1171 + }, + { + "epoch": 1.3109619686800895, + "grad_norm": 0.4621398448944092, + "learning_rate": 4.50557194201255e-06, + "loss": 0.5674, + "step": 1172 + }, + { + "epoch": 1.3120805369127517, + "grad_norm": 0.48330026865005493, + "learning_rate": 4.504680825296542e-06, + "loss": 0.588, + "step": 1173 + }, + { + "epoch": 1.313199105145414, + "grad_norm": 0.48152461647987366, + "learning_rate": 4.503788994555878e-06, + "loss": 0.5975, + "step": 1174 + }, + { + "epoch": 1.3143176733780761, + "grad_norm": 0.47018247842788696, + "learning_rate": 4.502896450108211e-06, + "loss": 0.5707, + "step": 1175 + }, + { + "epoch": 1.3154362416107381, + "grad_norm": 0.48568496108055115, + "learning_rate": 4.502003192271447e-06, + "loss": 0.5923, + "step": 1176 + }, + { + "epoch": 1.3165548098434003, + "grad_norm": 0.5025852918624878, + "learning_rate": 4.501109221363744e-06, + "loss": 0.6094, + "step": 1177 + }, + { + "epoch": 1.3176733780760626, + "grad_norm": 0.46192529797554016, + "learning_rate": 4.500214537703515e-06, + "loss": 0.5815, + "step": 1178 + }, + { + "epoch": 1.3187919463087248, + "grad_norm": 0.4682995080947876, + "learning_rate": 4.499319141609429e-06, + "loss": 0.5526, + "step": 1179 + }, + { + "epoch": 1.319910514541387, + "grad_norm": 0.4849640130996704, + "learning_rate": 4.498423033400408e-06, + "loss": 0.6011, + "step": 1180 + }, + { + "epoch": 1.3210290827740492, + "grad_norm": 0.4680434465408325, + "learning_rate": 4.4975262133956235e-06, + "loss": 0.5763, + "step": 1181 + }, + { + "epoch": 1.3221476510067114, + "grad_norm": 0.47362959384918213, + "learning_rate": 4.496628681914505e-06, + "loss": 0.5793, + "step": 1182 + }, + { + "epoch": 1.3232662192393736, + "grad_norm": 0.48274803161621094, + "learning_rate": 4.495730439276734e-06, + "loss": 0.5806, + "step": 1183 + }, + { + "epoch": 1.3243847874720358, + "grad_norm": 0.5004247426986694, + "learning_rate": 4.4948314858022456e-06, + "loss": 0.5982, + "step": 1184 + }, + { + "epoch": 1.325503355704698, + "grad_norm": 0.4769462049007416, + "learning_rate": 4.4939318218112284e-06, + "loss": 0.593, + "step": 1185 + }, + { + "epoch": 1.3266219239373602, + "grad_norm": 0.4818820357322693, + "learning_rate": 4.493031447624125e-06, + "loss": 0.5999, + "step": 1186 + }, + { + "epoch": 1.3277404921700224, + "grad_norm": 0.4788765609264374, + "learning_rate": 4.492130363561625e-06, + "loss": 0.5978, + "step": 1187 + }, + { + "epoch": 1.3288590604026846, + "grad_norm": 0.4671361446380615, + "learning_rate": 4.491228569944679e-06, + "loss": 0.5715, + "step": 1188 + }, + { + "epoch": 1.3299776286353469, + "grad_norm": 0.4770296514034271, + "learning_rate": 4.4903260670944855e-06, + "loss": 0.5913, + "step": 1189 + }, + { + "epoch": 1.331096196868009, + "grad_norm": 0.47733429074287415, + "learning_rate": 4.489422855332497e-06, + "loss": 0.5914, + "step": 1190 + }, + { + "epoch": 1.3322147651006713, + "grad_norm": 0.49386340379714966, + "learning_rate": 4.488518934980419e-06, + "loss": 0.5866, + "step": 1191 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4807666838169098, + "learning_rate": 4.487614306360208e-06, + "loss": 0.5769, + "step": 1192 + }, + { + "epoch": 1.3344519015659955, + "grad_norm": 0.46346330642700195, + "learning_rate": 4.4867089697940735e-06, + "loss": 0.5889, + "step": 1193 + }, + { + "epoch": 1.3355704697986577, + "grad_norm": 0.46165117621421814, + "learning_rate": 4.485802925604476e-06, + "loss": 0.5638, + "step": 1194 + }, + { + "epoch": 1.3366890380313199, + "grad_norm": 0.4789906442165375, + "learning_rate": 4.484896174114132e-06, + "loss": 0.5984, + "step": 1195 + }, + { + "epoch": 1.337807606263982, + "grad_norm": 0.47423994541168213, + "learning_rate": 4.4839887156460045e-06, + "loss": 0.6043, + "step": 1196 + }, + { + "epoch": 1.3389261744966443, + "grad_norm": 0.45627936720848083, + "learning_rate": 4.4830805505233125e-06, + "loss": 0.5781, + "step": 1197 + }, + { + "epoch": 1.3400447427293065, + "grad_norm": 0.49102070927619934, + "learning_rate": 4.482171679069524e-06, + "loss": 0.5854, + "step": 1198 + }, + { + "epoch": 1.3411633109619687, + "grad_norm": 0.48892414569854736, + "learning_rate": 4.48126210160836e-06, + "loss": 0.6194, + "step": 1199 + }, + { + "epoch": 1.342281879194631, + "grad_norm": 0.481579065322876, + "learning_rate": 4.480351818463793e-06, + "loss": 0.5901, + "step": 1200 + }, + { + "epoch": 1.3434004474272931, + "grad_norm": 0.4813998341560364, + "learning_rate": 4.479440829960045e-06, + "loss": 0.6095, + "step": 1201 + }, + { + "epoch": 1.3445190156599551, + "grad_norm": 0.466458261013031, + "learning_rate": 4.478529136421593e-06, + "loss": 0.5481, + "step": 1202 + }, + { + "epoch": 1.3456375838926173, + "grad_norm": 0.45974788069725037, + "learning_rate": 4.477616738173162e-06, + "loss": 0.5832, + "step": 1203 + }, + { + "epoch": 1.3467561521252795, + "grad_norm": 0.4749172329902649, + "learning_rate": 4.476703635539728e-06, + "loss": 0.6021, + "step": 1204 + }, + { + "epoch": 1.3478747203579418, + "grad_norm": 0.47382864356040955, + "learning_rate": 4.475789828846519e-06, + "loss": 0.5936, + "step": 1205 + }, + { + "epoch": 1.348993288590604, + "grad_norm": 0.4810637831687927, + "learning_rate": 4.474875318419015e-06, + "loss": 0.5914, + "step": 1206 + }, + { + "epoch": 1.3501118568232662, + "grad_norm": 0.46880605816841125, + "learning_rate": 4.473960104582943e-06, + "loss": 0.569, + "step": 1207 + }, + { + "epoch": 1.3512304250559284, + "grad_norm": 0.4803348183631897, + "learning_rate": 4.473044187664284e-06, + "loss": 0.5942, + "step": 1208 + }, + { + "epoch": 1.3523489932885906, + "grad_norm": 0.4822969436645508, + "learning_rate": 4.472127567989268e-06, + "loss": 0.5893, + "step": 1209 + }, + { + "epoch": 1.3534675615212528, + "grad_norm": 0.5106329321861267, + "learning_rate": 4.4712102458843755e-06, + "loss": 0.5954, + "step": 1210 + }, + { + "epoch": 1.354586129753915, + "grad_norm": 0.4909040629863739, + "learning_rate": 4.470292221676336e-06, + "loss": 0.5968, + "step": 1211 + }, + { + "epoch": 1.3557046979865772, + "grad_norm": 0.48394322395324707, + "learning_rate": 4.469373495692132e-06, + "loss": 0.5516, + "step": 1212 + }, + { + "epoch": 1.3568232662192394, + "grad_norm": 0.46607136726379395, + "learning_rate": 4.4684540682589925e-06, + "loss": 0.5588, + "step": 1213 + }, + { + "epoch": 1.3579418344519016, + "grad_norm": 0.48817023634910583, + "learning_rate": 4.467533939704398e-06, + "loss": 0.5877, + "step": 1214 + }, + { + "epoch": 1.3590604026845639, + "grad_norm": 0.4760921895503998, + "learning_rate": 4.466613110356081e-06, + "loss": 0.5482, + "step": 1215 + }, + { + "epoch": 1.360178970917226, + "grad_norm": 0.49914559721946716, + "learning_rate": 4.465691580542019e-06, + "loss": 0.6012, + "step": 1216 + }, + { + "epoch": 1.3612975391498883, + "grad_norm": 0.4916679561138153, + "learning_rate": 4.464769350590441e-06, + "loss": 0.6107, + "step": 1217 + }, + { + "epoch": 1.3624161073825503, + "grad_norm": 0.47448763251304626, + "learning_rate": 4.463846420829828e-06, + "loss": 0.5555, + "step": 1218 + }, + { + "epoch": 1.3635346756152125, + "grad_norm": 0.48127514123916626, + "learning_rate": 4.462922791588906e-06, + "loss": 0.5826, + "step": 1219 + }, + { + "epoch": 1.3646532438478747, + "grad_norm": 0.4912678003311157, + "learning_rate": 4.461998463196653e-06, + "loss": 0.5727, + "step": 1220 + }, + { + "epoch": 1.3657718120805369, + "grad_norm": 0.4831150472164154, + "learning_rate": 4.461073435982295e-06, + "loss": 0.6161, + "step": 1221 + }, + { + "epoch": 1.366890380313199, + "grad_norm": 0.4749965965747833, + "learning_rate": 4.460147710275306e-06, + "loss": 0.6009, + "step": 1222 + }, + { + "epoch": 1.3680089485458613, + "grad_norm": 0.5110952854156494, + "learning_rate": 4.459221286405411e-06, + "loss": 0.6029, + "step": 1223 + }, + { + "epoch": 1.3691275167785235, + "grad_norm": 0.4717468321323395, + "learning_rate": 4.458294164702582e-06, + "loss": 0.5699, + "step": 1224 + }, + { + "epoch": 1.3702460850111857, + "grad_norm": 0.4701862633228302, + "learning_rate": 4.457366345497041e-06, + "loss": 0.5575, + "step": 1225 + }, + { + "epoch": 1.371364653243848, + "grad_norm": 0.47513946890830994, + "learning_rate": 4.456437829119256e-06, + "loss": 0.565, + "step": 1226 + }, + { + "epoch": 1.3724832214765101, + "grad_norm": 0.4824097752571106, + "learning_rate": 4.455508615899945e-06, + "loss": 0.5883, + "step": 1227 + }, + { + "epoch": 1.3736017897091721, + "grad_norm": 0.4734891355037689, + "learning_rate": 4.454578706170075e-06, + "loss": 0.5682, + "step": 1228 + }, + { + "epoch": 1.3747203579418343, + "grad_norm": 0.4862266182899475, + "learning_rate": 4.453648100260859e-06, + "loss": 0.5886, + "step": 1229 + }, + { + "epoch": 1.3758389261744965, + "grad_norm": 0.4855569005012512, + "learning_rate": 4.452716798503759e-06, + "loss": 0.5971, + "step": 1230 + }, + { + "epoch": 1.3769574944071588, + "grad_norm": 0.4638337194919586, + "learning_rate": 4.451784801230487e-06, + "loss": 0.578, + "step": 1231 + }, + { + "epoch": 1.378076062639821, + "grad_norm": 0.47969067096710205, + "learning_rate": 4.4508521087729975e-06, + "loss": 0.6131, + "step": 1232 + }, + { + "epoch": 1.3791946308724832, + "grad_norm": 0.45764845609664917, + "learning_rate": 4.449918721463497e-06, + "loss": 0.5439, + "step": 1233 + }, + { + "epoch": 1.3803131991051454, + "grad_norm": 0.4725312292575836, + "learning_rate": 4.448984639634439e-06, + "loss": 0.5774, + "step": 1234 + }, + { + "epoch": 1.3814317673378076, + "grad_norm": 0.4911689758300781, + "learning_rate": 4.448049863618522e-06, + "loss": 0.5891, + "step": 1235 + }, + { + "epoch": 1.3825503355704698, + "grad_norm": 0.47672075033187866, + "learning_rate": 4.447114393748694e-06, + "loss": 0.5706, + "step": 1236 + }, + { + "epoch": 1.383668903803132, + "grad_norm": 0.4902832806110382, + "learning_rate": 4.446178230358151e-06, + "loss": 0.6043, + "step": 1237 + }, + { + "epoch": 1.3847874720357942, + "grad_norm": 0.496720552444458, + "learning_rate": 4.445241373780333e-06, + "loss": 0.5831, + "step": 1238 + }, + { + "epoch": 1.3859060402684564, + "grad_norm": 0.4805045425891876, + "learning_rate": 4.444303824348927e-06, + "loss": 0.5769, + "step": 1239 + }, + { + "epoch": 1.3870246085011186, + "grad_norm": 0.48103493452072144, + "learning_rate": 4.44336558239787e-06, + "loss": 0.5778, + "step": 1240 + }, + { + "epoch": 1.3881431767337808, + "grad_norm": 0.4719166159629822, + "learning_rate": 4.4424266482613445e-06, + "loss": 0.5666, + "step": 1241 + }, + { + "epoch": 1.389261744966443, + "grad_norm": 0.4719798266887665, + "learning_rate": 4.4414870222737775e-06, + "loss": 0.5579, + "step": 1242 + }, + { + "epoch": 1.3903803131991053, + "grad_norm": 0.49560073018074036, + "learning_rate": 4.440546704769845e-06, + "loss": 0.5858, + "step": 1243 + }, + { + "epoch": 1.3914988814317675, + "grad_norm": 0.4905194044113159, + "learning_rate": 4.439605696084465e-06, + "loss": 0.5747, + "step": 1244 + }, + { + "epoch": 1.3926174496644295, + "grad_norm": 0.502269446849823, + "learning_rate": 4.438663996552809e-06, + "loss": 0.5965, + "step": 1245 + }, + { + "epoch": 1.3937360178970917, + "grad_norm": 0.47396889328956604, + "learning_rate": 4.437721606510288e-06, + "loss": 0.6069, + "step": 1246 + }, + { + "epoch": 1.3948545861297539, + "grad_norm": 0.4756862223148346, + "learning_rate": 4.436778526292561e-06, + "loss": 0.5563, + "step": 1247 + }, + { + "epoch": 1.395973154362416, + "grad_norm": 0.47935912013053894, + "learning_rate": 4.435834756235534e-06, + "loss": 0.5811, + "step": 1248 + }, + { + "epoch": 1.3970917225950783, + "grad_norm": 0.47267550230026245, + "learning_rate": 4.434890296675358e-06, + "loss": 0.5477, + "step": 1249 + }, + { + "epoch": 1.3982102908277405, + "grad_norm": 0.4694240391254425, + "learning_rate": 4.433945147948428e-06, + "loss": 0.5586, + "step": 1250 + }, + { + "epoch": 1.3993288590604027, + "grad_norm": 0.47105926275253296, + "learning_rate": 4.432999310391387e-06, + "loss": 0.5684, + "step": 1251 + }, + { + "epoch": 1.400447427293065, + "grad_norm": 0.4700409173965454, + "learning_rate": 4.432052784341122e-06, + "loss": 0.5748, + "step": 1252 + }, + { + "epoch": 1.4015659955257271, + "grad_norm": 0.47937822341918945, + "learning_rate": 4.431105570134766e-06, + "loss": 0.5638, + "step": 1253 + }, + { + "epoch": 1.4026845637583891, + "grad_norm": 0.47045087814331055, + "learning_rate": 4.4301576681096956e-06, + "loss": 0.5597, + "step": 1254 + }, + { + "epoch": 1.4038031319910513, + "grad_norm": 0.48519641160964966, + "learning_rate": 4.429209078603534e-06, + "loss": 0.555, + "step": 1255 + }, + { + "epoch": 1.4049217002237135, + "grad_norm": 0.4835051894187927, + "learning_rate": 4.428259801954148e-06, + "loss": 0.6021, + "step": 1256 + }, + { + "epoch": 1.4060402684563758, + "grad_norm": 0.49831488728523254, + "learning_rate": 4.427309838499651e-06, + "loss": 0.5739, + "step": 1257 + }, + { + "epoch": 1.407158836689038, + "grad_norm": 0.5223401784896851, + "learning_rate": 4.4263591885783976e-06, + "loss": 0.5881, + "step": 1258 + }, + { + "epoch": 1.4082774049217002, + "grad_norm": 0.47920864820480347, + "learning_rate": 4.425407852528991e-06, + "loss": 0.5981, + "step": 1259 + }, + { + "epoch": 1.4093959731543624, + "grad_norm": 0.5153250694274902, + "learning_rate": 4.424455830690275e-06, + "loss": 0.5715, + "step": 1260 + }, + { + "epoch": 1.4105145413870246, + "grad_norm": 0.48909991979599, + "learning_rate": 4.4235031234013425e-06, + "loss": 0.602, + "step": 1261 + }, + { + "epoch": 1.4116331096196868, + "grad_norm": 0.4935949444770813, + "learning_rate": 4.422549731001524e-06, + "loss": 0.6008, + "step": 1262 + }, + { + "epoch": 1.412751677852349, + "grad_norm": 0.49626025557518005, + "learning_rate": 4.421595653830401e-06, + "loss": 0.6158, + "step": 1263 + }, + { + "epoch": 1.4138702460850112, + "grad_norm": 0.4757371246814728, + "learning_rate": 4.420640892227793e-06, + "loss": 0.573, + "step": 1264 + }, + { + "epoch": 1.4149888143176734, + "grad_norm": 0.4638062119483948, + "learning_rate": 4.4196854465337664e-06, + "loss": 0.5808, + "step": 1265 + }, + { + "epoch": 1.4161073825503356, + "grad_norm": 0.4798492193222046, + "learning_rate": 4.418729317088631e-06, + "loss": 0.5706, + "step": 1266 + }, + { + "epoch": 1.4172259507829978, + "grad_norm": 0.4708186388015747, + "learning_rate": 4.41777250423294e-06, + "loss": 0.576, + "step": 1267 + }, + { + "epoch": 1.41834451901566, + "grad_norm": 0.49962857365608215, + "learning_rate": 4.416815008307488e-06, + "loss": 0.596, + "step": 1268 + }, + { + "epoch": 1.4194630872483223, + "grad_norm": 0.4764310121536255, + "learning_rate": 4.415856829653318e-06, + "loss": 0.5903, + "step": 1269 + }, + { + "epoch": 1.4205816554809845, + "grad_norm": 0.4788293242454529, + "learning_rate": 4.4148979686117095e-06, + "loss": 0.5803, + "step": 1270 + }, + { + "epoch": 1.4217002237136465, + "grad_norm": 0.45909953117370605, + "learning_rate": 4.41393842552419e-06, + "loss": 0.5782, + "step": 1271 + }, + { + "epoch": 1.4228187919463087, + "grad_norm": 0.4760737717151642, + "learning_rate": 4.412978200732528e-06, + "loss": 0.6256, + "step": 1272 + }, + { + "epoch": 1.4239373601789709, + "grad_norm": 0.485414981842041, + "learning_rate": 4.412017294578737e-06, + "loss": 0.5895, + "step": 1273 + }, + { + "epoch": 1.425055928411633, + "grad_norm": 0.4804149568080902, + "learning_rate": 4.411055707405068e-06, + "loss": 0.5755, + "step": 1274 + }, + { + "epoch": 1.4261744966442953, + "grad_norm": 0.4825088679790497, + "learning_rate": 4.410093439554019e-06, + "loss": 0.5884, + "step": 1275 + }, + { + "epoch": 1.4272930648769575, + "grad_norm": 0.49632760882377625, + "learning_rate": 4.409130491368331e-06, + "loss": 0.6117, + "step": 1276 + }, + { + "epoch": 1.4284116331096197, + "grad_norm": 0.4739936888217926, + "learning_rate": 4.408166863190983e-06, + "loss": 0.5696, + "step": 1277 + }, + { + "epoch": 1.429530201342282, + "grad_norm": 0.5045065879821777, + "learning_rate": 4.407202555365202e-06, + "loss": 0.6204, + "step": 1278 + }, + { + "epoch": 1.4306487695749441, + "grad_norm": 0.4909072816371918, + "learning_rate": 4.406237568234451e-06, + "loss": 0.5891, + "step": 1279 + }, + { + "epoch": 1.4317673378076063, + "grad_norm": 0.4925367534160614, + "learning_rate": 4.4052719021424395e-06, + "loss": 0.5963, + "step": 1280 + }, + { + "epoch": 1.4328859060402683, + "grad_norm": 0.4968665838241577, + "learning_rate": 4.404305557433118e-06, + "loss": 0.5971, + "step": 1281 + }, + { + "epoch": 1.4340044742729305, + "grad_norm": 0.49153050780296326, + "learning_rate": 4.403338534450675e-06, + "loss": 0.6104, + "step": 1282 + }, + { + "epoch": 1.4351230425055927, + "grad_norm": 0.47348082065582275, + "learning_rate": 4.4023708335395455e-06, + "loss": 0.5746, + "step": 1283 + }, + { + "epoch": 1.436241610738255, + "grad_norm": 0.47999560832977295, + "learning_rate": 4.401402455044405e-06, + "loss": 0.6035, + "step": 1284 + }, + { + "epoch": 1.4373601789709172, + "grad_norm": 0.4812726080417633, + "learning_rate": 4.4004333993101665e-06, + "loss": 0.5946, + "step": 1285 + }, + { + "epoch": 1.4384787472035794, + "grad_norm": 0.4776507318019867, + "learning_rate": 4.39946366668199e-06, + "loss": 0.6048, + "step": 1286 + }, + { + "epoch": 1.4395973154362416, + "grad_norm": 0.4812582731246948, + "learning_rate": 4.398493257505271e-06, + "loss": 0.5949, + "step": 1287 + }, + { + "epoch": 1.4407158836689038, + "grad_norm": 0.4620709717273712, + "learning_rate": 4.397522172125651e-06, + "loss": 0.5566, + "step": 1288 + }, + { + "epoch": 1.441834451901566, + "grad_norm": 0.47055062651634216, + "learning_rate": 4.3965504108890075e-06, + "loss": 0.5507, + "step": 1289 + }, + { + "epoch": 1.4429530201342282, + "grad_norm": 0.4810755252838135, + "learning_rate": 4.395577974141464e-06, + "loss": 0.5789, + "step": 1290 + }, + { + "epoch": 1.4440715883668904, + "grad_norm": 0.4749661982059479, + "learning_rate": 4.394604862229379e-06, + "loss": 0.5686, + "step": 1291 + }, + { + "epoch": 1.4451901565995526, + "grad_norm": 0.4877249598503113, + "learning_rate": 4.393631075499356e-06, + "loss": 0.5726, + "step": 1292 + }, + { + "epoch": 1.4463087248322148, + "grad_norm": 0.4861728250980377, + "learning_rate": 4.392656614298236e-06, + "loss": 0.6094, + "step": 1293 + }, + { + "epoch": 1.447427293064877, + "grad_norm": 0.4913595914840698, + "learning_rate": 4.391681478973103e-06, + "loss": 0.5971, + "step": 1294 + }, + { + "epoch": 1.4485458612975393, + "grad_norm": 0.4858112037181854, + "learning_rate": 4.390705669871278e-06, + "loss": 0.5675, + "step": 1295 + }, + { + "epoch": 1.4496644295302015, + "grad_norm": 0.5002705454826355, + "learning_rate": 4.389729187340323e-06, + "loss": 0.5944, + "step": 1296 + }, + { + "epoch": 1.4507829977628635, + "grad_norm": 0.4777258038520813, + "learning_rate": 4.388752031728042e-06, + "loss": 0.6017, + "step": 1297 + }, + { + "epoch": 1.4519015659955257, + "grad_norm": 0.4859464466571808, + "learning_rate": 4.387774203382476e-06, + "loss": 0.5914, + "step": 1298 + }, + { + "epoch": 1.4530201342281879, + "grad_norm": 0.4924688935279846, + "learning_rate": 4.386795702651907e-06, + "loss": 0.5778, + "step": 1299 + }, + { + "epoch": 1.45413870246085, + "grad_norm": 0.4761926531791687, + "learning_rate": 4.385816529884856e-06, + "loss": 0.5977, + "step": 1300 + }, + { + "epoch": 1.4552572706935123, + "grad_norm": 0.4806959331035614, + "learning_rate": 4.3848366854300834e-06, + "loss": 0.5986, + "step": 1301 + }, + { + "epoch": 1.4563758389261745, + "grad_norm": 0.4975976049900055, + "learning_rate": 4.383856169636589e-06, + "loss": 0.5788, + "step": 1302 + }, + { + "epoch": 1.4574944071588367, + "grad_norm": 0.4763396382331848, + "learning_rate": 4.382874982853611e-06, + "loss": 0.5679, + "step": 1303 + }, + { + "epoch": 1.458612975391499, + "grad_norm": 0.47432610392570496, + "learning_rate": 4.381893125430629e-06, + "loss": 0.566, + "step": 1304 + }, + { + "epoch": 1.4597315436241611, + "grad_norm": 0.4684997498989105, + "learning_rate": 4.380910597717357e-06, + "loss": 0.578, + "step": 1305 + }, + { + "epoch": 1.4608501118568233, + "grad_norm": 0.4823257327079773, + "learning_rate": 4.379927400063754e-06, + "loss": 0.5848, + "step": 1306 + }, + { + "epoch": 1.4619686800894853, + "grad_norm": 0.47135570645332336, + "learning_rate": 4.378943532820011e-06, + "loss": 0.5781, + "step": 1307 + }, + { + "epoch": 1.4630872483221475, + "grad_norm": 0.4889175593852997, + "learning_rate": 4.377958996336563e-06, + "loss": 0.5307, + "step": 1308 + }, + { + "epoch": 1.4642058165548097, + "grad_norm": 0.49853044748306274, + "learning_rate": 4.376973790964078e-06, + "loss": 0.5791, + "step": 1309 + }, + { + "epoch": 1.465324384787472, + "grad_norm": 0.47457244992256165, + "learning_rate": 4.375987917053468e-06, + "loss": 0.5552, + "step": 1310 + }, + { + "epoch": 1.4664429530201342, + "grad_norm": 0.4792843163013458, + "learning_rate": 4.37500137495588e-06, + "loss": 0.5753, + "step": 1311 + }, + { + "epoch": 1.4675615212527964, + "grad_norm": 0.4924052059650421, + "learning_rate": 4.3740141650226975e-06, + "loss": 0.6032, + "step": 1312 + }, + { + "epoch": 1.4686800894854586, + "grad_norm": 0.485517680644989, + "learning_rate": 4.373026287605545e-06, + "loss": 0.5956, + "step": 1313 + }, + { + "epoch": 1.4697986577181208, + "grad_norm": 0.5009534358978271, + "learning_rate": 4.372037743056283e-06, + "loss": 0.587, + "step": 1314 + }, + { + "epoch": 1.470917225950783, + "grad_norm": 0.5067222118377686, + "learning_rate": 4.371048531727009e-06, + "loss": 0.5663, + "step": 1315 + }, + { + "epoch": 1.4720357941834452, + "grad_norm": 0.48260119557380676, + "learning_rate": 4.370058653970062e-06, + "loss": 0.5736, + "step": 1316 + }, + { + "epoch": 1.4731543624161074, + "grad_norm": 0.4996930658817291, + "learning_rate": 4.369068110138013e-06, + "loss": 0.5796, + "step": 1317 + }, + { + "epoch": 1.4742729306487696, + "grad_norm": 0.501705527305603, + "learning_rate": 4.368076900583673e-06, + "loss": 0.6125, + "step": 1318 + }, + { + "epoch": 1.4753914988814318, + "grad_norm": 0.4803929328918457, + "learning_rate": 4.36708502566009e-06, + "loss": 0.5737, + "step": 1319 + }, + { + "epoch": 1.476510067114094, + "grad_norm": 0.4732499122619629, + "learning_rate": 4.366092485720549e-06, + "loss": 0.597, + "step": 1320 + }, + { + "epoch": 1.4776286353467563, + "grad_norm": 0.4887276589870453, + "learning_rate": 4.365099281118571e-06, + "loss": 0.5893, + "step": 1321 + }, + { + "epoch": 1.4787472035794185, + "grad_norm": 0.48169007897377014, + "learning_rate": 4.364105412207914e-06, + "loss": 0.6035, + "step": 1322 + }, + { + "epoch": 1.4798657718120805, + "grad_norm": 0.4954248070716858, + "learning_rate": 4.363110879342575e-06, + "loss": 0.6149, + "step": 1323 + }, + { + "epoch": 1.4809843400447427, + "grad_norm": 0.47244513034820557, + "learning_rate": 4.362115682876783e-06, + "loss": 0.5794, + "step": 1324 + }, + { + "epoch": 1.4821029082774049, + "grad_norm": 0.48339948058128357, + "learning_rate": 4.361119823165007e-06, + "loss": 0.5821, + "step": 1325 + }, + { + "epoch": 1.483221476510067, + "grad_norm": 0.49129050970077515, + "learning_rate": 4.3601233005619515e-06, + "loss": 0.5851, + "step": 1326 + }, + { + "epoch": 1.4843400447427293, + "grad_norm": 0.4731924533843994, + "learning_rate": 4.359126115422555e-06, + "loss": 0.55, + "step": 1327 + }, + { + "epoch": 1.4854586129753915, + "grad_norm": 0.4816952347755432, + "learning_rate": 4.358128268101996e-06, + "loss": 0.5781, + "step": 1328 + }, + { + "epoch": 1.4865771812080537, + "grad_norm": 0.4917982518672943, + "learning_rate": 4.357129758955685e-06, + "loss": 0.6017, + "step": 1329 + }, + { + "epoch": 1.487695749440716, + "grad_norm": 0.5033697485923767, + "learning_rate": 4.356130588339269e-06, + "loss": 0.5783, + "step": 1330 + }, + { + "epoch": 1.4888143176733781, + "grad_norm": 0.48702743649482727, + "learning_rate": 4.355130756608632e-06, + "loss": 0.5757, + "step": 1331 + }, + { + "epoch": 1.4899328859060403, + "grad_norm": 0.49924343824386597, + "learning_rate": 4.354130264119894e-06, + "loss": 0.5985, + "step": 1332 + }, + { + "epoch": 1.4910514541387023, + "grad_norm": 0.4950959384441376, + "learning_rate": 4.353129111229408e-06, + "loss": 0.5688, + "step": 1333 + }, + { + "epoch": 1.4921700223713645, + "grad_norm": 0.47743096947669983, + "learning_rate": 4.352127298293764e-06, + "loss": 0.5655, + "step": 1334 + }, + { + "epoch": 1.4932885906040267, + "grad_norm": 0.4949822723865509, + "learning_rate": 4.351124825669785e-06, + "loss": 0.5915, + "step": 1335 + }, + { + "epoch": 1.494407158836689, + "grad_norm": 0.48831161856651306, + "learning_rate": 4.350121693714531e-06, + "loss": 0.5983, + "step": 1336 + }, + { + "epoch": 1.4955257270693512, + "grad_norm": 0.5018520355224609, + "learning_rate": 4.349117902785297e-06, + "loss": 0.5872, + "step": 1337 + }, + { + "epoch": 1.4966442953020134, + "grad_norm": 0.49204087257385254, + "learning_rate": 4.3481134532396116e-06, + "loss": 0.5895, + "step": 1338 + }, + { + "epoch": 1.4977628635346756, + "grad_norm": 0.47245603799819946, + "learning_rate": 4.347108345435238e-06, + "loss": 0.5585, + "step": 1339 + }, + { + "epoch": 1.4988814317673378, + "grad_norm": 0.4722137749195099, + "learning_rate": 4.3461025797301745e-06, + "loss": 0.5454, + "step": 1340 + }, + { + "epoch": 1.5, + "grad_norm": 0.4701415002346039, + "learning_rate": 4.345096156482655e-06, + "loss": 0.565, + "step": 1341 + }, + { + "epoch": 1.5011185682326622, + "grad_norm": 0.4957852065563202, + "learning_rate": 4.344089076051143e-06, + "loss": 0.6197, + "step": 1342 + }, + { + "epoch": 1.5022371364653244, + "grad_norm": 0.4819275140762329, + "learning_rate": 4.3430813387943405e-06, + "loss": 0.5635, + "step": 1343 + }, + { + "epoch": 1.5033557046979866, + "grad_norm": 0.5286908149719238, + "learning_rate": 4.342072945071183e-06, + "loss": 0.6067, + "step": 1344 + }, + { + "epoch": 1.5044742729306488, + "grad_norm": 0.4800012409687042, + "learning_rate": 4.3410638952408375e-06, + "loss": 0.58, + "step": 1345 + }, + { + "epoch": 1.505592841163311, + "grad_norm": 0.4997807443141937, + "learning_rate": 4.340054189662707e-06, + "loss": 0.5966, + "step": 1346 + }, + { + "epoch": 1.5067114093959733, + "grad_norm": 0.4921199083328247, + "learning_rate": 4.339043828696427e-06, + "loss": 0.5876, + "step": 1347 + }, + { + "epoch": 1.5078299776286355, + "grad_norm": 0.47939038276672363, + "learning_rate": 4.3380328127018666e-06, + "loss": 0.5758, + "step": 1348 + }, + { + "epoch": 1.5089485458612977, + "grad_norm": 0.5145376324653625, + "learning_rate": 4.337021142039127e-06, + "loss": 0.5815, + "step": 1349 + }, + { + "epoch": 1.5100671140939599, + "grad_norm": 0.4857310652732849, + "learning_rate": 4.336008817068546e-06, + "loss": 0.5686, + "step": 1350 + }, + { + "epoch": 1.5111856823266219, + "grad_norm": 0.4951683580875397, + "learning_rate": 4.33499583815069e-06, + "loss": 0.5872, + "step": 1351 + }, + { + "epoch": 1.512304250559284, + "grad_norm": 0.5085970163345337, + "learning_rate": 4.3339822056463624e-06, + "loss": 0.605, + "step": 1352 + }, + { + "epoch": 1.5134228187919463, + "grad_norm": 0.5023934841156006, + "learning_rate": 4.332967919916596e-06, + "loss": 0.5969, + "step": 1353 + }, + { + "epoch": 1.5145413870246085, + "grad_norm": 0.5029785633087158, + "learning_rate": 4.331952981322658e-06, + "loss": 0.5963, + "step": 1354 + }, + { + "epoch": 1.5156599552572707, + "grad_norm": 0.49423643946647644, + "learning_rate": 4.330937390226049e-06, + "loss": 0.5591, + "step": 1355 + }, + { + "epoch": 1.516778523489933, + "grad_norm": 0.48507627844810486, + "learning_rate": 4.3299211469885e-06, + "loss": 0.5816, + "step": 1356 + }, + { + "epoch": 1.5178970917225951, + "grad_norm": 0.496153861284256, + "learning_rate": 4.328904251971976e-06, + "loss": 0.5784, + "step": 1357 + }, + { + "epoch": 1.5190156599552571, + "grad_norm": 0.4916113018989563, + "learning_rate": 4.327886705538672e-06, + "loss": 0.5666, + "step": 1358 + }, + { + "epoch": 1.5201342281879193, + "grad_norm": 0.49055805802345276, + "learning_rate": 4.326868508051018e-06, + "loss": 0.5975, + "step": 1359 + }, + { + "epoch": 1.5212527964205815, + "grad_norm": 0.47937023639678955, + "learning_rate": 4.325849659871674e-06, + "loss": 0.5716, + "step": 1360 + }, + { + "epoch": 1.5223713646532437, + "grad_norm": 0.49050694704055786, + "learning_rate": 4.3248301613635306e-06, + "loss": 0.584, + "step": 1361 + }, + { + "epoch": 1.523489932885906, + "grad_norm": 0.4832146167755127, + "learning_rate": 4.323810012889713e-06, + "loss": 0.579, + "step": 1362 + }, + { + "epoch": 1.5246085011185682, + "grad_norm": 0.5009722709655762, + "learning_rate": 4.3227892148135755e-06, + "loss": 0.6014, + "step": 1363 + }, + { + "epoch": 1.5257270693512304, + "grad_norm": 0.4904753267765045, + "learning_rate": 4.321767767498705e-06, + "loss": 0.6009, + "step": 1364 + }, + { + "epoch": 1.5268456375838926, + "grad_norm": 0.5005730986595154, + "learning_rate": 4.32074567130892e-06, + "loss": 0.5975, + "step": 1365 + }, + { + "epoch": 1.5279642058165548, + "grad_norm": 0.4825516939163208, + "learning_rate": 4.319722926608268e-06, + "loss": 0.5726, + "step": 1366 + }, + { + "epoch": 1.529082774049217, + "grad_norm": 0.5070571899414062, + "learning_rate": 4.31869953376103e-06, + "loss": 0.6022, + "step": 1367 + }, + { + "epoch": 1.5302013422818792, + "grad_norm": 0.5236569046974182, + "learning_rate": 4.3176754931317154e-06, + "loss": 0.5973, + "step": 1368 + }, + { + "epoch": 1.5313199105145414, + "grad_norm": 0.49975070357322693, + "learning_rate": 4.316650805085068e-06, + "loss": 0.5881, + "step": 1369 + }, + { + "epoch": 1.5324384787472036, + "grad_norm": 0.49533969163894653, + "learning_rate": 4.315625469986058e-06, + "loss": 0.5748, + "step": 1370 + }, + { + "epoch": 1.5335570469798658, + "grad_norm": 0.4943258762359619, + "learning_rate": 4.314599488199889e-06, + "loss": 0.5832, + "step": 1371 + }, + { + "epoch": 1.534675615212528, + "grad_norm": 0.48157885670661926, + "learning_rate": 4.313572860091993e-06, + "loss": 0.5586, + "step": 1372 + }, + { + "epoch": 1.5357941834451903, + "grad_norm": 0.5046592354774475, + "learning_rate": 4.312545586028033e-06, + "loss": 0.5777, + "step": 1373 + }, + { + "epoch": 1.5369127516778525, + "grad_norm": 0.5014451146125793, + "learning_rate": 4.311517666373902e-06, + "loss": 0.5875, + "step": 1374 + }, + { + "epoch": 1.5380313199105147, + "grad_norm": 0.5050190687179565, + "learning_rate": 4.310489101495725e-06, + "loss": 0.6041, + "step": 1375 + }, + { + "epoch": 1.5391498881431769, + "grad_norm": 0.49115487933158875, + "learning_rate": 4.309459891759853e-06, + "loss": 0.5888, + "step": 1376 + }, + { + "epoch": 1.540268456375839, + "grad_norm": 0.5117396712303162, + "learning_rate": 4.308430037532871e-06, + "loss": 0.5689, + "step": 1377 + }, + { + "epoch": 1.541387024608501, + "grad_norm": 0.4883919656276703, + "learning_rate": 4.307399539181587e-06, + "loss": 0.5982, + "step": 1378 + }, + { + "epoch": 1.5425055928411633, + "grad_norm": 0.48665109276771545, + "learning_rate": 4.306368397073046e-06, + "loss": 0.5899, + "step": 1379 + }, + { + "epoch": 1.5436241610738255, + "grad_norm": 0.4853532910346985, + "learning_rate": 4.305336611574518e-06, + "loss": 0.5695, + "step": 1380 + }, + { + "epoch": 1.5447427293064877, + "grad_norm": 0.4923276901245117, + "learning_rate": 4.304304183053502e-06, + "loss": 0.5831, + "step": 1381 + }, + { + "epoch": 1.54586129753915, + "grad_norm": 0.5048404335975647, + "learning_rate": 4.303271111877729e-06, + "loss": 0.6238, + "step": 1382 + }, + { + "epoch": 1.5469798657718121, + "grad_norm": 0.49105462431907654, + "learning_rate": 4.302237398415156e-06, + "loss": 0.5976, + "step": 1383 + }, + { + "epoch": 1.548098434004474, + "grad_norm": 0.4881378412246704, + "learning_rate": 4.301203043033969e-06, + "loss": 0.5739, + "step": 1384 + }, + { + "epoch": 1.5492170022371363, + "grad_norm": 0.48858603835105896, + "learning_rate": 4.3001680461025844e-06, + "loss": 0.5792, + "step": 1385 + }, + { + "epoch": 1.5503355704697985, + "grad_norm": 0.48439061641693115, + "learning_rate": 4.299132407989646e-06, + "loss": 0.5742, + "step": 1386 + }, + { + "epoch": 1.5514541387024607, + "grad_norm": 0.47701379656791687, + "learning_rate": 4.298096129064026e-06, + "loss": 0.5844, + "step": 1387 + }, + { + "epoch": 1.552572706935123, + "grad_norm": 0.49393609166145325, + "learning_rate": 4.297059209694824e-06, + "loss": 0.5886, + "step": 1388 + }, + { + "epoch": 1.5536912751677852, + "grad_norm": 0.5089290738105774, + "learning_rate": 4.296021650251369e-06, + "loss": 0.5856, + "step": 1389 + }, + { + "epoch": 1.5548098434004474, + "grad_norm": 0.509134829044342, + "learning_rate": 4.294983451103219e-06, + "loss": 0.5939, + "step": 1390 + }, + { + "epoch": 1.5559284116331096, + "grad_norm": 0.5126070976257324, + "learning_rate": 4.293944612620157e-06, + "loss": 0.5692, + "step": 1391 + }, + { + "epoch": 1.5570469798657718, + "grad_norm": 0.5223609805107117, + "learning_rate": 4.2929051351721956e-06, + "loss": 0.599, + "step": 1392 + }, + { + "epoch": 1.558165548098434, + "grad_norm": 0.48266369104385376, + "learning_rate": 4.291865019129575e-06, + "loss": 0.5827, + "step": 1393 + }, + { + "epoch": 1.5592841163310962, + "grad_norm": 0.4978017210960388, + "learning_rate": 4.290824264862761e-06, + "loss": 0.5994, + "step": 1394 + }, + { + "epoch": 1.5604026845637584, + "grad_norm": 0.5075487494468689, + "learning_rate": 4.2897828727424495e-06, + "loss": 0.5641, + "step": 1395 + }, + { + "epoch": 1.5615212527964206, + "grad_norm": 0.5072858929634094, + "learning_rate": 4.2887408431395615e-06, + "loss": 0.5751, + "step": 1396 + }, + { + "epoch": 1.5626398210290828, + "grad_norm": 0.513059139251709, + "learning_rate": 4.287698176425246e-06, + "loss": 0.5909, + "step": 1397 + }, + { + "epoch": 1.563758389261745, + "grad_norm": 0.49780142307281494, + "learning_rate": 4.286654872970879e-06, + "loss": 0.5928, + "step": 1398 + }, + { + "epoch": 1.5648769574944073, + "grad_norm": 0.4992021918296814, + "learning_rate": 4.285610933148062e-06, + "loss": 0.583, + "step": 1399 + }, + { + "epoch": 1.5659955257270695, + "grad_norm": 0.4856356978416443, + "learning_rate": 4.284566357328625e-06, + "loss": 0.6213, + "step": 1400 + }, + { + "epoch": 1.5671140939597317, + "grad_norm": 0.48052698373794556, + "learning_rate": 4.283521145884625e-06, + "loss": 0.5753, + "step": 1401 + }, + { + "epoch": 1.5682326621923939, + "grad_norm": 0.4759695827960968, + "learning_rate": 4.2824752991883415e-06, + "loss": 0.5764, + "step": 1402 + }, + { + "epoch": 1.569351230425056, + "grad_norm": 0.49083444476127625, + "learning_rate": 4.2814288176122846e-06, + "loss": 0.5839, + "step": 1403 + }, + { + "epoch": 1.570469798657718, + "grad_norm": 0.4828498661518097, + "learning_rate": 4.280381701529187e-06, + "loss": 0.5734, + "step": 1404 + }, + { + "epoch": 1.5715883668903803, + "grad_norm": 0.4739566147327423, + "learning_rate": 4.27933395131201e-06, + "loss": 0.5958, + "step": 1405 + }, + { + "epoch": 1.5727069351230425, + "grad_norm": 0.4890132546424866, + "learning_rate": 4.278285567333942e-06, + "loss": 0.556, + "step": 1406 + }, + { + "epoch": 1.5738255033557047, + "grad_norm": 0.49774521589279175, + "learning_rate": 4.277236549968392e-06, + "loss": 0.5779, + "step": 1407 + }, + { + "epoch": 1.574944071588367, + "grad_norm": 0.5137577652931213, + "learning_rate": 4.276186899588999e-06, + "loss": 0.5947, + "step": 1408 + }, + { + "epoch": 1.5760626398210291, + "grad_norm": 0.5089177489280701, + "learning_rate": 4.275136616569626e-06, + "loss": 0.5825, + "step": 1409 + }, + { + "epoch": 1.5771812080536913, + "grad_norm": 0.49170440435409546, + "learning_rate": 4.2740857012843625e-06, + "loss": 0.5887, + "step": 1410 + }, + { + "epoch": 1.5782997762863533, + "grad_norm": 0.48788514733314514, + "learning_rate": 4.27303415410752e-06, + "loss": 0.565, + "step": 1411 + }, + { + "epoch": 1.5794183445190155, + "grad_norm": 0.4836443364620209, + "learning_rate": 4.2719819754136395e-06, + "loss": 0.5679, + "step": 1412 + }, + { + "epoch": 1.5805369127516777, + "grad_norm": 0.4887832999229431, + "learning_rate": 4.270929165577483e-06, + "loss": 0.5954, + "step": 1413 + }, + { + "epoch": 1.58165548098434, + "grad_norm": 0.5417852401733398, + "learning_rate": 4.26987572497404e-06, + "loss": 0.5747, + "step": 1414 + }, + { + "epoch": 1.5827740492170022, + "grad_norm": 0.4931381642818451, + "learning_rate": 4.268821653978522e-06, + "loss": 0.5964, + "step": 1415 + }, + { + "epoch": 1.5838926174496644, + "grad_norm": 0.48656824231147766, + "learning_rate": 4.267766952966369e-06, + "loss": 0.5919, + "step": 1416 + }, + { + "epoch": 1.5850111856823266, + "grad_norm": 0.5253188610076904, + "learning_rate": 4.266711622313242e-06, + "loss": 0.5917, + "step": 1417 + }, + { + "epoch": 1.5861297539149888, + "grad_norm": 0.5214728713035583, + "learning_rate": 4.2656556623950265e-06, + "loss": 0.5545, + "step": 1418 + }, + { + "epoch": 1.587248322147651, + "grad_norm": 0.4981340169906616, + "learning_rate": 4.264599073587834e-06, + "loss": 0.5737, + "step": 1419 + }, + { + "epoch": 1.5883668903803132, + "grad_norm": 0.49596238136291504, + "learning_rate": 4.263541856267999e-06, + "loss": 0.5841, + "step": 1420 + }, + { + "epoch": 1.5894854586129754, + "grad_norm": 0.47956663370132446, + "learning_rate": 4.262484010812079e-06, + "loss": 0.5566, + "step": 1421 + }, + { + "epoch": 1.5906040268456376, + "grad_norm": 0.4902280569076538, + "learning_rate": 4.261425537596857e-06, + "loss": 0.5907, + "step": 1422 + }, + { + "epoch": 1.5917225950782998, + "grad_norm": 0.5098991990089417, + "learning_rate": 4.260366436999338e-06, + "loss": 0.5997, + "step": 1423 + }, + { + "epoch": 1.592841163310962, + "grad_norm": 0.48146405816078186, + "learning_rate": 4.259306709396751e-06, + "loss": 0.5693, + "step": 1424 + }, + { + "epoch": 1.5939597315436242, + "grad_norm": 0.4839426577091217, + "learning_rate": 4.258246355166548e-06, + "loss": 0.5734, + "step": 1425 + }, + { + "epoch": 1.5950782997762865, + "grad_norm": 0.4717272222042084, + "learning_rate": 4.257185374686405e-06, + "loss": 0.5716, + "step": 1426 + }, + { + "epoch": 1.5961968680089487, + "grad_norm": 0.4848939776420593, + "learning_rate": 4.256123768334223e-06, + "loss": 0.5592, + "step": 1427 + }, + { + "epoch": 1.5973154362416109, + "grad_norm": 0.505172073841095, + "learning_rate": 4.25506153648812e-06, + "loss": 0.5822, + "step": 1428 + }, + { + "epoch": 1.598434004474273, + "grad_norm": 0.5064011216163635, + "learning_rate": 4.253998679526442e-06, + "loss": 0.583, + "step": 1429 + }, + { + "epoch": 1.599552572706935, + "grad_norm": 0.5151503086090088, + "learning_rate": 4.252935197827756e-06, + "loss": 0.6022, + "step": 1430 + }, + { + "epoch": 1.6006711409395973, + "grad_norm": 0.48397284746170044, + "learning_rate": 4.251871091770852e-06, + "loss": 0.5859, + "step": 1431 + }, + { + "epoch": 1.6017897091722595, + "grad_norm": 0.48410117626190186, + "learning_rate": 4.2508063617347415e-06, + "loss": 0.5724, + "step": 1432 + }, + { + "epoch": 1.6029082774049217, + "grad_norm": 0.4976502060890198, + "learning_rate": 4.249741008098658e-06, + "loss": 0.5968, + "step": 1433 + }, + { + "epoch": 1.604026845637584, + "grad_norm": 0.5036817193031311, + "learning_rate": 4.2486750312420585e-06, + "loss": 0.5922, + "step": 1434 + }, + { + "epoch": 1.6051454138702461, + "grad_norm": 0.5002356171607971, + "learning_rate": 4.247608431544622e-06, + "loss": 0.5829, + "step": 1435 + }, + { + "epoch": 1.6062639821029083, + "grad_norm": 0.4937509000301361, + "learning_rate": 4.246541209386247e-06, + "loss": 0.578, + "step": 1436 + }, + { + "epoch": 1.6073825503355703, + "grad_norm": 0.489439457654953, + "learning_rate": 4.245473365147056e-06, + "loss": 0.5793, + "step": 1437 + }, + { + "epoch": 1.6085011185682325, + "grad_norm": 0.48611974716186523, + "learning_rate": 4.244404899207393e-06, + "loss": 0.5952, + "step": 1438 + }, + { + "epoch": 1.6096196868008947, + "grad_norm": 0.48233819007873535, + "learning_rate": 4.2433358119478215e-06, + "loss": 0.586, + "step": 1439 + }, + { + "epoch": 1.610738255033557, + "grad_norm": 0.4946385324001312, + "learning_rate": 4.24226610374913e-06, + "loss": 0.5832, + "step": 1440 + }, + { + "epoch": 1.6118568232662192, + "grad_norm": 0.49822622537612915, + "learning_rate": 4.241195774992323e-06, + "loss": 0.5929, + "step": 1441 + }, + { + "epoch": 1.6129753914988814, + "grad_norm": 0.4824593663215637, + "learning_rate": 4.240124826058631e-06, + "loss": 0.5603, + "step": 1442 + }, + { + "epoch": 1.6140939597315436, + "grad_norm": 0.4935947358608246, + "learning_rate": 4.239053257329502e-06, + "loss": 0.5708, + "step": 1443 + }, + { + "epoch": 1.6152125279642058, + "grad_norm": 0.48947572708129883, + "learning_rate": 4.237981069186606e-06, + "loss": 0.5576, + "step": 1444 + }, + { + "epoch": 1.616331096196868, + "grad_norm": 0.4911848306655884, + "learning_rate": 4.236908262011834e-06, + "loss": 0.6014, + "step": 1445 + }, + { + "epoch": 1.6174496644295302, + "grad_norm": 0.5052220225334167, + "learning_rate": 4.2358348361872975e-06, + "loss": 0.6186, + "step": 1446 + }, + { + "epoch": 1.6185682326621924, + "grad_norm": 0.47427546977996826, + "learning_rate": 4.234760792095327e-06, + "loss": 0.554, + "step": 1447 + }, + { + "epoch": 1.6196868008948546, + "grad_norm": 0.495304673910141, + "learning_rate": 4.2336861301184754e-06, + "loss": 0.6066, + "step": 1448 + }, + { + "epoch": 1.6208053691275168, + "grad_norm": 0.49640989303588867, + "learning_rate": 4.2326108506395125e-06, + "loss": 0.5935, + "step": 1449 + }, + { + "epoch": 1.621923937360179, + "grad_norm": 0.4849659204483032, + "learning_rate": 4.231534954041432e-06, + "loss": 0.5707, + "step": 1450 + }, + { + "epoch": 1.6230425055928412, + "grad_norm": 0.48401832580566406, + "learning_rate": 4.230458440707443e-06, + "loss": 0.6059, + "step": 1451 + }, + { + "epoch": 1.6241610738255035, + "grad_norm": 0.48183032870292664, + "learning_rate": 4.2293813110209795e-06, + "loss": 0.5926, + "step": 1452 + }, + { + "epoch": 1.6252796420581657, + "grad_norm": 0.47466862201690674, + "learning_rate": 4.22830356536569e-06, + "loss": 0.5472, + "step": 1453 + }, + { + "epoch": 1.6263982102908279, + "grad_norm": 0.4778653383255005, + "learning_rate": 4.227225204125447e-06, + "loss": 0.5331, + "step": 1454 + }, + { + "epoch": 1.62751677852349, + "grad_norm": 0.49169987440109253, + "learning_rate": 4.226146227684337e-06, + "loss": 0.5892, + "step": 1455 + }, + { + "epoch": 1.6286353467561523, + "grad_norm": 0.4861750304698944, + "learning_rate": 4.225066636426669e-06, + "loss": 0.5902, + "step": 1456 + }, + { + "epoch": 1.6297539149888143, + "grad_norm": 0.5020110607147217, + "learning_rate": 4.223986430736972e-06, + "loss": 0.5878, + "step": 1457 + }, + { + "epoch": 1.6308724832214765, + "grad_norm": 0.5259606242179871, + "learning_rate": 4.2229056109999915e-06, + "loss": 0.5855, + "step": 1458 + }, + { + "epoch": 1.6319910514541387, + "grad_norm": 0.5020241737365723, + "learning_rate": 4.221824177600692e-06, + "loss": 0.5728, + "step": 1459 + }, + { + "epoch": 1.633109619686801, + "grad_norm": 0.4941030740737915, + "learning_rate": 4.220742130924257e-06, + "loss": 0.5834, + "step": 1460 + }, + { + "epoch": 1.6342281879194631, + "grad_norm": 0.49204495549201965, + "learning_rate": 4.21965947135609e-06, + "loss": 0.6003, + "step": 1461 + }, + { + "epoch": 1.6353467561521253, + "grad_norm": 0.5063990950584412, + "learning_rate": 4.218576199281809e-06, + "loss": 0.5886, + "step": 1462 + }, + { + "epoch": 1.6364653243847873, + "grad_norm": 0.49553683400154114, + "learning_rate": 4.217492315087255e-06, + "loss": 0.5917, + "step": 1463 + }, + { + "epoch": 1.6375838926174495, + "grad_norm": 0.48495718836784363, + "learning_rate": 4.216407819158482e-06, + "loss": 0.5649, + "step": 1464 + }, + { + "epoch": 1.6387024608501117, + "grad_norm": 0.4799686074256897, + "learning_rate": 4.215322711881766e-06, + "loss": 0.5572, + "step": 1465 + }, + { + "epoch": 1.639821029082774, + "grad_norm": 0.5015504956245422, + "learning_rate": 4.2142369936435986e-06, + "loss": 0.5829, + "step": 1466 + }, + { + "epoch": 1.6409395973154361, + "grad_norm": 0.5071560144424438, + "learning_rate": 4.21315066483069e-06, + "loss": 0.6074, + "step": 1467 + }, + { + "epoch": 1.6420581655480984, + "grad_norm": 0.5041237473487854, + "learning_rate": 4.212063725829966e-06, + "loss": 0.5996, + "step": 1468 + }, + { + "epoch": 1.6431767337807606, + "grad_norm": 0.5061616897583008, + "learning_rate": 4.210976177028573e-06, + "loss": 0.6098, + "step": 1469 + }, + { + "epoch": 1.6442953020134228, + "grad_norm": 0.488351047039032, + "learning_rate": 4.209888018813872e-06, + "loss": 0.5902, + "step": 1470 + }, + { + "epoch": 1.645413870246085, + "grad_norm": 0.494767427444458, + "learning_rate": 4.208799251573441e-06, + "loss": 0.5632, + "step": 1471 + }, + { + "epoch": 1.6465324384787472, + "grad_norm": 0.5081776976585388, + "learning_rate": 4.207709875695078e-06, + "loss": 0.6086, + "step": 1472 + }, + { + "epoch": 1.6476510067114094, + "grad_norm": 0.5012889504432678, + "learning_rate": 4.206619891566792e-06, + "loss": 0.5879, + "step": 1473 + }, + { + "epoch": 1.6487695749440716, + "grad_norm": 0.5006359219551086, + "learning_rate": 4.2055292995768145e-06, + "loss": 0.5648, + "step": 1474 + }, + { + "epoch": 1.6498881431767338, + "grad_norm": 0.4885180592536926, + "learning_rate": 4.204438100113592e-06, + "loss": 0.5629, + "step": 1475 + }, + { + "epoch": 1.651006711409396, + "grad_norm": 0.4856667220592499, + "learning_rate": 4.203346293565784e-06, + "loss": 0.598, + "step": 1476 + }, + { + "epoch": 1.6521252796420582, + "grad_norm": 0.484739750623703, + "learning_rate": 4.2022538803222714e-06, + "loss": 0.5862, + "step": 1477 + }, + { + "epoch": 1.6532438478747205, + "grad_norm": 0.4918213188648224, + "learning_rate": 4.2011608607721455e-06, + "loss": 0.5699, + "step": 1478 + }, + { + "epoch": 1.6543624161073827, + "grad_norm": 0.5036900639533997, + "learning_rate": 4.200067235304719e-06, + "loss": 0.6143, + "step": 1479 + }, + { + "epoch": 1.6554809843400449, + "grad_norm": 0.4958629310131073, + "learning_rate": 4.1989730043095175e-06, + "loss": 0.5423, + "step": 1480 + }, + { + "epoch": 1.656599552572707, + "grad_norm": 0.49514490365982056, + "learning_rate": 4.1978781681762825e-06, + "loss": 0.6092, + "step": 1481 + }, + { + "epoch": 1.6577181208053693, + "grad_norm": 0.5146138668060303, + "learning_rate": 4.1967827272949715e-06, + "loss": 0.5907, + "step": 1482 + }, + { + "epoch": 1.6588366890380313, + "grad_norm": 0.5063082575798035, + "learning_rate": 4.195686682055758e-06, + "loss": 0.5779, + "step": 1483 + }, + { + "epoch": 1.6599552572706935, + "grad_norm": 0.48410534858703613, + "learning_rate": 4.194590032849028e-06, + "loss": 0.5864, + "step": 1484 + }, + { + "epoch": 1.6610738255033557, + "grad_norm": 0.5001199841499329, + "learning_rate": 4.193492780065386e-06, + "loss": 0.5479, + "step": 1485 + }, + { + "epoch": 1.662192393736018, + "grad_norm": 0.49552416801452637, + "learning_rate": 4.19239492409565e-06, + "loss": 0.5646, + "step": 1486 + }, + { + "epoch": 1.6633109619686801, + "grad_norm": 0.4871010482311249, + "learning_rate": 4.191296465330853e-06, + "loss": 0.5601, + "step": 1487 + }, + { + "epoch": 1.6644295302013423, + "grad_norm": 0.5030876398086548, + "learning_rate": 4.190197404162242e-06, + "loss": 0.5892, + "step": 1488 + }, + { + "epoch": 1.6655480984340043, + "grad_norm": 0.4873735010623932, + "learning_rate": 4.18909774098128e-06, + "loss": 0.5981, + "step": 1489 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5151247978210449, + "learning_rate": 4.187997476179643e-06, + "loss": 0.5813, + "step": 1490 + }, + { + "epoch": 1.6677852348993287, + "grad_norm": 0.4993012249469757, + "learning_rate": 4.1868966101492225e-06, + "loss": 0.5718, + "step": 1491 + }, + { + "epoch": 1.668903803131991, + "grad_norm": 0.48291707038879395, + "learning_rate": 4.1857951432821235e-06, + "loss": 0.566, + "step": 1492 + }, + { + "epoch": 1.6700223713646531, + "grad_norm": 0.49533194303512573, + "learning_rate": 4.184693075970665e-06, + "loss": 0.5956, + "step": 1493 + }, + { + "epoch": 1.6711409395973154, + "grad_norm": 0.5048168897628784, + "learning_rate": 4.183590408607379e-06, + "loss": 0.5735, + "step": 1494 + }, + { + "epoch": 1.6722595078299776, + "grad_norm": 0.4690997004508972, + "learning_rate": 4.1824871415850135e-06, + "loss": 0.5551, + "step": 1495 + }, + { + "epoch": 1.6733780760626398, + "grad_norm": 0.4947203993797302, + "learning_rate": 4.1813832752965275e-06, + "loss": 0.5877, + "step": 1496 + }, + { + "epoch": 1.674496644295302, + "grad_norm": 0.508201539516449, + "learning_rate": 4.180278810135096e-06, + "loss": 0.6059, + "step": 1497 + }, + { + "epoch": 1.6756152125279642, + "grad_norm": 0.4906938672065735, + "learning_rate": 4.179173746494105e-06, + "loss": 0.575, + "step": 1498 + }, + { + "epoch": 1.6767337807606264, + "grad_norm": 0.48704689741134644, + "learning_rate": 4.178068084767155e-06, + "loss": 0.5701, + "step": 1499 + }, + { + "epoch": 1.6778523489932886, + "grad_norm": 0.5050029754638672, + "learning_rate": 4.176961825348059e-06, + "loss": 0.5727, + "step": 1500 + }, + { + "epoch": 1.6789709172259508, + "grad_norm": 0.4842473566532135, + "learning_rate": 4.175854968630843e-06, + "loss": 0.5682, + "step": 1501 + }, + { + "epoch": 1.680089485458613, + "grad_norm": 0.5032848119735718, + "learning_rate": 4.1747475150097465e-06, + "loss": 0.5873, + "step": 1502 + }, + { + "epoch": 1.6812080536912752, + "grad_norm": 0.497781902551651, + "learning_rate": 4.17363946487922e-06, + "loss": 0.5914, + "step": 1503 + }, + { + "epoch": 1.6823266219239374, + "grad_norm": 0.5035775303840637, + "learning_rate": 4.172530818633929e-06, + "loss": 0.5519, + "step": 1504 + }, + { + "epoch": 1.6834451901565997, + "grad_norm": 0.47141513228416443, + "learning_rate": 4.171421576668747e-06, + "loss": 0.5913, + "step": 1505 + }, + { + "epoch": 1.6845637583892619, + "grad_norm": 0.49259573221206665, + "learning_rate": 4.170311739378765e-06, + "loss": 0.5769, + "step": 1506 + }, + { + "epoch": 1.685682326621924, + "grad_norm": 0.5033203363418579, + "learning_rate": 4.169201307159282e-06, + "loss": 0.5851, + "step": 1507 + }, + { + "epoch": 1.6868008948545863, + "grad_norm": 0.48197343945503235, + "learning_rate": 4.1680902804058095e-06, + "loss": 0.5502, + "step": 1508 + }, + { + "epoch": 1.6879194630872483, + "grad_norm": 0.5090881586074829, + "learning_rate": 4.166978659514075e-06, + "loss": 0.5684, + "step": 1509 + }, + { + "epoch": 1.6890380313199105, + "grad_norm": 0.4990635812282562, + "learning_rate": 4.1658664448800105e-06, + "loss": 0.5961, + "step": 1510 + }, + { + "epoch": 1.6901565995525727, + "grad_norm": 0.522132396697998, + "learning_rate": 4.164753636899765e-06, + "loss": 0.5941, + "step": 1511 + }, + { + "epoch": 1.691275167785235, + "grad_norm": 0.4907796382904053, + "learning_rate": 4.163640235969696e-06, + "loss": 0.5514, + "step": 1512 + }, + { + "epoch": 1.692393736017897, + "grad_norm": 0.49310702085494995, + "learning_rate": 4.1625262424863744e-06, + "loss": 0.5723, + "step": 1513 + }, + { + "epoch": 1.6935123042505593, + "grad_norm": 0.5064623951911926, + "learning_rate": 4.161411656846581e-06, + "loss": 0.5847, + "step": 1514 + }, + { + "epoch": 1.6946308724832215, + "grad_norm": 0.47868281602859497, + "learning_rate": 4.1602964794473065e-06, + "loss": 0.5757, + "step": 1515 + }, + { + "epoch": 1.6957494407158835, + "grad_norm": 0.5030828714370728, + "learning_rate": 4.159180710685753e-06, + "loss": 0.6001, + "step": 1516 + }, + { + "epoch": 1.6968680089485457, + "grad_norm": 0.4966081976890564, + "learning_rate": 4.158064350959336e-06, + "loss": 0.5643, + "step": 1517 + }, + { + "epoch": 1.697986577181208, + "grad_norm": 0.4922417104244232, + "learning_rate": 4.156947400665677e-06, + "loss": 0.5866, + "step": 1518 + }, + { + "epoch": 1.6991051454138701, + "grad_norm": 0.4993835985660553, + "learning_rate": 4.15582986020261e-06, + "loss": 0.5784, + "step": 1519 + }, + { + "epoch": 1.7002237136465324, + "grad_norm": 0.4877612292766571, + "learning_rate": 4.15471172996818e-06, + "loss": 0.5736, + "step": 1520 + }, + { + "epoch": 1.7013422818791946, + "grad_norm": 0.5129001140594482, + "learning_rate": 4.1535930103606406e-06, + "loss": 0.5798, + "step": 1521 + }, + { + "epoch": 1.7024608501118568, + "grad_norm": 0.48489999771118164, + "learning_rate": 4.152473701778455e-06, + "loss": 0.556, + "step": 1522 + }, + { + "epoch": 1.703579418344519, + "grad_norm": 0.4901261627674103, + "learning_rate": 4.1513538046202995e-06, + "loss": 0.578, + "step": 1523 + }, + { + "epoch": 1.7046979865771812, + "grad_norm": 0.49826791882514954, + "learning_rate": 4.150233319285055e-06, + "loss": 0.5671, + "step": 1524 + }, + { + "epoch": 1.7058165548098434, + "grad_norm": 0.49227479100227356, + "learning_rate": 4.149112246171817e-06, + "loss": 0.5561, + "step": 1525 + }, + { + "epoch": 1.7069351230425056, + "grad_norm": 0.49544984102249146, + "learning_rate": 4.147990585679886e-06, + "loss": 0.5682, + "step": 1526 + }, + { + "epoch": 1.7080536912751678, + "grad_norm": 0.49339771270751953, + "learning_rate": 4.146868338208775e-06, + "loss": 0.5607, + "step": 1527 + }, + { + "epoch": 1.70917225950783, + "grad_norm": 0.506488561630249, + "learning_rate": 4.1457455041582044e-06, + "loss": 0.5778, + "step": 1528 + }, + { + "epoch": 1.7102908277404922, + "grad_norm": 0.5283234715461731, + "learning_rate": 4.144622083928102e-06, + "loss": 0.5777, + "step": 1529 + }, + { + "epoch": 1.7114093959731544, + "grad_norm": 0.5093042254447937, + "learning_rate": 4.14349807791861e-06, + "loss": 0.5832, + "step": 1530 + }, + { + "epoch": 1.7125279642058167, + "grad_norm": 0.49732184410095215, + "learning_rate": 4.142373486530071e-06, + "loss": 0.5965, + "step": 1531 + }, + { + "epoch": 1.7136465324384789, + "grad_norm": 0.5034535527229309, + "learning_rate": 4.141248310163042e-06, + "loss": 0.5896, + "step": 1532 + }, + { + "epoch": 1.714765100671141, + "grad_norm": 0.484789103269577, + "learning_rate": 4.140122549218289e-06, + "loss": 0.5636, + "step": 1533 + }, + { + "epoch": 1.7158836689038033, + "grad_norm": 0.49596208333969116, + "learning_rate": 4.138996204096781e-06, + "loss": 0.5794, + "step": 1534 + }, + { + "epoch": 1.7170022371364653, + "grad_norm": 0.4908808767795563, + "learning_rate": 4.137869275199701e-06, + "loss": 0.5846, + "step": 1535 + }, + { + "epoch": 1.7181208053691275, + "grad_norm": 0.4921574890613556, + "learning_rate": 4.1367417629284356e-06, + "loss": 0.5834, + "step": 1536 + }, + { + "epoch": 1.7192393736017897, + "grad_norm": 0.48451635241508484, + "learning_rate": 4.13561366768458e-06, + "loss": 0.5624, + "step": 1537 + }, + { + "epoch": 1.720357941834452, + "grad_norm": 0.5193755626678467, + "learning_rate": 4.134484989869939e-06, + "loss": 0.5908, + "step": 1538 + }, + { + "epoch": 1.721476510067114, + "grad_norm": 0.497798889875412, + "learning_rate": 4.133355729886523e-06, + "loss": 0.5668, + "step": 1539 + }, + { + "epoch": 1.7225950782997763, + "grad_norm": 0.5026369690895081, + "learning_rate": 4.132225888136552e-06, + "loss": 0.5932, + "step": 1540 + }, + { + "epoch": 1.7237136465324385, + "grad_norm": 0.5131746530532837, + "learning_rate": 4.131095465022449e-06, + "loss": 0.5965, + "step": 1541 + }, + { + "epoch": 1.7248322147651005, + "grad_norm": 0.4941963851451874, + "learning_rate": 4.129964460946847e-06, + "loss": 0.5797, + "step": 1542 + }, + { + "epoch": 1.7259507829977627, + "grad_norm": 0.5222018957138062, + "learning_rate": 4.128832876312587e-06, + "loss": 0.5948, + "step": 1543 + }, + { + "epoch": 1.727069351230425, + "grad_norm": 0.5069666504859924, + "learning_rate": 4.127700711522715e-06, + "loss": 0.5855, + "step": 1544 + }, + { + "epoch": 1.7281879194630871, + "grad_norm": 0.5024001002311707, + "learning_rate": 4.126567966980484e-06, + "loss": 0.6114, + "step": 1545 + }, + { + "epoch": 1.7293064876957494, + "grad_norm": 0.49587997794151306, + "learning_rate": 4.125434643089353e-06, + "loss": 0.568, + "step": 1546 + }, + { + "epoch": 1.7304250559284116, + "grad_norm": 0.48950231075286865, + "learning_rate": 4.124300740252989e-06, + "loss": 0.5772, + "step": 1547 + }, + { + "epoch": 1.7315436241610738, + "grad_norm": 0.5003415942192078, + "learning_rate": 4.123166258875262e-06, + "loss": 0.5942, + "step": 1548 + }, + { + "epoch": 1.732662192393736, + "grad_norm": 0.5020825266838074, + "learning_rate": 4.1220311993602515e-06, + "loss": 0.5826, + "step": 1549 + }, + { + "epoch": 1.7337807606263982, + "grad_norm": 0.5122042894363403, + "learning_rate": 4.120895562112242e-06, + "loss": 0.6112, + "step": 1550 + }, + { + "epoch": 1.7348993288590604, + "grad_norm": 0.5022247433662415, + "learning_rate": 4.119759347535722e-06, + "loss": 0.556, + "step": 1551 + }, + { + "epoch": 1.7360178970917226, + "grad_norm": 0.5053969025611877, + "learning_rate": 4.118622556035387e-06, + "loss": 0.6139, + "step": 1552 + }, + { + "epoch": 1.7371364653243848, + "grad_norm": 0.5022400617599487, + "learning_rate": 4.11748518801614e-06, + "loss": 0.5976, + "step": 1553 + }, + { + "epoch": 1.738255033557047, + "grad_norm": 0.49503523111343384, + "learning_rate": 4.116347243883086e-06, + "loss": 0.5742, + "step": 1554 + }, + { + "epoch": 1.7393736017897092, + "grad_norm": 0.5141612887382507, + "learning_rate": 4.115208724041536e-06, + "loss": 0.599, + "step": 1555 + }, + { + "epoch": 1.7404921700223714, + "grad_norm": 0.484074205160141, + "learning_rate": 4.114069628897006e-06, + "loss": 0.5675, + "step": 1556 + }, + { + "epoch": 1.7416107382550337, + "grad_norm": 0.5334796905517578, + "learning_rate": 4.11292995885522e-06, + "loss": 0.6095, + "step": 1557 + }, + { + "epoch": 1.7427293064876959, + "grad_norm": 0.48460689187049866, + "learning_rate": 4.111789714322101e-06, + "loss": 0.5477, + "step": 1558 + }, + { + "epoch": 1.743847874720358, + "grad_norm": 0.5085021257400513, + "learning_rate": 4.110648895703782e-06, + "loss": 0.5941, + "step": 1559 + }, + { + "epoch": 1.7449664429530203, + "grad_norm": 0.5090224146842957, + "learning_rate": 4.109507503406599e-06, + "loss": 0.5753, + "step": 1560 + }, + { + "epoch": 1.7460850111856825, + "grad_norm": 0.5055748224258423, + "learning_rate": 4.108365537837088e-06, + "loss": 0.5914, + "step": 1561 + }, + { + "epoch": 1.7472035794183445, + "grad_norm": 0.48406150937080383, + "learning_rate": 4.107222999401997e-06, + "loss": 0.5805, + "step": 1562 + }, + { + "epoch": 1.7483221476510067, + "grad_norm": 0.48651930689811707, + "learning_rate": 4.106079888508272e-06, + "loss": 0.5848, + "step": 1563 + }, + { + "epoch": 1.749440715883669, + "grad_norm": 0.4852994978427887, + "learning_rate": 4.104936205563064e-06, + "loss": 0.5796, + "step": 1564 + }, + { + "epoch": 1.750559284116331, + "grad_norm": 0.4920569658279419, + "learning_rate": 4.10379195097373e-06, + "loss": 0.5981, + "step": 1565 + }, + { + "epoch": 1.7516778523489933, + "grad_norm": 0.5077376961708069, + "learning_rate": 4.1026471251478285e-06, + "loss": 0.6071, + "step": 1566 + }, + { + "epoch": 1.7527964205816555, + "grad_norm": 0.4932427406311035, + "learning_rate": 4.101501728493121e-06, + "loss": 0.5917, + "step": 1567 + }, + { + "epoch": 1.7539149888143175, + "grad_norm": 0.49199509620666504, + "learning_rate": 4.100355761417577e-06, + "loss": 0.5814, + "step": 1568 + }, + { + "epoch": 1.7550335570469797, + "grad_norm": 0.4920978546142578, + "learning_rate": 4.099209224329361e-06, + "loss": 0.5931, + "step": 1569 + }, + { + "epoch": 1.756152125279642, + "grad_norm": 0.5020889043807983, + "learning_rate": 4.098062117636849e-06, + "loss": 0.5669, + "step": 1570 + }, + { + "epoch": 1.7572706935123041, + "grad_norm": 0.5099507570266724, + "learning_rate": 4.096914441748613e-06, + "loss": 0.5758, + "step": 1571 + }, + { + "epoch": 1.7583892617449663, + "grad_norm": 0.49736326932907104, + "learning_rate": 4.095766197073433e-06, + "loss": 0.5624, + "step": 1572 + }, + { + "epoch": 1.7595078299776286, + "grad_norm": 0.494096577167511, + "learning_rate": 4.094617384020287e-06, + "loss": 0.5902, + "step": 1573 + }, + { + "epoch": 1.7606263982102908, + "grad_norm": 0.5103378295898438, + "learning_rate": 4.09346800299836e-06, + "loss": 0.5903, + "step": 1574 + }, + { + "epoch": 1.761744966442953, + "grad_norm": 0.49033430218696594, + "learning_rate": 4.092318054417036e-06, + "loss": 0.5703, + "step": 1575 + }, + { + "epoch": 1.7628635346756152, + "grad_norm": 0.5065776705741882, + "learning_rate": 4.0911675386859015e-06, + "loss": 0.5648, + "step": 1576 + }, + { + "epoch": 1.7639821029082774, + "grad_norm": 0.5001108050346375, + "learning_rate": 4.0900164562147485e-06, + "loss": 0.5857, + "step": 1577 + }, + { + "epoch": 1.7651006711409396, + "grad_norm": 0.5082598924636841, + "learning_rate": 4.0888648074135645e-06, + "loss": 0.5488, + "step": 1578 + }, + { + "epoch": 1.7662192393736018, + "grad_norm": 0.4972035586833954, + "learning_rate": 4.087712592692544e-06, + "loss": 0.594, + "step": 1579 + }, + { + "epoch": 1.767337807606264, + "grad_norm": 0.4995720088481903, + "learning_rate": 4.086559812462082e-06, + "loss": 0.5974, + "step": 1580 + }, + { + "epoch": 1.7684563758389262, + "grad_norm": 0.5213293433189392, + "learning_rate": 4.085406467132774e-06, + "loss": 0.5956, + "step": 1581 + }, + { + "epoch": 1.7695749440715884, + "grad_norm": 0.4914456903934479, + "learning_rate": 4.0842525571154165e-06, + "loss": 0.5804, + "step": 1582 + }, + { + "epoch": 1.7706935123042506, + "grad_norm": 0.4833807945251465, + "learning_rate": 4.083098082821007e-06, + "loss": 0.5468, + "step": 1583 + }, + { + "epoch": 1.7718120805369129, + "grad_norm": 0.4896129071712494, + "learning_rate": 4.081943044660746e-06, + "loss": 0.5949, + "step": 1584 + }, + { + "epoch": 1.772930648769575, + "grad_norm": 0.4938727021217346, + "learning_rate": 4.080787443046034e-06, + "loss": 0.576, + "step": 1585 + }, + { + "epoch": 1.7740492170022373, + "grad_norm": 0.49900153279304504, + "learning_rate": 4.07963127838847e-06, + "loss": 0.5788, + "step": 1586 + }, + { + "epoch": 1.7751677852348995, + "grad_norm": 0.5096091628074646, + "learning_rate": 4.0784745510998556e-06, + "loss": 0.6012, + "step": 1587 + }, + { + "epoch": 1.7762863534675615, + "grad_norm": 0.4790339469909668, + "learning_rate": 4.077317261592194e-06, + "loss": 0.572, + "step": 1588 + }, + { + "epoch": 1.7774049217002237, + "grad_norm": 0.49272850155830383, + "learning_rate": 4.076159410277685e-06, + "loss": 0.559, + "step": 1589 + }, + { + "epoch": 1.778523489932886, + "grad_norm": 0.46964558959007263, + "learning_rate": 4.075000997568732e-06, + "loss": 0.5498, + "step": 1590 + }, + { + "epoch": 1.779642058165548, + "grad_norm": 0.4938308298587799, + "learning_rate": 4.0738420238779365e-06, + "loss": 0.5915, + "step": 1591 + }, + { + "epoch": 1.7807606263982103, + "grad_norm": 0.5084967017173767, + "learning_rate": 4.072682489618101e-06, + "loss": 0.5918, + "step": 1592 + }, + { + "epoch": 1.7818791946308725, + "grad_norm": 0.49874448776245117, + "learning_rate": 4.071522395202226e-06, + "loss": 0.5622, + "step": 1593 + }, + { + "epoch": 1.7829977628635347, + "grad_norm": 0.49470290541648865, + "learning_rate": 4.070361741043511e-06, + "loss": 0.5466, + "step": 1594 + }, + { + "epoch": 1.7841163310961967, + "grad_norm": 0.5095946192741394, + "learning_rate": 4.0692005275553595e-06, + "loss": 0.5731, + "step": 1595 + }, + { + "epoch": 1.785234899328859, + "grad_norm": 0.5301234722137451, + "learning_rate": 4.06803875515137e-06, + "loss": 0.5867, + "step": 1596 + }, + { + "epoch": 1.7863534675615211, + "grad_norm": 0.514214813709259, + "learning_rate": 4.06687642424534e-06, + "loss": 0.6005, + "step": 1597 + }, + { + "epoch": 1.7874720357941833, + "grad_norm": 0.5104812383651733, + "learning_rate": 4.065713535251268e-06, + "loss": 0.5935, + "step": 1598 + }, + { + "epoch": 1.7885906040268456, + "grad_norm": 0.5091798901557922, + "learning_rate": 4.064550088583349e-06, + "loss": 0.5944, + "step": 1599 + }, + { + "epoch": 1.7897091722595078, + "grad_norm": 0.4886806607246399, + "learning_rate": 4.0633860846559794e-06, + "loss": 0.551, + "step": 1600 + }, + { + "epoch": 1.79082774049217, + "grad_norm": 0.49812769889831543, + "learning_rate": 4.062221523883753e-06, + "loss": 0.5491, + "step": 1601 + }, + { + "epoch": 1.7919463087248322, + "grad_norm": 0.5070154070854187, + "learning_rate": 4.06105640668146e-06, + "loss": 0.5942, + "step": 1602 + }, + { + "epoch": 1.7930648769574944, + "grad_norm": 0.5108977556228638, + "learning_rate": 4.059890733464091e-06, + "loss": 0.5943, + "step": 1603 + }, + { + "epoch": 1.7941834451901566, + "grad_norm": 0.5012465119361877, + "learning_rate": 4.058724504646834e-06, + "loss": 0.5642, + "step": 1604 + }, + { + "epoch": 1.7953020134228188, + "grad_norm": 0.49801158905029297, + "learning_rate": 4.057557720645076e-06, + "loss": 0.5709, + "step": 1605 + }, + { + "epoch": 1.796420581655481, + "grad_norm": 0.4962095320224762, + "learning_rate": 4.0563903818743984e-06, + "loss": 0.5965, + "step": 1606 + }, + { + "epoch": 1.7975391498881432, + "grad_norm": 0.4680939018726349, + "learning_rate": 4.055222488750584e-06, + "loss": 0.5476, + "step": 1607 + }, + { + "epoch": 1.7986577181208054, + "grad_norm": 0.48658618330955505, + "learning_rate": 4.054054041689612e-06, + "loss": 0.5959, + "step": 1608 + }, + { + "epoch": 1.7997762863534676, + "grad_norm": 0.4988051652908325, + "learning_rate": 4.052885041107656e-06, + "loss": 0.5821, + "step": 1609 + }, + { + "epoch": 1.8008948545861299, + "grad_norm": 0.49107739329338074, + "learning_rate": 4.051715487421092e-06, + "loss": 0.6017, + "step": 1610 + }, + { + "epoch": 1.802013422818792, + "grad_norm": 0.5018040537834167, + "learning_rate": 4.050545381046488e-06, + "loss": 0.5873, + "step": 1611 + }, + { + "epoch": 1.8031319910514543, + "grad_norm": 0.49129587411880493, + "learning_rate": 4.049374722400613e-06, + "loss": 0.5474, + "step": 1612 + }, + { + "epoch": 1.8042505592841165, + "grad_norm": 0.49445709586143494, + "learning_rate": 4.048203511900428e-06, + "loss": 0.5787, + "step": 1613 + }, + { + "epoch": 1.8053691275167785, + "grad_norm": 0.4987739026546478, + "learning_rate": 4.047031749963095e-06, + "loss": 0.5673, + "step": 1614 + }, + { + "epoch": 1.8064876957494407, + "grad_norm": 0.4809098243713379, + "learning_rate": 4.045859437005971e-06, + "loss": 0.5701, + "step": 1615 + }, + { + "epoch": 1.807606263982103, + "grad_norm": 0.5122459530830383, + "learning_rate": 4.044686573446608e-06, + "loss": 0.5996, + "step": 1616 + }, + { + "epoch": 1.808724832214765, + "grad_norm": 0.49991941452026367, + "learning_rate": 4.0435131597027564e-06, + "loss": 0.5981, + "step": 1617 + }, + { + "epoch": 1.8098434004474273, + "grad_norm": 0.4867013990879059, + "learning_rate": 4.042339196192358e-06, + "loss": 0.5762, + "step": 1618 + }, + { + "epoch": 1.8109619686800895, + "grad_norm": 0.4812503159046173, + "learning_rate": 4.041164683333558e-06, + "loss": 0.5484, + "step": 1619 + }, + { + "epoch": 1.8120805369127517, + "grad_norm": 0.4890724718570709, + "learning_rate": 4.03998962154469e-06, + "loss": 0.6001, + "step": 1620 + }, + { + "epoch": 1.8131991051454137, + "grad_norm": 0.5026843547821045, + "learning_rate": 4.038814011244286e-06, + "loss": 0.5838, + "step": 1621 + }, + { + "epoch": 1.814317673378076, + "grad_norm": 0.49154821038246155, + "learning_rate": 4.037637852851075e-06, + "loss": 0.5607, + "step": 1622 + }, + { + "epoch": 1.8154362416107381, + "grad_norm": 0.4974023699760437, + "learning_rate": 4.036461146783979e-06, + "loss": 0.5528, + "step": 1623 + }, + { + "epoch": 1.8165548098434003, + "grad_norm": 0.5217268466949463, + "learning_rate": 4.035283893462114e-06, + "loss": 0.5978, + "step": 1624 + }, + { + "epoch": 1.8176733780760626, + "grad_norm": 0.5096696019172668, + "learning_rate": 4.034106093304795e-06, + "loss": 0.6219, + "step": 1625 + }, + { + "epoch": 1.8187919463087248, + "grad_norm": 0.49499067664146423, + "learning_rate": 4.032927746731528e-06, + "loss": 0.6066, + "step": 1626 + }, + { + "epoch": 1.819910514541387, + "grad_norm": 0.48196083307266235, + "learning_rate": 4.031748854162014e-06, + "loss": 0.562, + "step": 1627 + }, + { + "epoch": 1.8210290827740492, + "grad_norm": 0.5044178366661072, + "learning_rate": 4.030569416016152e-06, + "loss": 0.6103, + "step": 1628 + }, + { + "epoch": 1.8221476510067114, + "grad_norm": 0.48558661341667175, + "learning_rate": 4.0293894327140315e-06, + "loss": 0.5558, + "step": 1629 + }, + { + "epoch": 1.8232662192393736, + "grad_norm": 0.492214173078537, + "learning_rate": 4.0282089046759365e-06, + "loss": 0.5665, + "step": 1630 + }, + { + "epoch": 1.8243847874720358, + "grad_norm": 0.4871746599674225, + "learning_rate": 4.027027832322348e-06, + "loss": 0.5497, + "step": 1631 + }, + { + "epoch": 1.825503355704698, + "grad_norm": 0.4865874648094177, + "learning_rate": 4.025846216073938e-06, + "loss": 0.5558, + "step": 1632 + }, + { + "epoch": 1.8266219239373602, + "grad_norm": 0.4919774532318115, + "learning_rate": 4.024664056351572e-06, + "loss": 0.5804, + "step": 1633 + }, + { + "epoch": 1.8277404921700224, + "grad_norm": 0.49305927753448486, + "learning_rate": 4.02348135357631e-06, + "loss": 0.5869, + "step": 1634 + }, + { + "epoch": 1.8288590604026846, + "grad_norm": 0.523074209690094, + "learning_rate": 4.022298108169408e-06, + "loss": 0.5828, + "step": 1635 + }, + { + "epoch": 1.8299776286353469, + "grad_norm": 0.4927634298801422, + "learning_rate": 4.021114320552311e-06, + "loss": 0.5614, + "step": 1636 + }, + { + "epoch": 1.831096196868009, + "grad_norm": 0.49153605103492737, + "learning_rate": 4.019929991146659e-06, + "loss": 0.5589, + "step": 1637 + }, + { + "epoch": 1.8322147651006713, + "grad_norm": 0.5093767046928406, + "learning_rate": 4.018745120374286e-06, + "loss": 0.5923, + "step": 1638 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.5137583017349243, + "learning_rate": 4.017559708657216e-06, + "loss": 0.6154, + "step": 1639 + }, + { + "epoch": 1.8344519015659957, + "grad_norm": 0.5136058926582336, + "learning_rate": 4.016373756417669e-06, + "loss": 0.5634, + "step": 1640 + }, + { + "epoch": 1.8355704697986577, + "grad_norm": 0.4985203146934509, + "learning_rate": 4.0151872640780554e-06, + "loss": 0.5906, + "step": 1641 + }, + { + "epoch": 1.8366890380313199, + "grad_norm": 0.4897896349430084, + "learning_rate": 4.014000232060978e-06, + "loss": 0.5707, + "step": 1642 + }, + { + "epoch": 1.837807606263982, + "grad_norm": 0.49876436591148376, + "learning_rate": 4.012812660789233e-06, + "loss": 0.5998, + "step": 1643 + }, + { + "epoch": 1.8389261744966443, + "grad_norm": 0.5061901807785034, + "learning_rate": 4.011624550685808e-06, + "loss": 0.602, + "step": 1644 + }, + { + "epoch": 1.8400447427293065, + "grad_norm": 0.505077600479126, + "learning_rate": 4.010435902173883e-06, + "loss": 0.5753, + "step": 1645 + }, + { + "epoch": 1.8411633109619687, + "grad_norm": 0.4808630049228668, + "learning_rate": 4.0092467156768274e-06, + "loss": 0.5737, + "step": 1646 + }, + { + "epoch": 1.8422818791946307, + "grad_norm": 0.5141550898551941, + "learning_rate": 4.008056991618206e-06, + "loss": 0.5963, + "step": 1647 + }, + { + "epoch": 1.843400447427293, + "grad_norm": 0.4977302551269531, + "learning_rate": 4.006866730421773e-06, + "loss": 0.5772, + "step": 1648 + }, + { + "epoch": 1.8445190156599551, + "grad_norm": 0.48998597264289856, + "learning_rate": 4.005675932511474e-06, + "loss": 0.5539, + "step": 1649 + }, + { + "epoch": 1.8456375838926173, + "grad_norm": 0.5025562644004822, + "learning_rate": 4.004484598311445e-06, + "loss": 0.5811, + "step": 1650 + }, + { + "epoch": 1.8467561521252795, + "grad_norm": 0.49174991250038147, + "learning_rate": 4.003292728246015e-06, + "loss": 0.5752, + "step": 1651 + }, + { + "epoch": 1.8478747203579418, + "grad_norm": 0.49403971433639526, + "learning_rate": 4.0021003227397015e-06, + "loss": 0.6112, + "step": 1652 + }, + { + "epoch": 1.848993288590604, + "grad_norm": 0.5124534368515015, + "learning_rate": 4.000907382217215e-06, + "loss": 0.5871, + "step": 1653 + }, + { + "epoch": 1.8501118568232662, + "grad_norm": 0.5068091750144958, + "learning_rate": 3.9997139071034555e-06, + "loss": 0.6084, + "step": 1654 + }, + { + "epoch": 1.8512304250559284, + "grad_norm": 0.5051630139350891, + "learning_rate": 3.998519897823514e-06, + "loss": 0.5964, + "step": 1655 + }, + { + "epoch": 1.8523489932885906, + "grad_norm": 0.49604538083076477, + "learning_rate": 3.997325354802669e-06, + "loss": 0.5655, + "step": 1656 + }, + { + "epoch": 1.8534675615212528, + "grad_norm": 0.5094951391220093, + "learning_rate": 3.996130278466394e-06, + "loss": 0.5824, + "step": 1657 + }, + { + "epoch": 1.854586129753915, + "grad_norm": 0.4972209930419922, + "learning_rate": 3.994934669240347e-06, + "loss": 0.5644, + "step": 1658 + }, + { + "epoch": 1.8557046979865772, + "grad_norm": 0.49230918288230896, + "learning_rate": 3.993738527550382e-06, + "loss": 0.5801, + "step": 1659 + }, + { + "epoch": 1.8568232662192394, + "grad_norm": 0.5058436989784241, + "learning_rate": 3.9925418538225355e-06, + "loss": 0.5917, + "step": 1660 + }, + { + "epoch": 1.8579418344519016, + "grad_norm": 0.5173357129096985, + "learning_rate": 3.9913446484830396e-06, + "loss": 0.5809, + "step": 1661 + }, + { + "epoch": 1.8590604026845639, + "grad_norm": 0.48803800344467163, + "learning_rate": 3.9901469119583125e-06, + "loss": 0.5721, + "step": 1662 + }, + { + "epoch": 1.860178970917226, + "grad_norm": 0.5094105005264282, + "learning_rate": 3.988948644674962e-06, + "loss": 0.5742, + "step": 1663 + }, + { + "epoch": 1.8612975391498883, + "grad_norm": 0.49270302057266235, + "learning_rate": 3.987749847059788e-06, + "loss": 0.5864, + "step": 1664 + }, + { + "epoch": 1.8624161073825505, + "grad_norm": 0.49852290749549866, + "learning_rate": 3.986550519539773e-06, + "loss": 0.5875, + "step": 1665 + }, + { + "epoch": 1.8635346756152127, + "grad_norm": 0.4975663423538208, + "learning_rate": 3.985350662542096e-06, + "loss": 0.5649, + "step": 1666 + }, + { + "epoch": 1.8646532438478747, + "grad_norm": 0.5100948214530945, + "learning_rate": 3.984150276494116e-06, + "loss": 0.564, + "step": 1667 + }, + { + "epoch": 1.8657718120805369, + "grad_norm": 0.5079052448272705, + "learning_rate": 3.982949361823388e-06, + "loss": 0.5662, + "step": 1668 + }, + { + "epoch": 1.866890380313199, + "grad_norm": 0.49659186601638794, + "learning_rate": 3.981747918957653e-06, + "loss": 0.55, + "step": 1669 + }, + { + "epoch": 1.8680089485458613, + "grad_norm": 0.48802655935287476, + "learning_rate": 3.980545948324835e-06, + "loss": 0.5676, + "step": 1670 + }, + { + "epoch": 1.8691275167785235, + "grad_norm": 0.4884412884712219, + "learning_rate": 3.979343450353056e-06, + "loss": 0.5771, + "step": 1671 + }, + { + "epoch": 1.8702460850111857, + "grad_norm": 0.49414676427841187, + "learning_rate": 3.978140425470615e-06, + "loss": 0.5606, + "step": 1672 + }, + { + "epoch": 1.8713646532438477, + "grad_norm": 0.4894448220729828, + "learning_rate": 3.976936874106008e-06, + "loss": 0.5428, + "step": 1673 + }, + { + "epoch": 1.87248322147651, + "grad_norm": 0.48966702818870544, + "learning_rate": 3.9757327966879134e-06, + "loss": 0.5601, + "step": 1674 + }, + { + "epoch": 1.8736017897091721, + "grad_norm": 0.513053297996521, + "learning_rate": 3.974528193645196e-06, + "loss": 0.5993, + "step": 1675 + }, + { + "epoch": 1.8747203579418343, + "grad_norm": 0.5094168782234192, + "learning_rate": 3.973323065406911e-06, + "loss": 0.59, + "step": 1676 + }, + { + "epoch": 1.8758389261744965, + "grad_norm": 0.513595461845398, + "learning_rate": 3.9721174124023e-06, + "loss": 0.5842, + "step": 1677 + }, + { + "epoch": 1.8769574944071588, + "grad_norm": 0.5007264614105225, + "learning_rate": 3.9709112350607904e-06, + "loss": 0.5798, + "step": 1678 + }, + { + "epoch": 1.878076062639821, + "grad_norm": 0.48793676495552063, + "learning_rate": 3.969704533811997e-06, + "loss": 0.5724, + "step": 1679 + }, + { + "epoch": 1.8791946308724832, + "grad_norm": 0.5160763263702393, + "learning_rate": 3.96849730908572e-06, + "loss": 0.5812, + "step": 1680 + }, + { + "epoch": 1.8803131991051454, + "grad_norm": 0.5015366077423096, + "learning_rate": 3.967289561311949e-06, + "loss": 0.562, + "step": 1681 + }, + { + "epoch": 1.8814317673378076, + "grad_norm": 0.4753899574279785, + "learning_rate": 3.9660812909208575e-06, + "loss": 0.548, + "step": 1682 + }, + { + "epoch": 1.8825503355704698, + "grad_norm": 0.5044482350349426, + "learning_rate": 3.964872498342805e-06, + "loss": 0.606, + "step": 1683 + }, + { + "epoch": 1.883668903803132, + "grad_norm": 0.4928360879421234, + "learning_rate": 3.963663184008338e-06, + "loss": 0.5852, + "step": 1684 + }, + { + "epoch": 1.8847874720357942, + "grad_norm": 0.4930068254470825, + "learning_rate": 3.962453348348189e-06, + "loss": 0.5818, + "step": 1685 + }, + { + "epoch": 1.8859060402684564, + "grad_norm": 0.5112072229385376, + "learning_rate": 3.961242991793276e-06, + "loss": 0.6012, + "step": 1686 + }, + { + "epoch": 1.8870246085011186, + "grad_norm": 0.5157861113548279, + "learning_rate": 3.960032114774702e-06, + "loss": 0.6044, + "step": 1687 + }, + { + "epoch": 1.8881431767337808, + "grad_norm": 0.4951840937137604, + "learning_rate": 3.958820717723754e-06, + "loss": 0.5944, + "step": 1688 + }, + { + "epoch": 1.889261744966443, + "grad_norm": 0.5044351816177368, + "learning_rate": 3.957608801071907e-06, + "loss": 0.5814, + "step": 1689 + }, + { + "epoch": 1.8903803131991053, + "grad_norm": 0.498749315738678, + "learning_rate": 3.956396365250821e-06, + "loss": 0.5821, + "step": 1690 + }, + { + "epoch": 1.8914988814317675, + "grad_norm": 0.5246557593345642, + "learning_rate": 3.955183410692338e-06, + "loss": 0.5937, + "step": 1691 + }, + { + "epoch": 1.8926174496644297, + "grad_norm": 0.49238988757133484, + "learning_rate": 3.953969937828489e-06, + "loss": 0.5567, + "step": 1692 + }, + { + "epoch": 1.8937360178970917, + "grad_norm": 0.49119988083839417, + "learning_rate": 3.952755947091485e-06, + "loss": 0.553, + "step": 1693 + }, + { + "epoch": 1.8948545861297539, + "grad_norm": 0.5058138370513916, + "learning_rate": 3.951541438913723e-06, + "loss": 0.5559, + "step": 1694 + }, + { + "epoch": 1.895973154362416, + "grad_norm": 0.49434196949005127, + "learning_rate": 3.950326413727788e-06, + "loss": 0.5723, + "step": 1695 + }, + { + "epoch": 1.8970917225950783, + "grad_norm": 0.49539026618003845, + "learning_rate": 3.949110871966444e-06, + "loss": 0.5426, + "step": 1696 + }, + { + "epoch": 1.8982102908277405, + "grad_norm": 0.5126526355743408, + "learning_rate": 3.9478948140626414e-06, + "loss": 0.5969, + "step": 1697 + }, + { + "epoch": 1.8993288590604027, + "grad_norm": 0.5030180811882019, + "learning_rate": 3.946678240449515e-06, + "loss": 0.5667, + "step": 1698 + }, + { + "epoch": 1.900447427293065, + "grad_norm": 0.5048319101333618, + "learning_rate": 3.945461151560382e-06, + "loss": 0.5755, + "step": 1699 + }, + { + "epoch": 1.901565995525727, + "grad_norm": 0.49721550941467285, + "learning_rate": 3.944243547828742e-06, + "loss": 0.5752, + "step": 1700 + }, + { + "epoch": 1.9026845637583891, + "grad_norm": 0.5135868191719055, + "learning_rate": 3.943025429688281e-06, + "loss": 0.6088, + "step": 1701 + }, + { + "epoch": 1.9038031319910513, + "grad_norm": 0.48717236518859863, + "learning_rate": 3.941806797572867e-06, + "loss": 0.5583, + "step": 1702 + }, + { + "epoch": 1.9049217002237135, + "grad_norm": 0.4741756319999695, + "learning_rate": 3.940587651916551e-06, + "loss": 0.5203, + "step": 1703 + }, + { + "epoch": 1.9060402684563758, + "grad_norm": 0.49962809681892395, + "learning_rate": 3.939367993153566e-06, + "loss": 0.5598, + "step": 1704 + }, + { + "epoch": 1.907158836689038, + "grad_norm": 0.48792213201522827, + "learning_rate": 3.938147821718328e-06, + "loss": 0.5504, + "step": 1705 + }, + { + "epoch": 1.9082774049217002, + "grad_norm": 0.5173740386962891, + "learning_rate": 3.936927138045438e-06, + "loss": 0.5623, + "step": 1706 + }, + { + "epoch": 1.9093959731543624, + "grad_norm": 0.5256320834159851, + "learning_rate": 3.935705942569675e-06, + "loss": 0.5873, + "step": 1707 + }, + { + "epoch": 1.9105145413870246, + "grad_norm": 0.5090080499649048, + "learning_rate": 3.934484235726006e-06, + "loss": 0.6075, + "step": 1708 + }, + { + "epoch": 1.9116331096196868, + "grad_norm": 0.4803988039493561, + "learning_rate": 3.933262017949574e-06, + "loss": 0.5692, + "step": 1709 + }, + { + "epoch": 1.912751677852349, + "grad_norm": 0.4928509294986725, + "learning_rate": 3.932039289675709e-06, + "loss": 0.565, + "step": 1710 + }, + { + "epoch": 1.9138702460850112, + "grad_norm": 0.5102552771568298, + "learning_rate": 3.930816051339918e-06, + "loss": 0.5732, + "step": 1711 + }, + { + "epoch": 1.9149888143176734, + "grad_norm": 0.49371102452278137, + "learning_rate": 3.929592303377896e-06, + "loss": 0.5702, + "step": 1712 + }, + { + "epoch": 1.9161073825503356, + "grad_norm": 0.48887091875076294, + "learning_rate": 3.928368046225515e-06, + "loss": 0.5743, + "step": 1713 + }, + { + "epoch": 1.9172259507829978, + "grad_norm": 0.5040315985679626, + "learning_rate": 3.927143280318828e-06, + "loss": 0.5998, + "step": 1714 + }, + { + "epoch": 1.91834451901566, + "grad_norm": 0.5192286372184753, + "learning_rate": 3.92591800609407e-06, + "loss": 0.5694, + "step": 1715 + }, + { + "epoch": 1.9194630872483223, + "grad_norm": 0.5107104778289795, + "learning_rate": 3.924692223987661e-06, + "loss": 0.6087, + "step": 1716 + }, + { + "epoch": 1.9205816554809845, + "grad_norm": 0.5028025507926941, + "learning_rate": 3.923465934436195e-06, + "loss": 0.5696, + "step": 1717 + }, + { + "epoch": 1.9217002237136467, + "grad_norm": 0.5084637403488159, + "learning_rate": 3.922239137876452e-06, + "loss": 0.6032, + "step": 1718 + }, + { + "epoch": 1.9228187919463087, + "grad_norm": 0.5004264116287231, + "learning_rate": 3.921011834745392e-06, + "loss": 0.5559, + "step": 1719 + }, + { + "epoch": 1.9239373601789709, + "grad_norm": 0.522179365158081, + "learning_rate": 3.919784025480151e-06, + "loss": 0.612, + "step": 1720 + }, + { + "epoch": 1.925055928411633, + "grad_norm": 0.508663535118103, + "learning_rate": 3.918555710518051e-06, + "loss": 0.5731, + "step": 1721 + }, + { + "epoch": 1.9261744966442953, + "grad_norm": 0.5001826286315918, + "learning_rate": 3.917326890296591e-06, + "loss": 0.5906, + "step": 1722 + }, + { + "epoch": 1.9272930648769575, + "grad_norm": 0.4933156967163086, + "learning_rate": 3.91609756525345e-06, + "loss": 0.5816, + "step": 1723 + }, + { + "epoch": 1.9284116331096197, + "grad_norm": 0.48916542530059814, + "learning_rate": 3.914867735826489e-06, + "loss": 0.6014, + "step": 1724 + }, + { + "epoch": 1.929530201342282, + "grad_norm": 0.5224503874778748, + "learning_rate": 3.913637402453745e-06, + "loss": 0.5787, + "step": 1725 + }, + { + "epoch": 1.930648769574944, + "grad_norm": 0.4952391982078552, + "learning_rate": 3.912406565573438e-06, + "loss": 0.5606, + "step": 1726 + }, + { + "epoch": 1.9317673378076061, + "grad_norm": 0.5111702680587769, + "learning_rate": 3.911175225623965e-06, + "loss": 0.5668, + "step": 1727 + }, + { + "epoch": 1.9328859060402683, + "grad_norm": 0.4997490346431732, + "learning_rate": 3.909943383043904e-06, + "loss": 0.5695, + "step": 1728 + }, + { + "epoch": 1.9340044742729305, + "grad_norm": 0.5036013126373291, + "learning_rate": 3.908711038272011e-06, + "loss": 0.5928, + "step": 1729 + }, + { + "epoch": 1.9351230425055927, + "grad_norm": 0.5162931084632874, + "learning_rate": 3.907478191747221e-06, + "loss": 0.5749, + "step": 1730 + }, + { + "epoch": 1.936241610738255, + "grad_norm": 0.48660942912101746, + "learning_rate": 3.906244843908647e-06, + "loss": 0.5539, + "step": 1731 + }, + { + "epoch": 1.9373601789709172, + "grad_norm": 0.4941866993904114, + "learning_rate": 3.905010995195582e-06, + "loss": 0.5854, + "step": 1732 + }, + { + "epoch": 1.9384787472035794, + "grad_norm": 0.48834383487701416, + "learning_rate": 3.903776646047496e-06, + "loss": 0.6049, + "step": 1733 + }, + { + "epoch": 1.9395973154362416, + "grad_norm": 0.5004755258560181, + "learning_rate": 3.902541796904038e-06, + "loss": 0.5544, + "step": 1734 + }, + { + "epoch": 1.9407158836689038, + "grad_norm": 0.508080005645752, + "learning_rate": 3.901306448205035e-06, + "loss": 0.5804, + "step": 1735 + }, + { + "epoch": 1.941834451901566, + "grad_norm": 0.49556127190589905, + "learning_rate": 3.9000706003904934e-06, + "loss": 0.5871, + "step": 1736 + }, + { + "epoch": 1.9429530201342282, + "grad_norm": 0.49995389580726624, + "learning_rate": 3.898834253900594e-06, + "loss": 0.59, + "step": 1737 + }, + { + "epoch": 1.9440715883668904, + "grad_norm": 0.4921482503414154, + "learning_rate": 3.8975974091756975e-06, + "loss": 0.5611, + "step": 1738 + }, + { + "epoch": 1.9451901565995526, + "grad_norm": 0.5095739364624023, + "learning_rate": 3.896360066656342e-06, + "loss": 0.5636, + "step": 1739 + }, + { + "epoch": 1.9463087248322148, + "grad_norm": 0.4894288182258606, + "learning_rate": 3.895122226783243e-06, + "loss": 0.5896, + "step": 1740 + }, + { + "epoch": 1.947427293064877, + "grad_norm": 0.516846776008606, + "learning_rate": 3.893883889997292e-06, + "loss": 0.5938, + "step": 1741 + }, + { + "epoch": 1.9485458612975393, + "grad_norm": 0.4975781738758087, + "learning_rate": 3.892645056739559e-06, + "loss": 0.5635, + "step": 1742 + }, + { + "epoch": 1.9496644295302015, + "grad_norm": 0.49723535776138306, + "learning_rate": 3.891405727451289e-06, + "loss": 0.588, + "step": 1743 + }, + { + "epoch": 1.9507829977628637, + "grad_norm": 0.5051411390304565, + "learning_rate": 3.8901659025739044e-06, + "loss": 0.5574, + "step": 1744 + }, + { + "epoch": 1.951901565995526, + "grad_norm": 0.4824368953704834, + "learning_rate": 3.888925582549006e-06, + "loss": 0.5679, + "step": 1745 + }, + { + "epoch": 1.9530201342281879, + "grad_norm": 0.49928563833236694, + "learning_rate": 3.887684767818369e-06, + "loss": 0.5664, + "step": 1746 + }, + { + "epoch": 1.95413870246085, + "grad_norm": 0.5037205219268799, + "learning_rate": 3.886443458823944e-06, + "loss": 0.5685, + "step": 1747 + }, + { + "epoch": 1.9552572706935123, + "grad_norm": 0.49731746315956116, + "learning_rate": 3.88520165600786e-06, + "loss": 0.563, + "step": 1748 + }, + { + "epoch": 1.9563758389261745, + "grad_norm": 0.5214453935623169, + "learning_rate": 3.883959359812421e-06, + "loss": 0.5882, + "step": 1749 + }, + { + "epoch": 1.9574944071588367, + "grad_norm": 0.49750804901123047, + "learning_rate": 3.8827165706801055e-06, + "loss": 0.5872, + "step": 1750 + }, + { + "epoch": 1.958612975391499, + "grad_norm": 0.5092785358428955, + "learning_rate": 3.881473289053569e-06, + "loss": 0.5948, + "step": 1751 + }, + { + "epoch": 1.959731543624161, + "grad_norm": 0.5069229602813721, + "learning_rate": 3.880229515375642e-06, + "loss": 0.5836, + "step": 1752 + }, + { + "epoch": 1.9608501118568231, + "grad_norm": 0.48659929633140564, + "learning_rate": 3.878985250089331e-06, + "loss": 0.5723, + "step": 1753 + }, + { + "epoch": 1.9619686800894853, + "grad_norm": 0.4850320518016815, + "learning_rate": 3.8777404936378145e-06, + "loss": 0.567, + "step": 1754 + }, + { + "epoch": 1.9630872483221475, + "grad_norm": 0.49184632301330566, + "learning_rate": 3.87649524646445e-06, + "loss": 0.5594, + "step": 1755 + }, + { + "epoch": 1.9642058165548097, + "grad_norm": 0.5033354163169861, + "learning_rate": 3.875249509012769e-06, + "loss": 0.5774, + "step": 1756 + }, + { + "epoch": 1.965324384787472, + "grad_norm": 0.4867788851261139, + "learning_rate": 3.874003281726474e-06, + "loss": 0.5656, + "step": 1757 + }, + { + "epoch": 1.9664429530201342, + "grad_norm": 0.5024017095565796, + "learning_rate": 3.872756565049447e-06, + "loss": 0.542, + "step": 1758 + }, + { + "epoch": 1.9675615212527964, + "grad_norm": 0.4939708709716797, + "learning_rate": 3.871509359425741e-06, + "loss": 0.5554, + "step": 1759 + }, + { + "epoch": 1.9686800894854586, + "grad_norm": 0.48120421171188354, + "learning_rate": 3.870261665299583e-06, + "loss": 0.5724, + "step": 1760 + }, + { + "epoch": 1.9697986577181208, + "grad_norm": 0.49949952960014343, + "learning_rate": 3.869013483115378e-06, + "loss": 0.5732, + "step": 1761 + }, + { + "epoch": 1.970917225950783, + "grad_norm": 0.5079313516616821, + "learning_rate": 3.867764813317699e-06, + "loss": 0.5969, + "step": 1762 + }, + { + "epoch": 1.9720357941834452, + "grad_norm": 0.5027442574501038, + "learning_rate": 3.866515656351297e-06, + "loss": 0.5742, + "step": 1763 + }, + { + "epoch": 1.9731543624161074, + "grad_norm": 0.4810044765472412, + "learning_rate": 3.865266012661095e-06, + "loss": 0.5704, + "step": 1764 + }, + { + "epoch": 1.9742729306487696, + "grad_norm": 0.49268779158592224, + "learning_rate": 3.86401588269219e-06, + "loss": 0.5459, + "step": 1765 + }, + { + "epoch": 1.9753914988814318, + "grad_norm": 0.4957699179649353, + "learning_rate": 3.86276526688985e-06, + "loss": 0.5722, + "step": 1766 + }, + { + "epoch": 1.976510067114094, + "grad_norm": 0.4951817989349365, + "learning_rate": 3.8615141656995194e-06, + "loss": 0.5564, + "step": 1767 + }, + { + "epoch": 1.9776286353467563, + "grad_norm": 0.5150087475776672, + "learning_rate": 3.860262579566813e-06, + "loss": 0.5721, + "step": 1768 + }, + { + "epoch": 1.9787472035794185, + "grad_norm": 0.5071924328804016, + "learning_rate": 3.859010508937521e-06, + "loss": 0.5685, + "step": 1769 + }, + { + "epoch": 1.9798657718120807, + "grad_norm": 0.5006820559501648, + "learning_rate": 3.857757954257601e-06, + "loss": 0.5701, + "step": 1770 + }, + { + "epoch": 1.9809843400447429, + "grad_norm": 0.5228837132453918, + "learning_rate": 3.856504915973188e-06, + "loss": 0.5691, + "step": 1771 + }, + { + "epoch": 1.9821029082774049, + "grad_norm": 0.4966356158256531, + "learning_rate": 3.855251394530589e-06, + "loss": 0.5598, + "step": 1772 + }, + { + "epoch": 1.983221476510067, + "grad_norm": 0.5056618452072144, + "learning_rate": 3.85399739037628e-06, + "loss": 0.5656, + "step": 1773 + }, + { + "epoch": 1.9843400447427293, + "grad_norm": 0.49823620915412903, + "learning_rate": 3.85274290395691e-06, + "loss": 0.5685, + "step": 1774 + }, + { + "epoch": 1.9854586129753915, + "grad_norm": 0.5027350187301636, + "learning_rate": 3.851487935719302e-06, + "loss": 0.5697, + "step": 1775 + }, + { + "epoch": 1.9865771812080537, + "grad_norm": 0.5098000764846802, + "learning_rate": 3.8502324861104475e-06, + "loss": 0.5658, + "step": 1776 + }, + { + "epoch": 1.987695749440716, + "grad_norm": 0.5115095376968384, + "learning_rate": 3.848976555577513e-06, + "loss": 0.5695, + "step": 1777 + }, + { + "epoch": 1.9888143176733781, + "grad_norm": 0.485042005777359, + "learning_rate": 3.847720144567832e-06, + "loss": 0.5505, + "step": 1778 + }, + { + "epoch": 1.9899328859060401, + "grad_norm": 0.5096463561058044, + "learning_rate": 3.846463253528913e-06, + "loss": 0.576, + "step": 1779 + }, + { + "epoch": 1.9910514541387023, + "grad_norm": 0.508958101272583, + "learning_rate": 3.845205882908432e-06, + "loss": 0.58, + "step": 1780 + }, + { + "epoch": 1.9921700223713645, + "grad_norm": 0.5001410245895386, + "learning_rate": 3.8439480331542394e-06, + "loss": 0.5778, + "step": 1781 + }, + { + "epoch": 1.9932885906040267, + "grad_norm": 0.49567386507987976, + "learning_rate": 3.842689704714354e-06, + "loss": 0.5688, + "step": 1782 + }, + { + "epoch": 1.994407158836689, + "grad_norm": 0.5215677618980408, + "learning_rate": 3.841430898036966e-06, + "loss": 0.5879, + "step": 1783 + }, + { + "epoch": 1.9955257270693512, + "grad_norm": 0.5176389217376709, + "learning_rate": 3.840171613570435e-06, + "loss": 0.5788, + "step": 1784 + }, + { + "epoch": 1.9966442953020134, + "grad_norm": 0.5101540088653564, + "learning_rate": 3.838911851763291e-06, + "loss": 0.563, + "step": 1785 + }, + { + "epoch": 1.9977628635346756, + "grad_norm": 0.5037233233451843, + "learning_rate": 3.837651613064234e-06, + "loss": 0.582, + "step": 1786 + }, + { + "epoch": 1.9988814317673378, + "grad_norm": 0.5072547197341919, + "learning_rate": 3.836390897922136e-06, + "loss": 0.5856, + "step": 1787 + }, + { + "epoch": 2.0, + "grad_norm": 0.49261927604675293, + "learning_rate": 3.835129706786036e-06, + "loss": 0.5879, + "step": 1788 + } + ], + "logging_steps": 1, + "max_steps": 5364, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 894, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.005375691167105e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}