{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 4164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004803073967339097, "grad_norm": 12.295453677898383, "learning_rate": 1.6e-08, "loss": 0.3404, "step": 1 }, { "epoch": 0.0009606147934678194, "grad_norm": 11.115325503679177, "learning_rate": 3.2e-08, "loss": 0.2956, "step": 2 }, { "epoch": 0.001440922190201729, "grad_norm": 12.134299471519068, "learning_rate": 4.8e-08, "loss": 0.3497, "step": 3 }, { "epoch": 0.0019212295869356388, "grad_norm": 11.889325768094226, "learning_rate": 6.4e-08, "loss": 0.3239, "step": 4 }, { "epoch": 0.0024015369836695487, "grad_norm": 11.017686747111874, "learning_rate": 8e-08, "loss": 0.2895, "step": 5 }, { "epoch": 0.002881844380403458, "grad_norm": 10.764449822783085, "learning_rate": 9.6e-08, "loss": 0.3062, "step": 6 }, { "epoch": 0.0033621517771373678, "grad_norm": 9.810863684605177, "learning_rate": 1.1200000000000001e-07, "loss": 0.3056, "step": 7 }, { "epoch": 0.0038424591738712775, "grad_norm": 11.438972361023298, "learning_rate": 1.28e-07, "loss": 0.294, "step": 8 }, { "epoch": 0.004322766570605188, "grad_norm": 11.696011568247082, "learning_rate": 1.4400000000000002e-07, "loss": 0.3285, "step": 9 }, { "epoch": 0.004803073967339097, "grad_norm": 11.507512344037782, "learning_rate": 1.6e-07, "loss": 0.314, "step": 10 }, { "epoch": 0.005283381364073006, "grad_norm": 10.684581565163377, "learning_rate": 1.7600000000000001e-07, "loss": 0.2883, "step": 11 }, { "epoch": 0.005763688760806916, "grad_norm": 11.24558403809558, "learning_rate": 1.92e-07, "loss": 0.3116, "step": 12 }, { "epoch": 0.006243996157540826, "grad_norm": 10.790135146175551, "learning_rate": 2.08e-07, "loss": 0.3244, "step": 13 }, { "epoch": 0.0067243035542747355, "grad_norm": 10.608527245093223, "learning_rate": 2.2400000000000002e-07, "loss": 0.2747, "step": 14 }, { "epoch": 0.007204610951008645, "grad_norm": 10.265000231723027, "learning_rate": 2.4000000000000003e-07, "loss": 0.2674, "step": 15 }, { "epoch": 0.007684918347742555, "grad_norm": 11.298317377682416, "learning_rate": 2.56e-07, "loss": 0.2679, "step": 16 }, { "epoch": 0.008165225744476465, "grad_norm": 9.86248788668308, "learning_rate": 2.72e-07, "loss": 0.3099, "step": 17 }, { "epoch": 0.008645533141210375, "grad_norm": 9.0577072692478, "learning_rate": 2.8800000000000004e-07, "loss": 0.2674, "step": 18 }, { "epoch": 0.009125840537944284, "grad_norm": 7.238412780645614, "learning_rate": 3.04e-07, "loss": 0.2082, "step": 19 }, { "epoch": 0.009606147934678195, "grad_norm": 9.490442029212941, "learning_rate": 3.2e-07, "loss": 0.2548, "step": 20 }, { "epoch": 0.010086455331412104, "grad_norm": 7.936036846955943, "learning_rate": 3.36e-07, "loss": 0.2487, "step": 21 }, { "epoch": 0.010566762728146013, "grad_norm": 8.384911819682703, "learning_rate": 3.5200000000000003e-07, "loss": 0.2578, "step": 22 }, { "epoch": 0.011047070124879923, "grad_norm": 6.667251467184058, "learning_rate": 3.68e-07, "loss": 0.2315, "step": 23 }, { "epoch": 0.011527377521613832, "grad_norm": 5.388532747106631, "learning_rate": 3.84e-07, "loss": 0.2153, "step": 24 }, { "epoch": 0.012007684918347743, "grad_norm": 5.076276759500522, "learning_rate": 4.0000000000000003e-07, "loss": 0.1773, "step": 25 }, { "epoch": 0.012487992315081652, "grad_norm": 5.044901367454851, "learning_rate": 4.16e-07, "loss": 0.1981, "step": 26 }, { "epoch": 0.012968299711815562, "grad_norm": 4.6484570213541145, "learning_rate": 4.3200000000000006e-07, "loss": 0.1962, "step": 27 }, { "epoch": 0.013448607108549471, "grad_norm": 4.820579032408368, "learning_rate": 4.4800000000000004e-07, "loss": 0.1923, "step": 28 }, { "epoch": 0.013928914505283382, "grad_norm": 3.6954293876191535, "learning_rate": 4.64e-07, "loss": 0.1827, "step": 29 }, { "epoch": 0.01440922190201729, "grad_norm": 3.8040876250915083, "learning_rate": 4.800000000000001e-07, "loss": 0.1959, "step": 30 }, { "epoch": 0.014889529298751201, "grad_norm": 3.0166885970679043, "learning_rate": 4.96e-07, "loss": 0.1652, "step": 31 }, { "epoch": 0.01536983669548511, "grad_norm": 2.2910049629070497, "learning_rate": 5.12e-07, "loss": 0.1723, "step": 32 }, { "epoch": 0.01585014409221902, "grad_norm": 2.7933136537228744, "learning_rate": 5.280000000000001e-07, "loss": 0.1663, "step": 33 }, { "epoch": 0.01633045148895293, "grad_norm": 2.5494412148634487, "learning_rate": 5.44e-07, "loss": 0.1593, "step": 34 }, { "epoch": 0.01681075888568684, "grad_norm": 2.7365853076508504, "learning_rate": 5.6e-07, "loss": 0.1845, "step": 35 }, { "epoch": 0.01729106628242075, "grad_norm": 2.6326300548480956, "learning_rate": 5.760000000000001e-07, "loss": 0.1734, "step": 36 }, { "epoch": 0.01777137367915466, "grad_norm": 1.9178697505579982, "learning_rate": 5.920000000000001e-07, "loss": 0.1249, "step": 37 }, { "epoch": 0.01825168107588857, "grad_norm": 2.5724433031106853, "learning_rate": 6.08e-07, "loss": 0.1308, "step": 38 }, { "epoch": 0.018731988472622477, "grad_norm": 2.6912073654452016, "learning_rate": 6.24e-07, "loss": 0.1527, "step": 39 }, { "epoch": 0.01921229586935639, "grad_norm": 2.167890593402019, "learning_rate": 6.4e-07, "loss": 0.1551, "step": 40 }, { "epoch": 0.0196926032660903, "grad_norm": 1.9023821581489182, "learning_rate": 6.560000000000002e-07, "loss": 0.1264, "step": 41 }, { "epoch": 0.020172910662824207, "grad_norm": 3.3946961363245363, "learning_rate": 6.72e-07, "loss": 0.183, "step": 42 }, { "epoch": 0.020653218059558116, "grad_norm": 2.2864808587177783, "learning_rate": 6.88e-07, "loss": 0.1473, "step": 43 }, { "epoch": 0.021133525456292025, "grad_norm": 2.0274847089097765, "learning_rate": 7.040000000000001e-07, "loss": 0.125, "step": 44 }, { "epoch": 0.021613832853025938, "grad_norm": 2.315770203936131, "learning_rate": 7.2e-07, "loss": 0.1362, "step": 45 }, { "epoch": 0.022094140249759846, "grad_norm": 2.305110403527537, "learning_rate": 7.36e-07, "loss": 0.1515, "step": 46 }, { "epoch": 0.022574447646493755, "grad_norm": 2.3692793672422985, "learning_rate": 7.520000000000001e-07, "loss": 0.1427, "step": 47 }, { "epoch": 0.023054755043227664, "grad_norm": 1.853538426035806, "learning_rate": 7.68e-07, "loss": 0.13, "step": 48 }, { "epoch": 0.023535062439961577, "grad_norm": 1.7993480609748818, "learning_rate": 7.84e-07, "loss": 0.1384, "step": 49 }, { "epoch": 0.024015369836695485, "grad_norm": 1.2624888010807611, "learning_rate": 8.000000000000001e-07, "loss": 0.0812, "step": 50 }, { "epoch": 0.024495677233429394, "grad_norm": 1.654775529490419, "learning_rate": 8.160000000000001e-07, "loss": 0.1289, "step": 51 }, { "epoch": 0.024975984630163303, "grad_norm": 1.9053266889486429, "learning_rate": 8.32e-07, "loss": 0.128, "step": 52 }, { "epoch": 0.025456292026897216, "grad_norm": 1.3131685994032491, "learning_rate": 8.480000000000001e-07, "loss": 0.1099, "step": 53 }, { "epoch": 0.025936599423631124, "grad_norm": 1.8073567315806054, "learning_rate": 8.640000000000001e-07, "loss": 0.1209, "step": 54 }, { "epoch": 0.026416906820365033, "grad_norm": 1.5555744040575845, "learning_rate": 8.8e-07, "loss": 0.1268, "step": 55 }, { "epoch": 0.026897214217098942, "grad_norm": 1.1507622935499833, "learning_rate": 8.960000000000001e-07, "loss": 0.0939, "step": 56 }, { "epoch": 0.027377521613832854, "grad_norm": 1.8426592777636386, "learning_rate": 9.120000000000001e-07, "loss": 0.1267, "step": 57 }, { "epoch": 0.027857829010566763, "grad_norm": 1.8172160960989603, "learning_rate": 9.28e-07, "loss": 0.1419, "step": 58 }, { "epoch": 0.028338136407300672, "grad_norm": 1.734138032657134, "learning_rate": 9.440000000000001e-07, "loss": 0.1063, "step": 59 }, { "epoch": 0.02881844380403458, "grad_norm": 1.9904099384917207, "learning_rate": 9.600000000000001e-07, "loss": 0.1547, "step": 60 }, { "epoch": 0.029298751200768493, "grad_norm": 1.431824839460655, "learning_rate": 9.76e-07, "loss": 0.1297, "step": 61 }, { "epoch": 0.029779058597502402, "grad_norm": 1.3047846711226216, "learning_rate": 9.92e-07, "loss": 0.0861, "step": 62 }, { "epoch": 0.03025936599423631, "grad_norm": 1.9570883092765483, "learning_rate": 1.0080000000000001e-06, "loss": 0.1552, "step": 63 }, { "epoch": 0.03073967339097022, "grad_norm": 1.2651593114993294, "learning_rate": 1.024e-06, "loss": 0.0973, "step": 64 }, { "epoch": 0.03121998078770413, "grad_norm": 1.576842206277053, "learning_rate": 1.04e-06, "loss": 0.1015, "step": 65 }, { "epoch": 0.03170028818443804, "grad_norm": 1.7388488234803576, "learning_rate": 1.0560000000000001e-06, "loss": 0.1098, "step": 66 }, { "epoch": 0.03218059558117195, "grad_norm": 1.5418141689606337, "learning_rate": 1.072e-06, "loss": 0.0982, "step": 67 }, { "epoch": 0.03266090297790586, "grad_norm": 1.7260585222683362, "learning_rate": 1.088e-06, "loss": 0.1348, "step": 68 }, { "epoch": 0.03314121037463977, "grad_norm": 1.4436799175309636, "learning_rate": 1.1040000000000001e-06, "loss": 0.1192, "step": 69 }, { "epoch": 0.03362151777137368, "grad_norm": 3.222503996824014, "learning_rate": 1.12e-06, "loss": 0.1234, "step": 70 }, { "epoch": 0.034101825168107586, "grad_norm": 1.6939823082128094, "learning_rate": 1.1360000000000002e-06, "loss": 0.1165, "step": 71 }, { "epoch": 0.0345821325648415, "grad_norm": 1.8379294842379774, "learning_rate": 1.1520000000000002e-06, "loss": 0.1185, "step": 72 }, { "epoch": 0.03506243996157541, "grad_norm": 1.5569252017032666, "learning_rate": 1.168e-06, "loss": 0.1312, "step": 73 }, { "epoch": 0.03554274735830932, "grad_norm": 1.714684617371849, "learning_rate": 1.1840000000000002e-06, "loss": 0.0876, "step": 74 }, { "epoch": 0.03602305475504323, "grad_norm": 1.3225072536309823, "learning_rate": 1.2000000000000002e-06, "loss": 0.0957, "step": 75 }, { "epoch": 0.03650336215177714, "grad_norm": 1.8975516746309764, "learning_rate": 1.216e-06, "loss": 0.1211, "step": 76 }, { "epoch": 0.036983669548511046, "grad_norm": 1.5164480525596415, "learning_rate": 1.2320000000000002e-06, "loss": 0.1117, "step": 77 }, { "epoch": 0.037463976945244955, "grad_norm": 1.8598013368901916, "learning_rate": 1.248e-06, "loss": 0.0974, "step": 78 }, { "epoch": 0.037944284341978864, "grad_norm": 1.6096443930527105, "learning_rate": 1.2640000000000003e-06, "loss": 0.1066, "step": 79 }, { "epoch": 0.03842459173871278, "grad_norm": 1.1438512252152493, "learning_rate": 1.28e-06, "loss": 0.096, "step": 80 }, { "epoch": 0.03890489913544669, "grad_norm": 1.464888807639377, "learning_rate": 1.296e-06, "loss": 0.0982, "step": 81 }, { "epoch": 0.0393852065321806, "grad_norm": 1.81847293552993, "learning_rate": 1.3120000000000003e-06, "loss": 0.1419, "step": 82 }, { "epoch": 0.039865513928914506, "grad_norm": 1.7194540925734543, "learning_rate": 1.328e-06, "loss": 0.1054, "step": 83 }, { "epoch": 0.040345821325648415, "grad_norm": 1.46339302106847, "learning_rate": 1.344e-06, "loss": 0.1246, "step": 84 }, { "epoch": 0.040826128722382324, "grad_norm": 1.6380410081431438, "learning_rate": 1.3600000000000001e-06, "loss": 0.0806, "step": 85 }, { "epoch": 0.04130643611911623, "grad_norm": 1.8756328408381668, "learning_rate": 1.376e-06, "loss": 0.1188, "step": 86 }, { "epoch": 0.04178674351585014, "grad_norm": 1.4464326251601627, "learning_rate": 1.392e-06, "loss": 0.113, "step": 87 }, { "epoch": 0.04226705091258405, "grad_norm": 1.240058007444482, "learning_rate": 1.4080000000000001e-06, "loss": 0.0934, "step": 88 }, { "epoch": 0.042747358309317966, "grad_norm": 2.3178241893688645, "learning_rate": 1.424e-06, "loss": 0.1126, "step": 89 }, { "epoch": 0.043227665706051875, "grad_norm": 1.7177471908214919, "learning_rate": 1.44e-06, "loss": 0.1345, "step": 90 }, { "epoch": 0.043707973102785784, "grad_norm": 2.7314130913960812, "learning_rate": 1.4560000000000001e-06, "loss": 0.1069, "step": 91 }, { "epoch": 0.04418828049951969, "grad_norm": 1.0979463668294076, "learning_rate": 1.472e-06, "loss": 0.0966, "step": 92 }, { "epoch": 0.0446685878962536, "grad_norm": 1.4821760121201706, "learning_rate": 1.488e-06, "loss": 0.1048, "step": 93 }, { "epoch": 0.04514889529298751, "grad_norm": 1.6721548051611312, "learning_rate": 1.5040000000000001e-06, "loss": 0.1065, "step": 94 }, { "epoch": 0.04562920268972142, "grad_norm": 1.4046486532313425, "learning_rate": 1.52e-06, "loss": 0.1121, "step": 95 }, { "epoch": 0.04610951008645533, "grad_norm": 1.8920282885751272, "learning_rate": 1.536e-06, "loss": 0.093, "step": 96 }, { "epoch": 0.046589817483189244, "grad_norm": 1.3117971937414172, "learning_rate": 1.5520000000000001e-06, "loss": 0.0763, "step": 97 }, { "epoch": 0.04707012487992315, "grad_norm": 1.2226066484010771, "learning_rate": 1.568e-06, "loss": 0.0925, "step": 98 }, { "epoch": 0.04755043227665706, "grad_norm": 1.2523216029904312, "learning_rate": 1.5840000000000002e-06, "loss": 0.0886, "step": 99 }, { "epoch": 0.04803073967339097, "grad_norm": 1.2711895556124746, "learning_rate": 1.6000000000000001e-06, "loss": 0.0871, "step": 100 }, { "epoch": 0.04851104707012488, "grad_norm": 3.040299010535842, "learning_rate": 1.616e-06, "loss": 0.0745, "step": 101 }, { "epoch": 0.04899135446685879, "grad_norm": 1.6389584028051778, "learning_rate": 1.6320000000000002e-06, "loss": 0.1278, "step": 102 }, { "epoch": 0.0494716618635927, "grad_norm": 1.5552388728614976, "learning_rate": 1.6480000000000001e-06, "loss": 0.0993, "step": 103 }, { "epoch": 0.049951969260326606, "grad_norm": 1.3887590517997233, "learning_rate": 1.664e-06, "loss": 0.0846, "step": 104 }, { "epoch": 0.05043227665706052, "grad_norm": 1.5458592978622285, "learning_rate": 1.6800000000000002e-06, "loss": 0.1046, "step": 105 }, { "epoch": 0.05091258405379443, "grad_norm": 2.9179450722754434, "learning_rate": 1.6960000000000002e-06, "loss": 0.1242, "step": 106 }, { "epoch": 0.05139289145052834, "grad_norm": 2.7394754636725693, "learning_rate": 1.712e-06, "loss": 0.0819, "step": 107 }, { "epoch": 0.05187319884726225, "grad_norm": 2.077429221247708, "learning_rate": 1.7280000000000002e-06, "loss": 0.1007, "step": 108 }, { "epoch": 0.05235350624399616, "grad_norm": 1.2362929358102204, "learning_rate": 1.7440000000000002e-06, "loss": 0.0975, "step": 109 }, { "epoch": 0.052833813640730067, "grad_norm": 1.5896189455542367, "learning_rate": 1.76e-06, "loss": 0.1118, "step": 110 }, { "epoch": 0.053314121037463975, "grad_norm": 1.8111208272150052, "learning_rate": 1.7760000000000002e-06, "loss": 0.1173, "step": 111 }, { "epoch": 0.053794428434197884, "grad_norm": 1.1388547833141291, "learning_rate": 1.7920000000000002e-06, "loss": 0.0715, "step": 112 }, { "epoch": 0.05427473583093179, "grad_norm": 1.637903840464677, "learning_rate": 1.808e-06, "loss": 0.1148, "step": 113 }, { "epoch": 0.05475504322766571, "grad_norm": 1.6505805474404067, "learning_rate": 1.8240000000000002e-06, "loss": 0.1086, "step": 114 }, { "epoch": 0.05523535062439962, "grad_norm": 2.027298880252341, "learning_rate": 1.8400000000000002e-06, "loss": 0.1137, "step": 115 }, { "epoch": 0.05571565802113353, "grad_norm": 1.9867450378269047, "learning_rate": 1.856e-06, "loss": 0.0949, "step": 116 }, { "epoch": 0.056195965417867436, "grad_norm": 1.0287961975246656, "learning_rate": 1.8720000000000002e-06, "loss": 0.0993, "step": 117 }, { "epoch": 0.056676272814601344, "grad_norm": 3.6886267076116726, "learning_rate": 1.8880000000000002e-06, "loss": 0.0734, "step": 118 }, { "epoch": 0.05715658021133525, "grad_norm": 1.4209403055185024, "learning_rate": 1.9040000000000003e-06, "loss": 0.088, "step": 119 }, { "epoch": 0.05763688760806916, "grad_norm": 1.5252469546994676, "learning_rate": 1.9200000000000003e-06, "loss": 0.1195, "step": 120 }, { "epoch": 0.05811719500480307, "grad_norm": 1.2043288229655742, "learning_rate": 1.936e-06, "loss": 0.0929, "step": 121 }, { "epoch": 0.05859750240153699, "grad_norm": 4.538691137978197, "learning_rate": 1.952e-06, "loss": 0.0773, "step": 122 }, { "epoch": 0.059077809798270896, "grad_norm": 1.6211192048515701, "learning_rate": 1.968e-06, "loss": 0.1006, "step": 123 }, { "epoch": 0.059558117195004805, "grad_norm": 1.1742642998496757, "learning_rate": 1.984e-06, "loss": 0.0682, "step": 124 }, { "epoch": 0.060038424591738714, "grad_norm": 1.3758629673989613, "learning_rate": 2.0000000000000003e-06, "loss": 0.0732, "step": 125 }, { "epoch": 0.06051873198847262, "grad_norm": 1.4595665452207671, "learning_rate": 2.0160000000000003e-06, "loss": 0.0885, "step": 126 }, { "epoch": 0.06099903938520653, "grad_norm": 1.2343001806608365, "learning_rate": 2.032e-06, "loss": 0.0871, "step": 127 }, { "epoch": 0.06147934678194044, "grad_norm": 1.4944430814568856, "learning_rate": 2.048e-06, "loss": 0.0987, "step": 128 }, { "epoch": 0.06195965417867435, "grad_norm": 1.1669826735751045, "learning_rate": 2.064e-06, "loss": 0.0828, "step": 129 }, { "epoch": 0.06243996157540826, "grad_norm": 2.050815392469096, "learning_rate": 2.08e-06, "loss": 0.1436, "step": 130 }, { "epoch": 0.06292026897214217, "grad_norm": 2.6189886490073837, "learning_rate": 2.0960000000000003e-06, "loss": 0.1121, "step": 131 }, { "epoch": 0.06340057636887608, "grad_norm": 1.465151382295218, "learning_rate": 2.1120000000000003e-06, "loss": 0.1569, "step": 132 }, { "epoch": 0.06388088376560999, "grad_norm": 1.6059405590984388, "learning_rate": 2.128e-06, "loss": 0.0891, "step": 133 }, { "epoch": 0.0643611911623439, "grad_norm": 1.3606090814821632, "learning_rate": 2.144e-06, "loss": 0.104, "step": 134 }, { "epoch": 0.06484149855907781, "grad_norm": 1.1881719471401748, "learning_rate": 2.16e-06, "loss": 0.0775, "step": 135 }, { "epoch": 0.06532180595581172, "grad_norm": 1.205747071113525, "learning_rate": 2.176e-06, "loss": 0.084, "step": 136 }, { "epoch": 0.06580211335254563, "grad_norm": 1.2301202707311236, "learning_rate": 2.1920000000000004e-06, "loss": 0.0647, "step": 137 }, { "epoch": 0.06628242074927954, "grad_norm": 1.154403895526943, "learning_rate": 2.2080000000000003e-06, "loss": 0.0884, "step": 138 }, { "epoch": 0.06676272814601344, "grad_norm": 1.1013298512945302, "learning_rate": 2.2240000000000002e-06, "loss": 0.0643, "step": 139 }, { "epoch": 0.06724303554274735, "grad_norm": 1.0932013932642193, "learning_rate": 2.24e-06, "loss": 0.092, "step": 140 }, { "epoch": 0.06772334293948126, "grad_norm": 1.0454452002652364, "learning_rate": 2.256e-06, "loss": 0.0799, "step": 141 }, { "epoch": 0.06820365033621517, "grad_norm": 1.4743566119826792, "learning_rate": 2.2720000000000004e-06, "loss": 0.1136, "step": 142 }, { "epoch": 0.0686839577329491, "grad_norm": 1.4039096246861968, "learning_rate": 2.2880000000000004e-06, "loss": 0.0909, "step": 143 }, { "epoch": 0.069164265129683, "grad_norm": 1.2657755183130568, "learning_rate": 2.3040000000000003e-06, "loss": 0.1117, "step": 144 }, { "epoch": 0.06964457252641691, "grad_norm": 1.5851847672258597, "learning_rate": 2.3200000000000002e-06, "loss": 0.0951, "step": 145 }, { "epoch": 0.07012487992315082, "grad_norm": 1.2581561491117204, "learning_rate": 2.336e-06, "loss": 0.0878, "step": 146 }, { "epoch": 0.07060518731988473, "grad_norm": 1.420979749511269, "learning_rate": 2.352e-06, "loss": 0.0936, "step": 147 }, { "epoch": 0.07108549471661864, "grad_norm": 1.3031371111764456, "learning_rate": 2.3680000000000005e-06, "loss": 0.0909, "step": 148 }, { "epoch": 0.07156580211335255, "grad_norm": 1.4573325610550967, "learning_rate": 2.3840000000000004e-06, "loss": 0.0827, "step": 149 }, { "epoch": 0.07204610951008646, "grad_norm": 1.3047911271077164, "learning_rate": 2.4000000000000003e-06, "loss": 0.0851, "step": 150 }, { "epoch": 0.07252641690682037, "grad_norm": 1.4176849309242068, "learning_rate": 2.4160000000000002e-06, "loss": 0.0831, "step": 151 }, { "epoch": 0.07300672430355427, "grad_norm": 2.5041223053166903, "learning_rate": 2.432e-06, "loss": 0.0963, "step": 152 }, { "epoch": 0.07348703170028818, "grad_norm": 1.2243042228153846, "learning_rate": 2.448e-06, "loss": 0.092, "step": 153 }, { "epoch": 0.07396733909702209, "grad_norm": 1.3595133894472542, "learning_rate": 2.4640000000000005e-06, "loss": 0.1088, "step": 154 }, { "epoch": 0.074447646493756, "grad_norm": 1.8046494765641843, "learning_rate": 2.4800000000000004e-06, "loss": 0.1067, "step": 155 }, { "epoch": 0.07492795389048991, "grad_norm": 1.3961366152512769, "learning_rate": 2.496e-06, "loss": 0.1066, "step": 156 }, { "epoch": 0.07540826128722382, "grad_norm": 1.569757837719253, "learning_rate": 2.512e-06, "loss": 0.116, "step": 157 }, { "epoch": 0.07588856868395773, "grad_norm": 1.2274453659822118, "learning_rate": 2.5280000000000006e-06, "loss": 0.0888, "step": 158 }, { "epoch": 0.07636887608069164, "grad_norm": 1.3105009460198556, "learning_rate": 2.5440000000000005e-06, "loss": 0.109, "step": 159 }, { "epoch": 0.07684918347742556, "grad_norm": 1.1721145729988776, "learning_rate": 2.56e-06, "loss": 0.0994, "step": 160 }, { "epoch": 0.07732949087415947, "grad_norm": 1.2096805266515924, "learning_rate": 2.576e-06, "loss": 0.0887, "step": 161 }, { "epoch": 0.07780979827089338, "grad_norm": 2.25466141210399, "learning_rate": 2.592e-06, "loss": 0.0977, "step": 162 }, { "epoch": 0.07829010566762729, "grad_norm": 1.2284935839699218, "learning_rate": 2.608e-06, "loss": 0.0991, "step": 163 }, { "epoch": 0.0787704130643612, "grad_norm": 1.1932140220390712, "learning_rate": 2.6240000000000006e-06, "loss": 0.0866, "step": 164 }, { "epoch": 0.0792507204610951, "grad_norm": 1.0789509823335703, "learning_rate": 2.64e-06, "loss": 0.0996, "step": 165 }, { "epoch": 0.07973102785782901, "grad_norm": 0.9371763943071141, "learning_rate": 2.656e-06, "loss": 0.0582, "step": 166 }, { "epoch": 0.08021133525456292, "grad_norm": 1.7368430972984232, "learning_rate": 2.672e-06, "loss": 0.1086, "step": 167 }, { "epoch": 0.08069164265129683, "grad_norm": 1.6755808286564804, "learning_rate": 2.688e-06, "loss": 0.0873, "step": 168 }, { "epoch": 0.08117195004803074, "grad_norm": 1.3121940443924218, "learning_rate": 2.704e-06, "loss": 0.0805, "step": 169 }, { "epoch": 0.08165225744476465, "grad_norm": 1.3271980163235704, "learning_rate": 2.7200000000000002e-06, "loss": 0.0813, "step": 170 }, { "epoch": 0.08213256484149856, "grad_norm": 1.1932639416175121, "learning_rate": 2.736e-06, "loss": 0.0823, "step": 171 }, { "epoch": 0.08261287223823247, "grad_norm": 1.1588476048524006, "learning_rate": 2.752e-06, "loss": 0.0987, "step": 172 }, { "epoch": 0.08309317963496637, "grad_norm": 0.9072215194037728, "learning_rate": 2.768e-06, "loss": 0.067, "step": 173 }, { "epoch": 0.08357348703170028, "grad_norm": 1.356010402076568, "learning_rate": 2.784e-06, "loss": 0.0868, "step": 174 }, { "epoch": 0.08405379442843419, "grad_norm": 1.2650989878632668, "learning_rate": 2.8000000000000003e-06, "loss": 0.0812, "step": 175 }, { "epoch": 0.0845341018251681, "grad_norm": 1.4802205189991515, "learning_rate": 2.8160000000000002e-06, "loss": 0.099, "step": 176 }, { "epoch": 0.08501440922190202, "grad_norm": 1.1417060742721667, "learning_rate": 2.832e-06, "loss": 0.0815, "step": 177 }, { "epoch": 0.08549471661863593, "grad_norm": 1.3705511538151176, "learning_rate": 2.848e-06, "loss": 0.1102, "step": 178 }, { "epoch": 0.08597502401536984, "grad_norm": 1.3163532047784325, "learning_rate": 2.864e-06, "loss": 0.0755, "step": 179 }, { "epoch": 0.08645533141210375, "grad_norm": 1.47238236129405, "learning_rate": 2.88e-06, "loss": 0.0949, "step": 180 }, { "epoch": 0.08693563880883766, "grad_norm": 0.9260065978895016, "learning_rate": 2.8960000000000003e-06, "loss": 0.0586, "step": 181 }, { "epoch": 0.08741594620557157, "grad_norm": 1.311717165904964, "learning_rate": 2.9120000000000002e-06, "loss": 0.0816, "step": 182 }, { "epoch": 0.08789625360230548, "grad_norm": 1.3185427170913515, "learning_rate": 2.928e-06, "loss": 0.1061, "step": 183 }, { "epoch": 0.08837656099903939, "grad_norm": 1.0037072726769718, "learning_rate": 2.944e-06, "loss": 0.062, "step": 184 }, { "epoch": 0.0888568683957733, "grad_norm": 1.0985735941649346, "learning_rate": 2.96e-06, "loss": 0.0805, "step": 185 }, { "epoch": 0.0893371757925072, "grad_norm": 1.3125670658691977, "learning_rate": 2.976e-06, "loss": 0.0931, "step": 186 }, { "epoch": 0.08981748318924111, "grad_norm": 1.0838144485116263, "learning_rate": 2.9920000000000003e-06, "loss": 0.0834, "step": 187 }, { "epoch": 0.09029779058597502, "grad_norm": 1.799900406869807, "learning_rate": 3.0080000000000003e-06, "loss": 0.0919, "step": 188 }, { "epoch": 0.09077809798270893, "grad_norm": 1.173117411096857, "learning_rate": 3.024e-06, "loss": 0.0721, "step": 189 }, { "epoch": 0.09125840537944284, "grad_norm": 1.373335051247715, "learning_rate": 3.04e-06, "loss": 0.0846, "step": 190 }, { "epoch": 0.09173871277617675, "grad_norm": 1.3981230024483795, "learning_rate": 3.056e-06, "loss": 0.1008, "step": 191 }, { "epoch": 0.09221902017291066, "grad_norm": 1.4513236326035954, "learning_rate": 3.072e-06, "loss": 0.1059, "step": 192 }, { "epoch": 0.09269932756964457, "grad_norm": 1.2186053621072894, "learning_rate": 3.0880000000000003e-06, "loss": 0.09, "step": 193 }, { "epoch": 0.09317963496637849, "grad_norm": 1.1068806010368781, "learning_rate": 3.1040000000000003e-06, "loss": 0.0718, "step": 194 }, { "epoch": 0.0936599423631124, "grad_norm": 1.237536933577925, "learning_rate": 3.12e-06, "loss": 0.0768, "step": 195 }, { "epoch": 0.0941402497598463, "grad_norm": 1.2585614650979184, "learning_rate": 3.136e-06, "loss": 0.0757, "step": 196 }, { "epoch": 0.09462055715658022, "grad_norm": 0.9595246063304191, "learning_rate": 3.152e-06, "loss": 0.0696, "step": 197 }, { "epoch": 0.09510086455331412, "grad_norm": 1.1929913387812179, "learning_rate": 3.1680000000000004e-06, "loss": 0.0867, "step": 198 }, { "epoch": 0.09558117195004803, "grad_norm": 1.0572272850541955, "learning_rate": 3.1840000000000003e-06, "loss": 0.0816, "step": 199 }, { "epoch": 0.09606147934678194, "grad_norm": 1.160822305103066, "learning_rate": 3.2000000000000003e-06, "loss": 0.085, "step": 200 }, { "epoch": 0.09654178674351585, "grad_norm": 1.3295668411317592, "learning_rate": 3.216e-06, "loss": 0.1003, "step": 201 }, { "epoch": 0.09702209414024976, "grad_norm": 1.3117059163085465, "learning_rate": 3.232e-06, "loss": 0.0814, "step": 202 }, { "epoch": 0.09750240153698367, "grad_norm": 1.4613138653883198, "learning_rate": 3.248e-06, "loss": 0.1041, "step": 203 }, { "epoch": 0.09798270893371758, "grad_norm": 1.3385478082564994, "learning_rate": 3.2640000000000004e-06, "loss": 0.0846, "step": 204 }, { "epoch": 0.09846301633045149, "grad_norm": 1.271829712055238, "learning_rate": 3.2800000000000004e-06, "loss": 0.0986, "step": 205 }, { "epoch": 0.0989433237271854, "grad_norm": 1.1382933856150044, "learning_rate": 3.2960000000000003e-06, "loss": 0.0662, "step": 206 }, { "epoch": 0.0994236311239193, "grad_norm": 1.2777612246188284, "learning_rate": 3.3120000000000002e-06, "loss": 0.0944, "step": 207 }, { "epoch": 0.09990393852065321, "grad_norm": 1.2569659513450397, "learning_rate": 3.328e-06, "loss": 0.0688, "step": 208 }, { "epoch": 0.10038424591738712, "grad_norm": 1.2019771035760043, "learning_rate": 3.344e-06, "loss": 0.092, "step": 209 }, { "epoch": 0.10086455331412104, "grad_norm": 1.0705863361886445, "learning_rate": 3.3600000000000004e-06, "loss": 0.085, "step": 210 }, { "epoch": 0.10134486071085495, "grad_norm": 1.1235244108380247, "learning_rate": 3.3760000000000004e-06, "loss": 0.0798, "step": 211 }, { "epoch": 0.10182516810758886, "grad_norm": 1.0830560530059452, "learning_rate": 3.3920000000000003e-06, "loss": 0.0745, "step": 212 }, { "epoch": 0.10230547550432277, "grad_norm": 1.2281697980226327, "learning_rate": 3.4080000000000002e-06, "loss": 0.0822, "step": 213 }, { "epoch": 0.10278578290105668, "grad_norm": 1.0293384591155774, "learning_rate": 3.424e-06, "loss": 0.0699, "step": 214 }, { "epoch": 0.10326609029779059, "grad_norm": 1.1168765812283847, "learning_rate": 3.44e-06, "loss": 0.0903, "step": 215 }, { "epoch": 0.1037463976945245, "grad_norm": 1.2232503580307106, "learning_rate": 3.4560000000000005e-06, "loss": 0.073, "step": 216 }, { "epoch": 0.1042267050912584, "grad_norm": 1.1853245402764723, "learning_rate": 3.4720000000000004e-06, "loss": 0.0804, "step": 217 }, { "epoch": 0.10470701248799232, "grad_norm": 1.0349173303797323, "learning_rate": 3.4880000000000003e-06, "loss": 0.0791, "step": 218 }, { "epoch": 0.10518731988472622, "grad_norm": 0.9123748906672083, "learning_rate": 3.5040000000000002e-06, "loss": 0.0846, "step": 219 }, { "epoch": 0.10566762728146013, "grad_norm": 1.1862034175014786, "learning_rate": 3.52e-06, "loss": 0.0968, "step": 220 }, { "epoch": 0.10614793467819404, "grad_norm": 1.2703272165951818, "learning_rate": 3.5360000000000005e-06, "loss": 0.106, "step": 221 }, { "epoch": 0.10662824207492795, "grad_norm": 0.825924476735835, "learning_rate": 3.5520000000000005e-06, "loss": 0.0722, "step": 222 }, { "epoch": 0.10710854947166186, "grad_norm": 1.0687515686341, "learning_rate": 3.5680000000000004e-06, "loss": 0.0716, "step": 223 }, { "epoch": 0.10758885686839577, "grad_norm": 1.0805753001019858, "learning_rate": 3.5840000000000003e-06, "loss": 0.0691, "step": 224 }, { "epoch": 0.10806916426512968, "grad_norm": 1.6287239686502346, "learning_rate": 3.6000000000000003e-06, "loss": 0.067, "step": 225 }, { "epoch": 0.10854947166186359, "grad_norm": 1.9814573884606366, "learning_rate": 3.616e-06, "loss": 0.0752, "step": 226 }, { "epoch": 0.10902977905859751, "grad_norm": 1.3374463525520308, "learning_rate": 3.6320000000000005e-06, "loss": 0.0876, "step": 227 }, { "epoch": 0.10951008645533142, "grad_norm": 1.452473041749981, "learning_rate": 3.6480000000000005e-06, "loss": 0.0788, "step": 228 }, { "epoch": 0.10999039385206533, "grad_norm": 0.9767761821599609, "learning_rate": 3.6640000000000004e-06, "loss": 0.059, "step": 229 }, { "epoch": 0.11047070124879924, "grad_norm": 2.0951330953014837, "learning_rate": 3.6800000000000003e-06, "loss": 0.1257, "step": 230 }, { "epoch": 0.11095100864553314, "grad_norm": 1.4760857161454637, "learning_rate": 3.6960000000000003e-06, "loss": 0.0694, "step": 231 }, { "epoch": 0.11143131604226705, "grad_norm": 1.426490577687766, "learning_rate": 3.712e-06, "loss": 0.078, "step": 232 }, { "epoch": 0.11191162343900096, "grad_norm": 1.049787450994003, "learning_rate": 3.7280000000000006e-06, "loss": 0.0679, "step": 233 }, { "epoch": 0.11239193083573487, "grad_norm": 0.8796776572576731, "learning_rate": 3.7440000000000005e-06, "loss": 0.0607, "step": 234 }, { "epoch": 0.11287223823246878, "grad_norm": 1.3931219386969078, "learning_rate": 3.7600000000000004e-06, "loss": 0.102, "step": 235 }, { "epoch": 0.11335254562920269, "grad_norm": 1.393636110259964, "learning_rate": 3.7760000000000004e-06, "loss": 0.111, "step": 236 }, { "epoch": 0.1138328530259366, "grad_norm": 0.9780090176809716, "learning_rate": 3.7920000000000003e-06, "loss": 0.0947, "step": 237 }, { "epoch": 0.1143131604226705, "grad_norm": 1.3598074761781376, "learning_rate": 3.8080000000000006e-06, "loss": 0.1047, "step": 238 }, { "epoch": 0.11479346781940442, "grad_norm": 1.413503721034128, "learning_rate": 3.824e-06, "loss": 0.0731, "step": 239 }, { "epoch": 0.11527377521613832, "grad_norm": 1.0636156845849574, "learning_rate": 3.8400000000000005e-06, "loss": 0.0947, "step": 240 }, { "epoch": 0.11575408261287223, "grad_norm": 1.214860626930102, "learning_rate": 3.856e-06, "loss": 0.0652, "step": 241 }, { "epoch": 0.11623439000960614, "grad_norm": 1.203838132160498, "learning_rate": 3.872e-06, "loss": 0.0772, "step": 242 }, { "epoch": 0.11671469740634005, "grad_norm": 1.4337623413545681, "learning_rate": 3.888e-06, "loss": 0.1038, "step": 243 }, { "epoch": 0.11719500480307397, "grad_norm": 1.8506896303135723, "learning_rate": 3.904e-06, "loss": 0.0974, "step": 244 }, { "epoch": 0.11767531219980788, "grad_norm": 1.0209624715713772, "learning_rate": 3.920000000000001e-06, "loss": 0.0668, "step": 245 }, { "epoch": 0.11815561959654179, "grad_norm": 1.0430831232311266, "learning_rate": 3.936e-06, "loss": 0.0693, "step": 246 }, { "epoch": 0.1186359269932757, "grad_norm": 0.9781386213920611, "learning_rate": 3.9520000000000004e-06, "loss": 0.0922, "step": 247 }, { "epoch": 0.11911623439000961, "grad_norm": 1.3090670400806688, "learning_rate": 3.968e-06, "loss": 0.0697, "step": 248 }, { "epoch": 0.11959654178674352, "grad_norm": 1.6565002477397985, "learning_rate": 3.984e-06, "loss": 0.1017, "step": 249 }, { "epoch": 0.12007684918347743, "grad_norm": 1.013045916484816, "learning_rate": 4.000000000000001e-06, "loss": 0.0601, "step": 250 }, { "epoch": 0.12055715658021134, "grad_norm": 0.9922526880061665, "learning_rate": 4.016e-06, "loss": 0.0567, "step": 251 }, { "epoch": 0.12103746397694524, "grad_norm": 1.1410675081208677, "learning_rate": 4.0320000000000005e-06, "loss": 0.0766, "step": 252 }, { "epoch": 0.12151777137367915, "grad_norm": 1.3175828458287309, "learning_rate": 4.048e-06, "loss": 0.1039, "step": 253 }, { "epoch": 0.12199807877041306, "grad_norm": 1.3696186687199499, "learning_rate": 4.064e-06, "loss": 0.068, "step": 254 }, { "epoch": 0.12247838616714697, "grad_norm": 1.5050717359950978, "learning_rate": 4.08e-06, "loss": 0.0797, "step": 255 }, { "epoch": 0.12295869356388088, "grad_norm": 1.3400957977202135, "learning_rate": 4.096e-06, "loss": 0.0999, "step": 256 }, { "epoch": 0.12343900096061479, "grad_norm": 1.2617471427133966, "learning_rate": 4.112000000000001e-06, "loss": 0.0822, "step": 257 }, { "epoch": 0.1239193083573487, "grad_norm": 1.267747053897931, "learning_rate": 4.128e-06, "loss": 0.0773, "step": 258 }, { "epoch": 0.12439961575408261, "grad_norm": 1.2676946090954653, "learning_rate": 4.1440000000000005e-06, "loss": 0.0754, "step": 259 }, { "epoch": 0.12487992315081652, "grad_norm": 1.2292713163126525, "learning_rate": 4.16e-06, "loss": 0.09, "step": 260 }, { "epoch": 0.12536023054755044, "grad_norm": 1.0012628385996274, "learning_rate": 4.176e-06, "loss": 0.0697, "step": 261 }, { "epoch": 0.12584053794428435, "grad_norm": 1.3874865197803345, "learning_rate": 4.192000000000001e-06, "loss": 0.0932, "step": 262 }, { "epoch": 0.12632084534101826, "grad_norm": 1.177205882683919, "learning_rate": 4.208e-06, "loss": 0.0664, "step": 263 }, { "epoch": 0.12680115273775217, "grad_norm": 1.2350489439494747, "learning_rate": 4.2240000000000006e-06, "loss": 0.0898, "step": 264 }, { "epoch": 0.12728146013448607, "grad_norm": 1.1559020122519823, "learning_rate": 4.24e-06, "loss": 0.0786, "step": 265 }, { "epoch": 0.12776176753121998, "grad_norm": 1.3539373299195447, "learning_rate": 4.256e-06, "loss": 0.0728, "step": 266 }, { "epoch": 0.1282420749279539, "grad_norm": 1.2165821416393527, "learning_rate": 4.272000000000001e-06, "loss": 0.0665, "step": 267 }, { "epoch": 0.1287223823246878, "grad_norm": 0.8998352503566164, "learning_rate": 4.288e-06, "loss": 0.0572, "step": 268 }, { "epoch": 0.1292026897214217, "grad_norm": 1.332170024000591, "learning_rate": 4.304000000000001e-06, "loss": 0.0896, "step": 269 }, { "epoch": 0.12968299711815562, "grad_norm": 0.8702695667096485, "learning_rate": 4.32e-06, "loss": 0.0579, "step": 270 }, { "epoch": 0.13016330451488953, "grad_norm": 1.1346194333159956, "learning_rate": 4.3360000000000005e-06, "loss": 0.0809, "step": 271 }, { "epoch": 0.13064361191162344, "grad_norm": 1.3242195848254044, "learning_rate": 4.352e-06, "loss": 0.0953, "step": 272 }, { "epoch": 0.13112391930835735, "grad_norm": 0.947922780884019, "learning_rate": 4.368e-06, "loss": 0.0726, "step": 273 }, { "epoch": 0.13160422670509125, "grad_norm": 0.8590229026058185, "learning_rate": 4.384000000000001e-06, "loss": 0.0859, "step": 274 }, { "epoch": 0.13208453410182516, "grad_norm": 2.0364307950864897, "learning_rate": 4.4e-06, "loss": 0.1065, "step": 275 }, { "epoch": 0.13256484149855907, "grad_norm": 0.9495154556473616, "learning_rate": 4.416000000000001e-06, "loss": 0.0539, "step": 276 }, { "epoch": 0.13304514889529298, "grad_norm": 1.274982611643209, "learning_rate": 4.432e-06, "loss": 0.0847, "step": 277 }, { "epoch": 0.1335254562920269, "grad_norm": 1.0179278688325417, "learning_rate": 4.4480000000000004e-06, "loss": 0.0781, "step": 278 }, { "epoch": 0.1340057636887608, "grad_norm": 1.0777338379659434, "learning_rate": 4.464000000000001e-06, "loss": 0.0823, "step": 279 }, { "epoch": 0.1344860710854947, "grad_norm": 1.7837833704627275, "learning_rate": 4.48e-06, "loss": 0.0809, "step": 280 }, { "epoch": 0.13496637848222862, "grad_norm": 1.1491936699966703, "learning_rate": 4.496000000000001e-06, "loss": 0.0726, "step": 281 }, { "epoch": 0.13544668587896252, "grad_norm": 1.1891782145869991, "learning_rate": 4.512e-06, "loss": 0.0872, "step": 282 }, { "epoch": 0.13592699327569643, "grad_norm": 0.8682236854456676, "learning_rate": 4.5280000000000005e-06, "loss": 0.0739, "step": 283 }, { "epoch": 0.13640730067243034, "grad_norm": 2.941844018714875, "learning_rate": 4.544000000000001e-06, "loss": 0.0862, "step": 284 }, { "epoch": 0.13688760806916425, "grad_norm": 1.1758035570138663, "learning_rate": 4.56e-06, "loss": 0.094, "step": 285 }, { "epoch": 0.1373679154658982, "grad_norm": 1.213798829069968, "learning_rate": 4.576000000000001e-06, "loss": 0.0582, "step": 286 }, { "epoch": 0.1378482228626321, "grad_norm": 1.1521708031630107, "learning_rate": 4.592e-06, "loss": 0.0961, "step": 287 }, { "epoch": 0.138328530259366, "grad_norm": 1.5167984093098619, "learning_rate": 4.608000000000001e-06, "loss": 0.0886, "step": 288 }, { "epoch": 0.13880883765609991, "grad_norm": 1.334195306364946, "learning_rate": 4.624e-06, "loss": 0.0926, "step": 289 }, { "epoch": 0.13928914505283382, "grad_norm": 1.201794873892803, "learning_rate": 4.6400000000000005e-06, "loss": 0.0714, "step": 290 }, { "epoch": 0.13976945244956773, "grad_norm": 1.1161034977945354, "learning_rate": 4.656000000000001e-06, "loss": 0.0835, "step": 291 }, { "epoch": 0.14024975984630164, "grad_norm": 1.1187497922543908, "learning_rate": 4.672e-06, "loss": 0.0768, "step": 292 }, { "epoch": 0.14073006724303555, "grad_norm": 1.135592941087255, "learning_rate": 4.688000000000001e-06, "loss": 0.0987, "step": 293 }, { "epoch": 0.14121037463976946, "grad_norm": 1.0116456228936113, "learning_rate": 4.704e-06, "loss": 0.0868, "step": 294 }, { "epoch": 0.14169068203650337, "grad_norm": 0.8988345290651037, "learning_rate": 4.7200000000000005e-06, "loss": 0.0729, "step": 295 }, { "epoch": 0.14217098943323728, "grad_norm": 1.4920548715651283, "learning_rate": 4.736000000000001e-06, "loss": 0.0689, "step": 296 }, { "epoch": 0.14265129682997119, "grad_norm": 1.1491177326584943, "learning_rate": 4.752e-06, "loss": 0.0822, "step": 297 }, { "epoch": 0.1431316042267051, "grad_norm": 1.3710614031545738, "learning_rate": 4.768000000000001e-06, "loss": 0.1054, "step": 298 }, { "epoch": 0.143611911623439, "grad_norm": 0.798935770934357, "learning_rate": 4.784e-06, "loss": 0.0609, "step": 299 }, { "epoch": 0.1440922190201729, "grad_norm": 1.0553443465463967, "learning_rate": 4.800000000000001e-06, "loss": 0.0588, "step": 300 }, { "epoch": 0.14457252641690682, "grad_norm": 0.862660611904342, "learning_rate": 4.816e-06, "loss": 0.0607, "step": 301 }, { "epoch": 0.14505283381364073, "grad_norm": 1.0888178319055062, "learning_rate": 4.8320000000000005e-06, "loss": 0.0888, "step": 302 }, { "epoch": 0.14553314121037464, "grad_norm": 0.8383673824701234, "learning_rate": 4.848000000000001e-06, "loss": 0.0474, "step": 303 }, { "epoch": 0.14601344860710855, "grad_norm": 0.946622673839637, "learning_rate": 4.864e-06, "loss": 0.0644, "step": 304 }, { "epoch": 0.14649375600384246, "grad_norm": 0.9082218680745726, "learning_rate": 4.880000000000001e-06, "loss": 0.0639, "step": 305 }, { "epoch": 0.14697406340057637, "grad_norm": 1.3359634218439083, "learning_rate": 4.896e-06, "loss": 0.0808, "step": 306 }, { "epoch": 0.14745437079731027, "grad_norm": 2.4941362749992577, "learning_rate": 4.9120000000000006e-06, "loss": 0.1029, "step": 307 }, { "epoch": 0.14793467819404418, "grad_norm": 0.8768474072222495, "learning_rate": 4.928000000000001e-06, "loss": 0.0603, "step": 308 }, { "epoch": 0.1484149855907781, "grad_norm": 1.0191777482177848, "learning_rate": 4.9440000000000004e-06, "loss": 0.1195, "step": 309 }, { "epoch": 0.148895292987512, "grad_norm": 1.3776289338364824, "learning_rate": 4.960000000000001e-06, "loss": 0.0929, "step": 310 }, { "epoch": 0.1493756003842459, "grad_norm": 1.118324563424117, "learning_rate": 4.976e-06, "loss": 0.0994, "step": 311 }, { "epoch": 0.14985590778097982, "grad_norm": 0.94374520838527, "learning_rate": 4.992e-06, "loss": 0.0641, "step": 312 }, { "epoch": 0.15033621517771373, "grad_norm": 1.2295166440217464, "learning_rate": 5.008000000000001e-06, "loss": 0.0764, "step": 313 }, { "epoch": 0.15081652257444764, "grad_norm": 1.0038813920972378, "learning_rate": 5.024e-06, "loss": 0.0894, "step": 314 }, { "epoch": 0.15129682997118155, "grad_norm": 0.9564388991346412, "learning_rate": 5.04e-06, "loss": 0.0646, "step": 315 }, { "epoch": 0.15177713736791545, "grad_norm": 1.001140276754152, "learning_rate": 5.056000000000001e-06, "loss": 0.0665, "step": 316 }, { "epoch": 0.15225744476464936, "grad_norm": 1.6640587209098594, "learning_rate": 5.072e-06, "loss": 0.0996, "step": 317 }, { "epoch": 0.15273775216138327, "grad_norm": 0.9225089106833559, "learning_rate": 5.088000000000001e-06, "loss": 0.0636, "step": 318 }, { "epoch": 0.15321805955811718, "grad_norm": 0.8945005100522845, "learning_rate": 5.104e-06, "loss": 0.0769, "step": 319 }, { "epoch": 0.15369836695485112, "grad_norm": 1.7987667845062436, "learning_rate": 5.12e-06, "loss": 0.1081, "step": 320 }, { "epoch": 0.15417867435158503, "grad_norm": 1.255685447274647, "learning_rate": 5.136e-06, "loss": 0.0614, "step": 321 }, { "epoch": 0.15465898174831894, "grad_norm": 0.9074627196357872, "learning_rate": 5.152e-06, "loss": 0.0713, "step": 322 }, { "epoch": 0.15513928914505284, "grad_norm": 0.8841836941406669, "learning_rate": 5.168000000000001e-06, "loss": 0.0644, "step": 323 }, { "epoch": 0.15561959654178675, "grad_norm": 1.119142780438519, "learning_rate": 5.184e-06, "loss": 0.1067, "step": 324 }, { "epoch": 0.15609990393852066, "grad_norm": 0.7376552219638878, "learning_rate": 5.2e-06, "loss": 0.0584, "step": 325 }, { "epoch": 0.15658021133525457, "grad_norm": 0.7093197189610518, "learning_rate": 5.216e-06, "loss": 0.067, "step": 326 }, { "epoch": 0.15706051873198848, "grad_norm": 0.9146977097269845, "learning_rate": 5.232e-06, "loss": 0.0878, "step": 327 }, { "epoch": 0.1575408261287224, "grad_norm": 0.893881736147892, "learning_rate": 5.248000000000001e-06, "loss": 0.0749, "step": 328 }, { "epoch": 0.1580211335254563, "grad_norm": 1.0133364596640484, "learning_rate": 5.264e-06, "loss": 0.0669, "step": 329 }, { "epoch": 0.1585014409221902, "grad_norm": 1.0188513118607696, "learning_rate": 5.28e-06, "loss": 0.0744, "step": 330 }, { "epoch": 0.15898174831892412, "grad_norm": 1.3769922031431308, "learning_rate": 5.296e-06, "loss": 0.0804, "step": 331 }, { "epoch": 0.15946205571565802, "grad_norm": 1.0924350131956813, "learning_rate": 5.312e-06, "loss": 0.073, "step": 332 }, { "epoch": 0.15994236311239193, "grad_norm": 1.1474380032242861, "learning_rate": 5.328000000000001e-06, "loss": 0.0748, "step": 333 }, { "epoch": 0.16042267050912584, "grad_norm": 1.2007135376136377, "learning_rate": 5.344e-06, "loss": 0.0633, "step": 334 }, { "epoch": 0.16090297790585975, "grad_norm": 0.8615599372907458, "learning_rate": 5.36e-06, "loss": 0.0565, "step": 335 }, { "epoch": 0.16138328530259366, "grad_norm": 1.239088822730045, "learning_rate": 5.376e-06, "loss": 0.0774, "step": 336 }, { "epoch": 0.16186359269932757, "grad_norm": 1.2276765585158387, "learning_rate": 5.392e-06, "loss": 0.0798, "step": 337 }, { "epoch": 0.16234390009606148, "grad_norm": 1.4151473611051535, "learning_rate": 5.408e-06, "loss": 0.0946, "step": 338 }, { "epoch": 0.1628242074927954, "grad_norm": 1.55213536433148, "learning_rate": 5.424e-06, "loss": 0.081, "step": 339 }, { "epoch": 0.1633045148895293, "grad_norm": 1.2190236388499174, "learning_rate": 5.4400000000000004e-06, "loss": 0.1009, "step": 340 }, { "epoch": 0.1637848222862632, "grad_norm": 0.9398509713441388, "learning_rate": 5.456e-06, "loss": 0.0597, "step": 341 }, { "epoch": 0.1642651296829971, "grad_norm": 2.110512888312876, "learning_rate": 5.472e-06, "loss": 0.1204, "step": 342 }, { "epoch": 0.16474543707973102, "grad_norm": 1.05455486733766, "learning_rate": 5.488e-06, "loss": 0.0927, "step": 343 }, { "epoch": 0.16522574447646493, "grad_norm": 0.6714424760276676, "learning_rate": 5.504e-06, "loss": 0.0605, "step": 344 }, { "epoch": 0.16570605187319884, "grad_norm": 0.8868751693597045, "learning_rate": 5.5200000000000005e-06, "loss": 0.0592, "step": 345 }, { "epoch": 0.16618635926993275, "grad_norm": 1.3695330733027493, "learning_rate": 5.536e-06, "loss": 0.0836, "step": 346 }, { "epoch": 0.16666666666666666, "grad_norm": 0.6064272072499131, "learning_rate": 5.552e-06, "loss": 0.051, "step": 347 }, { "epoch": 0.16714697406340057, "grad_norm": 1.1286927064113377, "learning_rate": 5.568e-06, "loss": 0.0813, "step": 348 }, { "epoch": 0.16762728146013448, "grad_norm": 2.9709430641912284, "learning_rate": 5.584e-06, "loss": 0.0718, "step": 349 }, { "epoch": 0.16810758885686838, "grad_norm": 1.1789706938875752, "learning_rate": 5.600000000000001e-06, "loss": 0.0912, "step": 350 }, { "epoch": 0.1685878962536023, "grad_norm": 1.0699393776208648, "learning_rate": 5.616e-06, "loss": 0.0832, "step": 351 }, { "epoch": 0.1690682036503362, "grad_norm": 1.4723575935028883, "learning_rate": 5.6320000000000005e-06, "loss": 0.0807, "step": 352 }, { "epoch": 0.16954851104707014, "grad_norm": 0.9417323601320572, "learning_rate": 5.648e-06, "loss": 0.0745, "step": 353 }, { "epoch": 0.17002881844380405, "grad_norm": 1.0015388420248077, "learning_rate": 5.664e-06, "loss": 0.0742, "step": 354 }, { "epoch": 0.17050912584053796, "grad_norm": 0.846437738871243, "learning_rate": 5.68e-06, "loss": 0.0738, "step": 355 }, { "epoch": 0.17098943323727187, "grad_norm": 1.0340220342143194, "learning_rate": 5.696e-06, "loss": 0.1003, "step": 356 }, { "epoch": 0.17146974063400577, "grad_norm": 1.050210067042245, "learning_rate": 5.7120000000000005e-06, "loss": 0.0944, "step": 357 }, { "epoch": 0.17195004803073968, "grad_norm": 1.1441788701822333, "learning_rate": 5.728e-06, "loss": 0.0935, "step": 358 }, { "epoch": 0.1724303554274736, "grad_norm": 0.8578595357010815, "learning_rate": 5.744e-06, "loss": 0.065, "step": 359 }, { "epoch": 0.1729106628242075, "grad_norm": 1.0827485015540093, "learning_rate": 5.76e-06, "loss": 0.061, "step": 360 }, { "epoch": 0.1733909702209414, "grad_norm": 0.9247919383729161, "learning_rate": 5.776e-06, "loss": 0.0566, "step": 361 }, { "epoch": 0.17387127761767532, "grad_norm": 0.9359918484086721, "learning_rate": 5.792000000000001e-06, "loss": 0.0918, "step": 362 }, { "epoch": 0.17435158501440923, "grad_norm": 0.6461989395520733, "learning_rate": 5.808e-06, "loss": 0.0596, "step": 363 }, { "epoch": 0.17483189241114314, "grad_norm": 1.2658049954240356, "learning_rate": 5.8240000000000005e-06, "loss": 0.0921, "step": 364 }, { "epoch": 0.17531219980787704, "grad_norm": 1.3185639772784372, "learning_rate": 5.84e-06, "loss": 0.0949, "step": 365 }, { "epoch": 0.17579250720461095, "grad_norm": 1.147542350202481, "learning_rate": 5.856e-06, "loss": 0.0644, "step": 366 }, { "epoch": 0.17627281460134486, "grad_norm": 1.2428899445451909, "learning_rate": 5.872000000000001e-06, "loss": 0.0923, "step": 367 }, { "epoch": 0.17675312199807877, "grad_norm": 1.0123606541011576, "learning_rate": 5.888e-06, "loss": 0.062, "step": 368 }, { "epoch": 0.17723342939481268, "grad_norm": 1.2676040664006363, "learning_rate": 5.9040000000000006e-06, "loss": 0.0993, "step": 369 }, { "epoch": 0.1777137367915466, "grad_norm": 1.7674197429016132, "learning_rate": 5.92e-06, "loss": 0.1028, "step": 370 }, { "epoch": 0.1781940441882805, "grad_norm": 1.3206360765520475, "learning_rate": 5.9360000000000004e-06, "loss": 0.0909, "step": 371 }, { "epoch": 0.1786743515850144, "grad_norm": 1.397633543771018, "learning_rate": 5.952e-06, "loss": 0.0922, "step": 372 }, { "epoch": 0.17915465898174832, "grad_norm": 1.4356961352059268, "learning_rate": 5.968e-06, "loss": 0.0789, "step": 373 }, { "epoch": 0.17963496637848222, "grad_norm": 1.0525284488409845, "learning_rate": 5.984000000000001e-06, "loss": 0.0883, "step": 374 }, { "epoch": 0.18011527377521613, "grad_norm": 1.6818878970224407, "learning_rate": 6e-06, "loss": 0.0957, "step": 375 }, { "epoch": 0.18059558117195004, "grad_norm": 1.0922291599097236, "learning_rate": 6.0160000000000005e-06, "loss": 0.0648, "step": 376 }, { "epoch": 0.18107588856868395, "grad_norm": 0.9542123025634616, "learning_rate": 6.032e-06, "loss": 0.0799, "step": 377 }, { "epoch": 0.18155619596541786, "grad_norm": 1.173846664966484, "learning_rate": 6.048e-06, "loss": 0.0663, "step": 378 }, { "epoch": 0.18203650336215177, "grad_norm": 0.8431515012151173, "learning_rate": 6.064000000000001e-06, "loss": 0.0746, "step": 379 }, { "epoch": 0.18251681075888568, "grad_norm": 0.8101682237804423, "learning_rate": 6.08e-06, "loss": 0.0692, "step": 380 }, { "epoch": 0.1829971181556196, "grad_norm": 0.8310188133668261, "learning_rate": 6.096000000000001e-06, "loss": 0.072, "step": 381 }, { "epoch": 0.1834774255523535, "grad_norm": 1.275506422840803, "learning_rate": 6.112e-06, "loss": 0.1017, "step": 382 }, { "epoch": 0.1839577329490874, "grad_norm": 1.2320868444882946, "learning_rate": 6.1280000000000005e-06, "loss": 0.0851, "step": 383 }, { "epoch": 0.1844380403458213, "grad_norm": 0.9892285232343796, "learning_rate": 6.144e-06, "loss": 0.0902, "step": 384 }, { "epoch": 0.18491834774255522, "grad_norm": 1.2195514742095586, "learning_rate": 6.16e-06, "loss": 0.0784, "step": 385 }, { "epoch": 0.18539865513928913, "grad_norm": 0.7206845127032119, "learning_rate": 6.176000000000001e-06, "loss": 0.0486, "step": 386 }, { "epoch": 0.18587896253602307, "grad_norm": 0.9671084909417331, "learning_rate": 6.192e-06, "loss": 0.0772, "step": 387 }, { "epoch": 0.18635926993275698, "grad_norm": 1.0054776144188855, "learning_rate": 6.2080000000000005e-06, "loss": 0.0693, "step": 388 }, { "epoch": 0.18683957732949089, "grad_norm": 0.9245309107680109, "learning_rate": 6.224e-06, "loss": 0.0739, "step": 389 }, { "epoch": 0.1873198847262248, "grad_norm": 1.0103439605117195, "learning_rate": 6.24e-06, "loss": 0.0853, "step": 390 }, { "epoch": 0.1878001921229587, "grad_norm": 0.7676096813994673, "learning_rate": 6.256000000000001e-06, "loss": 0.0765, "step": 391 }, { "epoch": 0.1882804995196926, "grad_norm": 0.9670272222035949, "learning_rate": 6.272e-06, "loss": 0.073, "step": 392 }, { "epoch": 0.18876080691642652, "grad_norm": 1.5221890168980567, "learning_rate": 6.288000000000001e-06, "loss": 0.0796, "step": 393 }, { "epoch": 0.18924111431316043, "grad_norm": 0.7006607860700146, "learning_rate": 6.304e-06, "loss": 0.0626, "step": 394 }, { "epoch": 0.18972142170989434, "grad_norm": 1.5246564977183437, "learning_rate": 6.3200000000000005e-06, "loss": 0.0722, "step": 395 }, { "epoch": 0.19020172910662825, "grad_norm": 1.9379386031947312, "learning_rate": 6.336000000000001e-06, "loss": 0.0807, "step": 396 }, { "epoch": 0.19068203650336216, "grad_norm": 1.1381971297694942, "learning_rate": 6.352e-06, "loss": 0.0882, "step": 397 }, { "epoch": 0.19116234390009607, "grad_norm": 1.051013291042444, "learning_rate": 6.368000000000001e-06, "loss": 0.0742, "step": 398 }, { "epoch": 0.19164265129682997, "grad_norm": 0.8171391283247772, "learning_rate": 6.384e-06, "loss": 0.055, "step": 399 }, { "epoch": 0.19212295869356388, "grad_norm": 0.9833424794189063, "learning_rate": 6.4000000000000006e-06, "loss": 0.0681, "step": 400 }, { "epoch": 0.1926032660902978, "grad_norm": 1.6364622173487873, "learning_rate": 6.416e-06, "loss": 0.0785, "step": 401 }, { "epoch": 0.1930835734870317, "grad_norm": 1.0615403955397404, "learning_rate": 6.432e-06, "loss": 0.0991, "step": 402 }, { "epoch": 0.1935638808837656, "grad_norm": 0.7691979544920202, "learning_rate": 6.448000000000001e-06, "loss": 0.0748, "step": 403 }, { "epoch": 0.19404418828049952, "grad_norm": 1.1926569318031923, "learning_rate": 6.464e-06, "loss": 0.0644, "step": 404 }, { "epoch": 0.19452449567723343, "grad_norm": 0.5496991194443854, "learning_rate": 6.480000000000001e-06, "loss": 0.0531, "step": 405 }, { "epoch": 0.19500480307396734, "grad_norm": 0.8326919307353674, "learning_rate": 6.496e-06, "loss": 0.0725, "step": 406 }, { "epoch": 0.19548511047070125, "grad_norm": 1.0612929395385298, "learning_rate": 6.5120000000000005e-06, "loss": 0.0786, "step": 407 }, { "epoch": 0.19596541786743515, "grad_norm": 0.718075701725119, "learning_rate": 6.528000000000001e-06, "loss": 0.0585, "step": 408 }, { "epoch": 0.19644572526416906, "grad_norm": 1.2693919034433871, "learning_rate": 6.544e-06, "loss": 0.0737, "step": 409 }, { "epoch": 0.19692603266090297, "grad_norm": 0.9667230149007086, "learning_rate": 6.560000000000001e-06, "loss": 0.0939, "step": 410 }, { "epoch": 0.19740634005763688, "grad_norm": 0.7618960818418478, "learning_rate": 6.576e-06, "loss": 0.0451, "step": 411 }, { "epoch": 0.1978866474543708, "grad_norm": 0.9832215410415592, "learning_rate": 6.592000000000001e-06, "loss": 0.0671, "step": 412 }, { "epoch": 0.1983669548511047, "grad_norm": 0.757140075019698, "learning_rate": 6.608000000000001e-06, "loss": 0.072, "step": 413 }, { "epoch": 0.1988472622478386, "grad_norm": 0.8925384905186106, "learning_rate": 6.6240000000000004e-06, "loss": 0.0603, "step": 414 }, { "epoch": 0.19932756964457252, "grad_norm": 1.1216976394709526, "learning_rate": 6.640000000000001e-06, "loss": 0.073, "step": 415 }, { "epoch": 0.19980787704130643, "grad_norm": 0.635084003555822, "learning_rate": 6.656e-06, "loss": 0.05, "step": 416 }, { "epoch": 0.20028818443804033, "grad_norm": 0.7700738336691185, "learning_rate": 6.672000000000001e-06, "loss": 0.0723, "step": 417 }, { "epoch": 0.20076849183477424, "grad_norm": 1.2073339284961622, "learning_rate": 6.688e-06, "loss": 0.0789, "step": 418 }, { "epoch": 0.20124879923150815, "grad_norm": 1.0303414284152617, "learning_rate": 6.7040000000000005e-06, "loss": 0.0942, "step": 419 }, { "epoch": 0.2017291066282421, "grad_norm": 1.4496720069236764, "learning_rate": 6.720000000000001e-06, "loss": 0.0934, "step": 420 }, { "epoch": 0.202209414024976, "grad_norm": 1.39926810164135, "learning_rate": 6.736e-06, "loss": 0.0943, "step": 421 }, { "epoch": 0.2026897214217099, "grad_norm": 0.9605472208895933, "learning_rate": 6.752000000000001e-06, "loss": 0.07, "step": 422 }, { "epoch": 0.20317002881844382, "grad_norm": 1.1676905060892429, "learning_rate": 6.768e-06, "loss": 0.0957, "step": 423 }, { "epoch": 0.20365033621517772, "grad_norm": 1.0226460242714408, "learning_rate": 6.784000000000001e-06, "loss": 0.0924, "step": 424 }, { "epoch": 0.20413064361191163, "grad_norm": 0.7672962042725741, "learning_rate": 6.800000000000001e-06, "loss": 0.0731, "step": 425 }, { "epoch": 0.20461095100864554, "grad_norm": 1.1567284889374434, "learning_rate": 6.8160000000000005e-06, "loss": 0.0783, "step": 426 }, { "epoch": 0.20509125840537945, "grad_norm": 1.2828166133488417, "learning_rate": 6.832000000000001e-06, "loss": 0.085, "step": 427 }, { "epoch": 0.20557156580211336, "grad_norm": 0.9948993350398787, "learning_rate": 6.848e-06, "loss": 0.0712, "step": 428 }, { "epoch": 0.20605187319884727, "grad_norm": 1.0013503870526177, "learning_rate": 6.864000000000001e-06, "loss": 0.0725, "step": 429 }, { "epoch": 0.20653218059558118, "grad_norm": 0.828738359422558, "learning_rate": 6.88e-06, "loss": 0.0793, "step": 430 }, { "epoch": 0.2070124879923151, "grad_norm": 1.2375082026111115, "learning_rate": 6.8960000000000006e-06, "loss": 0.0868, "step": 431 }, { "epoch": 0.207492795389049, "grad_norm": 0.9272139309771383, "learning_rate": 6.912000000000001e-06, "loss": 0.0498, "step": 432 }, { "epoch": 0.2079731027857829, "grad_norm": 1.1885661004953512, "learning_rate": 6.928e-06, "loss": 0.0644, "step": 433 }, { "epoch": 0.2084534101825168, "grad_norm": 0.9603791498315314, "learning_rate": 6.944000000000001e-06, "loss": 0.0652, "step": 434 }, { "epoch": 0.20893371757925072, "grad_norm": 1.9606166639543452, "learning_rate": 6.96e-06, "loss": 0.0883, "step": 435 }, { "epoch": 0.20941402497598463, "grad_norm": 1.042036621430657, "learning_rate": 6.976000000000001e-06, "loss": 0.0773, "step": 436 }, { "epoch": 0.20989433237271854, "grad_norm": 0.807705104762043, "learning_rate": 6.992000000000001e-06, "loss": 0.0663, "step": 437 }, { "epoch": 0.21037463976945245, "grad_norm": 0.644829498274957, "learning_rate": 7.0080000000000005e-06, "loss": 0.0532, "step": 438 }, { "epoch": 0.21085494716618636, "grad_norm": 1.3697814725304436, "learning_rate": 7.024000000000001e-06, "loss": 0.0971, "step": 439 }, { "epoch": 0.21133525456292027, "grad_norm": 0.8499179196327713, "learning_rate": 7.04e-06, "loss": 0.06, "step": 440 }, { "epoch": 0.21181556195965417, "grad_norm": 1.1863374964467173, "learning_rate": 7.056000000000001e-06, "loss": 0.0835, "step": 441 }, { "epoch": 0.21229586935638808, "grad_norm": 1.025816763411157, "learning_rate": 7.072000000000001e-06, "loss": 0.0663, "step": 442 }, { "epoch": 0.212776176753122, "grad_norm": 0.7405994529015164, "learning_rate": 7.088000000000001e-06, "loss": 0.0598, "step": 443 }, { "epoch": 0.2132564841498559, "grad_norm": 0.6945613328918526, "learning_rate": 7.104000000000001e-06, "loss": 0.0455, "step": 444 }, { "epoch": 0.2137367915465898, "grad_norm": 1.2263272059064698, "learning_rate": 7.1200000000000004e-06, "loss": 0.0892, "step": 445 }, { "epoch": 0.21421709894332372, "grad_norm": 0.8174840860440725, "learning_rate": 7.136000000000001e-06, "loss": 0.1028, "step": 446 }, { "epoch": 0.21469740634005763, "grad_norm": 0.8623153464251558, "learning_rate": 7.152e-06, "loss": 0.0851, "step": 447 }, { "epoch": 0.21517771373679154, "grad_norm": 0.8772215697627417, "learning_rate": 7.168000000000001e-06, "loss": 0.0631, "step": 448 }, { "epoch": 0.21565802113352545, "grad_norm": 1.1867765756821822, "learning_rate": 7.184000000000001e-06, "loss": 0.071, "step": 449 }, { "epoch": 0.21613832853025935, "grad_norm": 1.100610239534095, "learning_rate": 7.2000000000000005e-06, "loss": 0.0819, "step": 450 }, { "epoch": 0.21661863592699326, "grad_norm": 1.0692001780240707, "learning_rate": 7.216000000000001e-06, "loss": 0.0752, "step": 451 }, { "epoch": 0.21709894332372717, "grad_norm": 1.8257869421442983, "learning_rate": 7.232e-06, "loss": 0.0961, "step": 452 }, { "epoch": 0.21757925072046108, "grad_norm": 0.8717548117040241, "learning_rate": 7.248000000000001e-06, "loss": 0.0705, "step": 453 }, { "epoch": 0.21805955811719502, "grad_norm": 0.679515897488406, "learning_rate": 7.264000000000001e-06, "loss": 0.0595, "step": 454 }, { "epoch": 0.21853986551392893, "grad_norm": 0.895857495998295, "learning_rate": 7.280000000000001e-06, "loss": 0.0897, "step": 455 }, { "epoch": 0.21902017291066284, "grad_norm": 0.8679817923298461, "learning_rate": 7.296000000000001e-06, "loss": 0.0767, "step": 456 }, { "epoch": 0.21950048030739674, "grad_norm": 0.9327609488192921, "learning_rate": 7.3120000000000005e-06, "loss": 0.0806, "step": 457 }, { "epoch": 0.21998078770413065, "grad_norm": 0.5822697098791232, "learning_rate": 7.328000000000001e-06, "loss": 0.0568, "step": 458 }, { "epoch": 0.22046109510086456, "grad_norm": 0.8136477286314635, "learning_rate": 7.344000000000001e-06, "loss": 0.0794, "step": 459 }, { "epoch": 0.22094140249759847, "grad_norm": 0.6332136039248887, "learning_rate": 7.360000000000001e-06, "loss": 0.0576, "step": 460 }, { "epoch": 0.22142170989433238, "grad_norm": 0.9815300986796249, "learning_rate": 7.376000000000001e-06, "loss": 0.0643, "step": 461 }, { "epoch": 0.2219020172910663, "grad_norm": 0.724954981826669, "learning_rate": 7.3920000000000005e-06, "loss": 0.0788, "step": 462 }, { "epoch": 0.2223823246878002, "grad_norm": 0.9480652345961487, "learning_rate": 7.408000000000001e-06, "loss": 0.0818, "step": 463 }, { "epoch": 0.2228626320845341, "grad_norm": 1.3054685818951983, "learning_rate": 7.424e-06, "loss": 0.0919, "step": 464 }, { "epoch": 0.22334293948126802, "grad_norm": 1.0576795770417882, "learning_rate": 7.440000000000001e-06, "loss": 0.0724, "step": 465 }, { "epoch": 0.22382324687800192, "grad_norm": 0.7169105130347482, "learning_rate": 7.456000000000001e-06, "loss": 0.0516, "step": 466 }, { "epoch": 0.22430355427473583, "grad_norm": 0.9174559479694644, "learning_rate": 7.472000000000001e-06, "loss": 0.0465, "step": 467 }, { "epoch": 0.22478386167146974, "grad_norm": 1.2190080584481207, "learning_rate": 7.488000000000001e-06, "loss": 0.0813, "step": 468 }, { "epoch": 0.22526416906820365, "grad_norm": 0.8257736438539969, "learning_rate": 7.5040000000000005e-06, "loss": 0.0665, "step": 469 }, { "epoch": 0.22574447646493756, "grad_norm": 1.005516446834955, "learning_rate": 7.520000000000001e-06, "loss": 0.0591, "step": 470 }, { "epoch": 0.22622478386167147, "grad_norm": 0.9928080490901438, "learning_rate": 7.536000000000001e-06, "loss": 0.0864, "step": 471 }, { "epoch": 0.22670509125840538, "grad_norm": 1.4816159445977666, "learning_rate": 7.552000000000001e-06, "loss": 0.0794, "step": 472 }, { "epoch": 0.2271853986551393, "grad_norm": 1.758118832520925, "learning_rate": 7.568000000000001e-06, "loss": 0.0671, "step": 473 }, { "epoch": 0.2276657060518732, "grad_norm": 1.2359986745074605, "learning_rate": 7.5840000000000006e-06, "loss": 0.0862, "step": 474 }, { "epoch": 0.2281460134486071, "grad_norm": 1.065041005575724, "learning_rate": 7.600000000000001e-06, "loss": 0.0867, "step": 475 }, { "epoch": 0.228626320845341, "grad_norm": 1.1386518653125688, "learning_rate": 7.616000000000001e-06, "loss": 0.0601, "step": 476 }, { "epoch": 0.22910662824207492, "grad_norm": 0.9844587250229394, "learning_rate": 7.632e-06, "loss": 0.0646, "step": 477 }, { "epoch": 0.22958693563880883, "grad_norm": 1.2058332109509622, "learning_rate": 7.648e-06, "loss": 0.0708, "step": 478 }, { "epoch": 0.23006724303554274, "grad_norm": 1.081013183286517, "learning_rate": 7.664e-06, "loss": 0.0701, "step": 479 }, { "epoch": 0.23054755043227665, "grad_norm": 0.9951838717376329, "learning_rate": 7.680000000000001e-06, "loss": 0.0769, "step": 480 }, { "epoch": 0.23102785782901056, "grad_norm": 0.6593691281860705, "learning_rate": 7.696e-06, "loss": 0.0624, "step": 481 }, { "epoch": 0.23150816522574447, "grad_norm": 1.1041552206575123, "learning_rate": 7.712e-06, "loss": 0.0786, "step": 482 }, { "epoch": 0.23198847262247838, "grad_norm": 1.1024525564632526, "learning_rate": 7.728000000000001e-06, "loss": 0.0825, "step": 483 }, { "epoch": 0.23246878001921228, "grad_norm": 0.7562432101013394, "learning_rate": 7.744e-06, "loss": 0.0626, "step": 484 }, { "epoch": 0.2329490874159462, "grad_norm": 0.7159892284435388, "learning_rate": 7.76e-06, "loss": 0.0666, "step": 485 }, { "epoch": 0.2334293948126801, "grad_norm": 0.7637031784203081, "learning_rate": 7.776e-06, "loss": 0.0535, "step": 486 }, { "epoch": 0.23390970220941404, "grad_norm": 1.341218149155002, "learning_rate": 7.792000000000001e-06, "loss": 0.0841, "step": 487 }, { "epoch": 0.23439000960614795, "grad_norm": 1.0381162643143098, "learning_rate": 7.808e-06, "loss": 0.0522, "step": 488 }, { "epoch": 0.23487031700288186, "grad_norm": 1.1102483912786174, "learning_rate": 7.824e-06, "loss": 0.0839, "step": 489 }, { "epoch": 0.23535062439961577, "grad_norm": 1.1434606793180935, "learning_rate": 7.840000000000001e-06, "loss": 0.0691, "step": 490 }, { "epoch": 0.23583093179634967, "grad_norm": 2.1962165048575533, "learning_rate": 7.856e-06, "loss": 0.0763, "step": 491 }, { "epoch": 0.23631123919308358, "grad_norm": 0.890647664698546, "learning_rate": 7.872e-06, "loss": 0.0718, "step": 492 }, { "epoch": 0.2367915465898175, "grad_norm": 0.8203501888126637, "learning_rate": 7.888e-06, "loss": 0.0726, "step": 493 }, { "epoch": 0.2372718539865514, "grad_norm": 1.2395973157128906, "learning_rate": 7.904000000000001e-06, "loss": 0.0898, "step": 494 }, { "epoch": 0.2377521613832853, "grad_norm": 0.9214838558224117, "learning_rate": 7.92e-06, "loss": 0.0657, "step": 495 }, { "epoch": 0.23823246878001922, "grad_norm": 1.0269834431751539, "learning_rate": 7.936e-06, "loss": 0.0807, "step": 496 }, { "epoch": 0.23871277617675313, "grad_norm": 1.020546331564402, "learning_rate": 7.952000000000001e-06, "loss": 0.0756, "step": 497 }, { "epoch": 0.23919308357348704, "grad_norm": 0.8476975296985142, "learning_rate": 7.968e-06, "loss": 0.0643, "step": 498 }, { "epoch": 0.23967339097022095, "grad_norm": 1.1388832811626284, "learning_rate": 7.984e-06, "loss": 0.0849, "step": 499 }, { "epoch": 0.24015369836695485, "grad_norm": 1.0929897122086174, "learning_rate": 8.000000000000001e-06, "loss": 0.0807, "step": 500 }, { "epoch": 0.24063400576368876, "grad_norm": 1.1481077522909218, "learning_rate": 8.016e-06, "loss": 0.0809, "step": 501 }, { "epoch": 0.24111431316042267, "grad_norm": 0.65682707360465, "learning_rate": 8.032e-06, "loss": 0.0654, "step": 502 }, { "epoch": 0.24159462055715658, "grad_norm": 0.8221527194893445, "learning_rate": 8.048e-06, "loss": 0.0723, "step": 503 }, { "epoch": 0.2420749279538905, "grad_norm": 1.0897467045485394, "learning_rate": 8.064000000000001e-06, "loss": 0.0678, "step": 504 }, { "epoch": 0.2425552353506244, "grad_norm": 0.7475058575087895, "learning_rate": 8.08e-06, "loss": 0.0516, "step": 505 }, { "epoch": 0.2430355427473583, "grad_norm": 0.7036037103827971, "learning_rate": 8.096e-06, "loss": 0.0538, "step": 506 }, { "epoch": 0.24351585014409222, "grad_norm": 0.8406021606293904, "learning_rate": 8.112000000000001e-06, "loss": 0.0588, "step": 507 }, { "epoch": 0.24399615754082613, "grad_norm": 0.9956392385750449, "learning_rate": 8.128e-06, "loss": 0.0529, "step": 508 }, { "epoch": 0.24447646493756003, "grad_norm": 1.1472735203009934, "learning_rate": 8.144e-06, "loss": 0.0935, "step": 509 }, { "epoch": 0.24495677233429394, "grad_norm": 1.024997413494147, "learning_rate": 8.16e-06, "loss": 0.0776, "step": 510 }, { "epoch": 0.24543707973102785, "grad_norm": 0.6571287147481466, "learning_rate": 8.176000000000001e-06, "loss": 0.0628, "step": 511 }, { "epoch": 0.24591738712776176, "grad_norm": 1.3063911540302608, "learning_rate": 8.192e-06, "loss": 0.0717, "step": 512 }, { "epoch": 0.24639769452449567, "grad_norm": 1.4230401674267745, "learning_rate": 8.208e-06, "loss": 0.0764, "step": 513 }, { "epoch": 0.24687800192122958, "grad_norm": 0.8048952076569829, "learning_rate": 8.224000000000001e-06, "loss": 0.0571, "step": 514 }, { "epoch": 0.2473583093179635, "grad_norm": 0.6650682219974673, "learning_rate": 8.24e-06, "loss": 0.0565, "step": 515 }, { "epoch": 0.2478386167146974, "grad_norm": 1.2997870431175331, "learning_rate": 8.256e-06, "loss": 0.0837, "step": 516 }, { "epoch": 0.2483189241114313, "grad_norm": 4.337586159697626, "learning_rate": 8.272000000000001e-06, "loss": 0.0467, "step": 517 }, { "epoch": 0.24879923150816521, "grad_norm": 0.8239655704265273, "learning_rate": 8.288000000000001e-06, "loss": 0.0647, "step": 518 }, { "epoch": 0.24927953890489912, "grad_norm": 0.8494250080431497, "learning_rate": 8.304e-06, "loss": 0.0526, "step": 519 }, { "epoch": 0.24975984630163303, "grad_norm": 0.7032539553571172, "learning_rate": 8.32e-06, "loss": 0.0504, "step": 520 }, { "epoch": 0.25024015369836694, "grad_norm": 0.7697835483160133, "learning_rate": 8.336000000000001e-06, "loss": 0.0508, "step": 521 }, { "epoch": 0.2507204610951009, "grad_norm": 0.8341875541080136, "learning_rate": 8.352e-06, "loss": 0.0807, "step": 522 }, { "epoch": 0.25120076849183476, "grad_norm": 0.8726308431077691, "learning_rate": 8.368e-06, "loss": 0.0636, "step": 523 }, { "epoch": 0.2516810758885687, "grad_norm": 0.9286195806386074, "learning_rate": 8.384000000000001e-06, "loss": 0.1007, "step": 524 }, { "epoch": 0.2521613832853026, "grad_norm": 1.0477568055323367, "learning_rate": 8.400000000000001e-06, "loss": 0.0653, "step": 525 }, { "epoch": 0.2526416906820365, "grad_norm": 0.907504117294368, "learning_rate": 8.416e-06, "loss": 0.0687, "step": 526 }, { "epoch": 0.2531219980787704, "grad_norm": 0.6672546812595275, "learning_rate": 8.432e-06, "loss": 0.0833, "step": 527 }, { "epoch": 0.25360230547550433, "grad_norm": 0.8481460299512423, "learning_rate": 8.448000000000001e-06, "loss": 0.069, "step": 528 }, { "epoch": 0.2540826128722382, "grad_norm": 0.7153223957448147, "learning_rate": 8.464e-06, "loss": 0.0685, "step": 529 }, { "epoch": 0.25456292026897215, "grad_norm": 0.6977196547557283, "learning_rate": 8.48e-06, "loss": 0.0642, "step": 530 }, { "epoch": 0.25504322766570603, "grad_norm": 0.8261907792245994, "learning_rate": 8.496000000000001e-06, "loss": 0.0675, "step": 531 }, { "epoch": 0.25552353506243997, "grad_norm": 0.7198555590025666, "learning_rate": 8.512e-06, "loss": 0.0851, "step": 532 }, { "epoch": 0.25600384245917385, "grad_norm": 0.6637599827890456, "learning_rate": 8.528e-06, "loss": 0.0899, "step": 533 }, { "epoch": 0.2564841498559078, "grad_norm": 0.7512929164073144, "learning_rate": 8.544000000000002e-06, "loss": 0.0865, "step": 534 }, { "epoch": 0.25696445725264166, "grad_norm": 1.8750413633075549, "learning_rate": 8.560000000000001e-06, "loss": 0.0611, "step": 535 }, { "epoch": 0.2574447646493756, "grad_norm": 0.8162254344386675, "learning_rate": 8.576e-06, "loss": 0.082, "step": 536 }, { "epoch": 0.2579250720461095, "grad_norm": 0.7407860981655993, "learning_rate": 8.592e-06, "loss": 0.0659, "step": 537 }, { "epoch": 0.2584053794428434, "grad_norm": 0.8614219120120522, "learning_rate": 8.608000000000001e-06, "loss": 0.1007, "step": 538 }, { "epoch": 0.25888568683957736, "grad_norm": 1.2916018437852648, "learning_rate": 8.624e-06, "loss": 0.0711, "step": 539 }, { "epoch": 0.25936599423631124, "grad_norm": 1.0968982870807809, "learning_rate": 8.64e-06, "loss": 0.0494, "step": 540 }, { "epoch": 0.2598463016330452, "grad_norm": 0.8333925089939908, "learning_rate": 8.656000000000001e-06, "loss": 0.0718, "step": 541 }, { "epoch": 0.26032660902977905, "grad_norm": 1.5580346095928794, "learning_rate": 8.672000000000001e-06, "loss": 0.0728, "step": 542 }, { "epoch": 0.260806916426513, "grad_norm": 1.3177727190392534, "learning_rate": 8.688e-06, "loss": 0.068, "step": 543 }, { "epoch": 0.2612872238232469, "grad_norm": 1.1607948509472454, "learning_rate": 8.704e-06, "loss": 0.0691, "step": 544 }, { "epoch": 0.2617675312199808, "grad_norm": 0.9590351041286705, "learning_rate": 8.720000000000001e-06, "loss": 0.0613, "step": 545 }, { "epoch": 0.2622478386167147, "grad_norm": 0.830353272020343, "learning_rate": 8.736e-06, "loss": 0.0661, "step": 546 }, { "epoch": 0.2627281460134486, "grad_norm": 0.8307084029819353, "learning_rate": 8.752e-06, "loss": 0.0557, "step": 547 }, { "epoch": 0.2632084534101825, "grad_norm": 0.8137293430136122, "learning_rate": 8.768000000000001e-06, "loss": 0.0625, "step": 548 }, { "epoch": 0.26368876080691644, "grad_norm": 1.6678230574606077, "learning_rate": 8.784000000000001e-06, "loss": 0.0663, "step": 549 }, { "epoch": 0.2641690682036503, "grad_norm": 0.8131640866998433, "learning_rate": 8.8e-06, "loss": 0.0758, "step": 550 }, { "epoch": 0.26464937560038426, "grad_norm": 0.746830009216251, "learning_rate": 8.816000000000002e-06, "loss": 0.0594, "step": 551 }, { "epoch": 0.26512968299711814, "grad_norm": 1.0612071437391581, "learning_rate": 8.832000000000001e-06, "loss": 0.1148, "step": 552 }, { "epoch": 0.2656099903938521, "grad_norm": 1.0736052555532878, "learning_rate": 8.848e-06, "loss": 0.0795, "step": 553 }, { "epoch": 0.26609029779058596, "grad_norm": 1.7811521568482815, "learning_rate": 8.864e-06, "loss": 0.0708, "step": 554 }, { "epoch": 0.2665706051873199, "grad_norm": 1.1601873615680562, "learning_rate": 8.880000000000001e-06, "loss": 0.0632, "step": 555 }, { "epoch": 0.2670509125840538, "grad_norm": 0.9443323090818715, "learning_rate": 8.896000000000001e-06, "loss": 0.0726, "step": 556 }, { "epoch": 0.2675312199807877, "grad_norm": 0.751021625854935, "learning_rate": 8.912e-06, "loss": 0.0496, "step": 557 }, { "epoch": 0.2680115273775216, "grad_norm": 0.93106805297644, "learning_rate": 8.928000000000002e-06, "loss": 0.0777, "step": 558 }, { "epoch": 0.26849183477425553, "grad_norm": 0.635298715287655, "learning_rate": 8.944000000000001e-06, "loss": 0.059, "step": 559 }, { "epoch": 0.2689721421709894, "grad_norm": 0.8466300533119754, "learning_rate": 8.96e-06, "loss": 0.0788, "step": 560 }, { "epoch": 0.26945244956772335, "grad_norm": 0.7748167130306874, "learning_rate": 8.976e-06, "loss": 0.0599, "step": 561 }, { "epoch": 0.26993275696445723, "grad_norm": 0.7427736105339172, "learning_rate": 8.992000000000001e-06, "loss": 0.0614, "step": 562 }, { "epoch": 0.27041306436119117, "grad_norm": 0.8320001022304129, "learning_rate": 9.008e-06, "loss": 0.0674, "step": 563 }, { "epoch": 0.27089337175792505, "grad_norm": 0.7704318557146501, "learning_rate": 9.024e-06, "loss": 0.0702, "step": 564 }, { "epoch": 0.271373679154659, "grad_norm": 0.7407765317427476, "learning_rate": 9.040000000000002e-06, "loss": 0.0744, "step": 565 }, { "epoch": 0.27185398655139287, "grad_norm": 0.7206956721264949, "learning_rate": 9.056000000000001e-06, "loss": 0.0543, "step": 566 }, { "epoch": 0.2723342939481268, "grad_norm": 0.9451040246693897, "learning_rate": 9.072e-06, "loss": 0.0807, "step": 567 }, { "epoch": 0.2728146013448607, "grad_norm": 0.9009649199487116, "learning_rate": 9.088000000000002e-06, "loss": 0.062, "step": 568 }, { "epoch": 0.2732949087415946, "grad_norm": 1.0113937709014764, "learning_rate": 9.104000000000001e-06, "loss": 0.0648, "step": 569 }, { "epoch": 0.2737752161383285, "grad_norm": 1.0501699409202674, "learning_rate": 9.12e-06, "loss": 0.0537, "step": 570 }, { "epoch": 0.27425552353506244, "grad_norm": 0.5693860840353209, "learning_rate": 9.136e-06, "loss": 0.0448, "step": 571 }, { "epoch": 0.2747358309317964, "grad_norm": 1.0055693738637874, "learning_rate": 9.152000000000001e-06, "loss": 0.0747, "step": 572 }, { "epoch": 0.27521613832853026, "grad_norm": 1.2257150323649562, "learning_rate": 9.168000000000001e-06, "loss": 0.0971, "step": 573 }, { "epoch": 0.2756964457252642, "grad_norm": 0.8489461117348059, "learning_rate": 9.184e-06, "loss": 0.067, "step": 574 }, { "epoch": 0.2761767531219981, "grad_norm": 0.8112579026025699, "learning_rate": 9.200000000000002e-06, "loss": 0.0682, "step": 575 }, { "epoch": 0.276657060518732, "grad_norm": 0.8857859004688043, "learning_rate": 9.216000000000001e-06, "loss": 0.0799, "step": 576 }, { "epoch": 0.2771373679154659, "grad_norm": 0.7523380576301403, "learning_rate": 9.232e-06, "loss": 0.0682, "step": 577 }, { "epoch": 0.27761767531219983, "grad_norm": 0.8161110604162131, "learning_rate": 9.248e-06, "loss": 0.0713, "step": 578 }, { "epoch": 0.2780979827089337, "grad_norm": 1.2335737488024285, "learning_rate": 9.264000000000001e-06, "loss": 0.0976, "step": 579 }, { "epoch": 0.27857829010566765, "grad_norm": 0.8454243600862699, "learning_rate": 9.280000000000001e-06, "loss": 0.0606, "step": 580 }, { "epoch": 0.27905859750240153, "grad_norm": 1.2912157131289221, "learning_rate": 9.296e-06, "loss": 0.0864, "step": 581 }, { "epoch": 0.27953890489913547, "grad_norm": 1.0229081590491909, "learning_rate": 9.312000000000002e-06, "loss": 0.0914, "step": 582 }, { "epoch": 0.28001921229586935, "grad_norm": 1.206253943899943, "learning_rate": 9.328000000000001e-06, "loss": 0.0862, "step": 583 }, { "epoch": 0.2804995196926033, "grad_norm": 0.9742697094872371, "learning_rate": 9.344e-06, "loss": 0.0534, "step": 584 }, { "epoch": 0.28097982708933716, "grad_norm": 1.3994880281576707, "learning_rate": 9.360000000000002e-06, "loss": 0.0942, "step": 585 }, { "epoch": 0.2814601344860711, "grad_norm": 0.7863832094391792, "learning_rate": 9.376000000000001e-06, "loss": 0.0651, "step": 586 }, { "epoch": 0.281940441882805, "grad_norm": 1.790391699352207, "learning_rate": 9.392000000000001e-06, "loss": 0.0837, "step": 587 }, { "epoch": 0.2824207492795389, "grad_norm": 0.7167809240107046, "learning_rate": 9.408e-06, "loss": 0.0641, "step": 588 }, { "epoch": 0.2829010566762728, "grad_norm": 1.3108752667323915, "learning_rate": 9.424000000000002e-06, "loss": 0.0711, "step": 589 }, { "epoch": 0.28338136407300674, "grad_norm": 0.752914709706173, "learning_rate": 9.440000000000001e-06, "loss": 0.0584, "step": 590 }, { "epoch": 0.2838616714697406, "grad_norm": 0.8764860102695597, "learning_rate": 9.456e-06, "loss": 0.0477, "step": 591 }, { "epoch": 0.28434197886647455, "grad_norm": 0.6356131868385272, "learning_rate": 9.472000000000002e-06, "loss": 0.0522, "step": 592 }, { "epoch": 0.28482228626320844, "grad_norm": 1.263517402410837, "learning_rate": 9.488000000000001e-06, "loss": 0.0873, "step": 593 }, { "epoch": 0.28530259365994237, "grad_norm": 0.6481285314183274, "learning_rate": 9.504e-06, "loss": 0.0594, "step": 594 }, { "epoch": 0.28578290105667625, "grad_norm": 0.6520090713036953, "learning_rate": 9.52e-06, "loss": 0.0574, "step": 595 }, { "epoch": 0.2862632084534102, "grad_norm": 1.2576547381797791, "learning_rate": 9.536000000000002e-06, "loss": 0.0782, "step": 596 }, { "epoch": 0.28674351585014407, "grad_norm": 1.0281958680840946, "learning_rate": 9.552000000000001e-06, "loss": 0.0884, "step": 597 }, { "epoch": 0.287223823246878, "grad_norm": 2.0780498256230904, "learning_rate": 9.568e-06, "loss": 0.0686, "step": 598 }, { "epoch": 0.2877041306436119, "grad_norm": 0.7251320029199013, "learning_rate": 9.584000000000002e-06, "loss": 0.0667, "step": 599 }, { "epoch": 0.2881844380403458, "grad_norm": 0.9843387438340672, "learning_rate": 9.600000000000001e-06, "loss": 0.0696, "step": 600 }, { "epoch": 0.2886647454370797, "grad_norm": 1.5091693125190424, "learning_rate": 9.616e-06, "loss": 0.11, "step": 601 }, { "epoch": 0.28914505283381364, "grad_norm": 0.7606667303666816, "learning_rate": 9.632e-06, "loss": 0.0645, "step": 602 }, { "epoch": 0.2896253602305475, "grad_norm": 0.8702218966486639, "learning_rate": 9.648000000000001e-06, "loss": 0.0695, "step": 603 }, { "epoch": 0.29010566762728146, "grad_norm": 0.9692656471462159, "learning_rate": 9.664000000000001e-06, "loss": 0.071, "step": 604 }, { "epoch": 0.2905859750240154, "grad_norm": 1.152104986956867, "learning_rate": 9.68e-06, "loss": 0.0621, "step": 605 }, { "epoch": 0.2910662824207493, "grad_norm": 0.7344517356138976, "learning_rate": 9.696000000000002e-06, "loss": 0.0485, "step": 606 }, { "epoch": 0.2915465898174832, "grad_norm": 0.9928918365561824, "learning_rate": 9.712e-06, "loss": 0.0719, "step": 607 }, { "epoch": 0.2920268972142171, "grad_norm": 0.7667129001690043, "learning_rate": 9.728e-06, "loss": 0.0679, "step": 608 }, { "epoch": 0.29250720461095103, "grad_norm": 0.9950000199696363, "learning_rate": 9.744000000000002e-06, "loss": 0.0723, "step": 609 }, { "epoch": 0.2929875120076849, "grad_norm": 1.4085105451496738, "learning_rate": 9.760000000000001e-06, "loss": 0.0798, "step": 610 }, { "epoch": 0.29346781940441885, "grad_norm": 0.849429025717193, "learning_rate": 9.776000000000001e-06, "loss": 0.0573, "step": 611 }, { "epoch": 0.29394812680115273, "grad_norm": 0.8950057496274839, "learning_rate": 9.792e-06, "loss": 0.0718, "step": 612 }, { "epoch": 0.29442843419788667, "grad_norm": 0.728848993249414, "learning_rate": 9.808000000000002e-06, "loss": 0.0397, "step": 613 }, { "epoch": 0.29490874159462055, "grad_norm": 0.9156226984501676, "learning_rate": 9.824000000000001e-06, "loss": 0.0722, "step": 614 }, { "epoch": 0.2953890489913545, "grad_norm": 1.3199838058869244, "learning_rate": 9.84e-06, "loss": 0.0663, "step": 615 }, { "epoch": 0.29586935638808837, "grad_norm": 0.6241326568020441, "learning_rate": 9.856000000000002e-06, "loss": 0.0477, "step": 616 }, { "epoch": 0.2963496637848223, "grad_norm": 0.8937661400674575, "learning_rate": 9.872e-06, "loss": 0.0655, "step": 617 }, { "epoch": 0.2968299711815562, "grad_norm": 0.7018639448793483, "learning_rate": 9.888000000000001e-06, "loss": 0.0666, "step": 618 }, { "epoch": 0.2973102785782901, "grad_norm": 0.8792158375314773, "learning_rate": 9.904e-06, "loss": 0.0806, "step": 619 }, { "epoch": 0.297790585975024, "grad_norm": 0.8101543945686708, "learning_rate": 9.920000000000002e-06, "loss": 0.0628, "step": 620 }, { "epoch": 0.29827089337175794, "grad_norm": 0.6958139936512852, "learning_rate": 9.936000000000001e-06, "loss": 0.0586, "step": 621 }, { "epoch": 0.2987512007684918, "grad_norm": 0.8118776422093879, "learning_rate": 9.952e-06, "loss": 0.0696, "step": 622 }, { "epoch": 0.29923150816522576, "grad_norm": 0.801044484279383, "learning_rate": 9.968000000000002e-06, "loss": 0.0587, "step": 623 }, { "epoch": 0.29971181556195964, "grad_norm": 0.732515461824863, "learning_rate": 9.984e-06, "loss": 0.0564, "step": 624 }, { "epoch": 0.3001921229586936, "grad_norm": 0.8322604839391531, "learning_rate": 1e-05, "loss": 0.0812, "step": 625 }, { "epoch": 0.30067243035542746, "grad_norm": 0.7657461534156353, "learning_rate": 9.999999219069164e-06, "loss": 0.0553, "step": 626 }, { "epoch": 0.3011527377521614, "grad_norm": 0.96087209630369, "learning_rate": 9.999996876276899e-06, "loss": 0.0722, "step": 627 }, { "epoch": 0.3016330451488953, "grad_norm": 0.8664306047674389, "learning_rate": 9.999992971623935e-06, "loss": 0.0699, "step": 628 }, { "epoch": 0.3021133525456292, "grad_norm": 0.7586378661950982, "learning_rate": 9.999987505111493e-06, "loss": 0.0547, "step": 629 }, { "epoch": 0.3025936599423631, "grad_norm": 0.8770352678016571, "learning_rate": 9.999980476741282e-06, "loss": 0.0544, "step": 630 }, { "epoch": 0.30307396733909703, "grad_norm": 2.875494497735063, "learning_rate": 9.999971886515496e-06, "loss": 0.0706, "step": 631 }, { "epoch": 0.3035542747358309, "grad_norm": 1.2573657771022355, "learning_rate": 9.999961734436818e-06, "loss": 0.0687, "step": 632 }, { "epoch": 0.30403458213256485, "grad_norm": 0.683322107768531, "learning_rate": 9.99995002050842e-06, "loss": 0.0598, "step": 633 }, { "epoch": 0.3045148895292987, "grad_norm": 0.8277715966275165, "learning_rate": 9.99993674473396e-06, "loss": 0.0723, "step": 634 }, { "epoch": 0.30499519692603266, "grad_norm": 2.8948604349281584, "learning_rate": 9.999921907117588e-06, "loss": 0.1244, "step": 635 }, { "epoch": 0.30547550432276654, "grad_norm": 0.8638445515605259, "learning_rate": 9.999905507663936e-06, "loss": 0.0752, "step": 636 }, { "epoch": 0.3059558117195005, "grad_norm": 0.6727302040122436, "learning_rate": 9.999887546378127e-06, "loss": 0.0621, "step": 637 }, { "epoch": 0.30643611911623436, "grad_norm": 1.0952285566093234, "learning_rate": 9.999868023265772e-06, "loss": 0.095, "step": 638 }, { "epoch": 0.3069164265129683, "grad_norm": 0.957687462950951, "learning_rate": 9.99984693833297e-06, "loss": 0.0802, "step": 639 }, { "epoch": 0.30739673390970224, "grad_norm": 1.0874976971451398, "learning_rate": 9.999824291586306e-06, "loss": 0.0964, "step": 640 }, { "epoch": 0.3078770413064361, "grad_norm": 0.826343746073476, "learning_rate": 9.999800083032856e-06, "loss": 0.0553, "step": 641 }, { "epoch": 0.30835734870317005, "grad_norm": 0.8615917787952223, "learning_rate": 9.999774312680182e-06, "loss": 0.0674, "step": 642 }, { "epoch": 0.30883765609990393, "grad_norm": 0.9536454074667529, "learning_rate": 9.999746980536332e-06, "loss": 0.0754, "step": 643 }, { "epoch": 0.30931796349663787, "grad_norm": 0.8678398758854321, "learning_rate": 9.999718086609845e-06, "loss": 0.0536, "step": 644 }, { "epoch": 0.30979827089337175, "grad_norm": 0.8195920103808025, "learning_rate": 9.999687630909748e-06, "loss": 0.0676, "step": 645 }, { "epoch": 0.3102785782901057, "grad_norm": 0.833916110606186, "learning_rate": 9.999655613445552e-06, "loss": 0.0662, "step": 646 }, { "epoch": 0.31075888568683957, "grad_norm": 0.8552019334840019, "learning_rate": 9.99962203422726e-06, "loss": 0.0601, "step": 647 }, { "epoch": 0.3112391930835735, "grad_norm": 1.1244393216373334, "learning_rate": 9.99958689326536e-06, "loss": 0.0835, "step": 648 }, { "epoch": 0.3117195004803074, "grad_norm": 1.0921918261230938, "learning_rate": 9.99955019057083e-06, "loss": 0.0735, "step": 649 }, { "epoch": 0.3121998078770413, "grad_norm": 0.9476055236594563, "learning_rate": 9.999511926155135e-06, "loss": 0.0568, "step": 650 }, { "epoch": 0.3126801152737752, "grad_norm": 0.955427263476322, "learning_rate": 9.999472100030227e-06, "loss": 0.0682, "step": 651 }, { "epoch": 0.31316042267050914, "grad_norm": 0.6227107477762361, "learning_rate": 9.999430712208548e-06, "loss": 0.0515, "step": 652 }, { "epoch": 0.313640730067243, "grad_norm": 0.8819662452007495, "learning_rate": 9.999387762703025e-06, "loss": 0.0708, "step": 653 }, { "epoch": 0.31412103746397696, "grad_norm": 0.7140416262289825, "learning_rate": 9.999343251527076e-06, "loss": 0.0546, "step": 654 }, { "epoch": 0.31460134486071084, "grad_norm": 0.7721756355018631, "learning_rate": 9.999297178694603e-06, "loss": 0.0819, "step": 655 }, { "epoch": 0.3150816522574448, "grad_norm": 0.6792124284969571, "learning_rate": 9.999249544219998e-06, "loss": 0.0626, "step": 656 }, { "epoch": 0.31556195965417866, "grad_norm": 0.8965139505972938, "learning_rate": 9.999200348118142e-06, "loss": 0.0793, "step": 657 }, { "epoch": 0.3160422670509126, "grad_norm": 0.6875270287873301, "learning_rate": 9.9991495904044e-06, "loss": 0.0841, "step": 658 }, { "epoch": 0.3165225744476465, "grad_norm": 0.9499599493614728, "learning_rate": 9.999097271094632e-06, "loss": 0.0746, "step": 659 }, { "epoch": 0.3170028818443804, "grad_norm": 0.9142388374355066, "learning_rate": 9.999043390205176e-06, "loss": 0.0789, "step": 660 }, { "epoch": 0.3174831892411143, "grad_norm": 2.344335044128447, "learning_rate": 9.998987947752866e-06, "loss": 0.0992, "step": 661 }, { "epoch": 0.31796349663784823, "grad_norm": 1.318190298500077, "learning_rate": 9.99893094375502e-06, "loss": 0.0673, "step": 662 }, { "epoch": 0.3184438040345821, "grad_norm": 0.7219015180259919, "learning_rate": 9.998872378229444e-06, "loss": 0.0557, "step": 663 }, { "epoch": 0.31892411143131605, "grad_norm": 0.9108960437724677, "learning_rate": 9.998812251194432e-06, "loss": 0.0701, "step": 664 }, { "epoch": 0.31940441882804993, "grad_norm": 0.8017801457442163, "learning_rate": 9.998750562668767e-06, "loss": 0.0607, "step": 665 }, { "epoch": 0.31988472622478387, "grad_norm": 0.9355161886203067, "learning_rate": 9.99868731267172e-06, "loss": 0.0838, "step": 666 }, { "epoch": 0.32036503362151775, "grad_norm": 1.1545057767987545, "learning_rate": 9.998622501223045e-06, "loss": 0.0935, "step": 667 }, { "epoch": 0.3208453410182517, "grad_norm": 0.6776024404099317, "learning_rate": 9.998556128342989e-06, "loss": 0.0527, "step": 668 }, { "epoch": 0.32132564841498557, "grad_norm": 0.9245439985854373, "learning_rate": 9.998488194052287e-06, "loss": 0.0815, "step": 669 }, { "epoch": 0.3218059558117195, "grad_norm": 0.9108820872007813, "learning_rate": 9.998418698372156e-06, "loss": 0.0806, "step": 670 }, { "epoch": 0.3222862632084534, "grad_norm": 0.6413003064759893, "learning_rate": 9.998347641324309e-06, "loss": 0.0567, "step": 671 }, { "epoch": 0.3227665706051873, "grad_norm": 0.7124154469417349, "learning_rate": 9.998275022930937e-06, "loss": 0.0622, "step": 672 }, { "epoch": 0.32324687800192126, "grad_norm": 0.729795364251075, "learning_rate": 9.99820084321473e-06, "loss": 0.0549, "step": 673 }, { "epoch": 0.32372718539865514, "grad_norm": 0.7072881688429468, "learning_rate": 9.998125102198855e-06, "loss": 0.057, "step": 674 }, { "epoch": 0.3242074927953891, "grad_norm": 0.9404196385002046, "learning_rate": 9.998047799906972e-06, "loss": 0.0731, "step": 675 }, { "epoch": 0.32468780019212296, "grad_norm": 2.2283889506644803, "learning_rate": 9.99796893636323e-06, "loss": 0.0611, "step": 676 }, { "epoch": 0.3251681075888569, "grad_norm": 0.9898738372811073, "learning_rate": 9.997888511592262e-06, "loss": 0.075, "step": 677 }, { "epoch": 0.3256484149855908, "grad_norm": 0.5669213992706328, "learning_rate": 9.997806525619191e-06, "loss": 0.0462, "step": 678 }, { "epoch": 0.3261287223823247, "grad_norm": 0.7769207156534598, "learning_rate": 9.997722978469629e-06, "loss": 0.0598, "step": 679 }, { "epoch": 0.3266090297790586, "grad_norm": 1.14913919174941, "learning_rate": 9.997637870169673e-06, "loss": 0.0802, "step": 680 }, { "epoch": 0.3270893371757925, "grad_norm": 0.5579221498365345, "learning_rate": 9.997551200745905e-06, "loss": 0.0513, "step": 681 }, { "epoch": 0.3275696445725264, "grad_norm": 1.1622979766569141, "learning_rate": 9.997462970225402e-06, "loss": 0.0831, "step": 682 }, { "epoch": 0.32804995196926034, "grad_norm": 0.5843385711909048, "learning_rate": 9.997373178635723e-06, "loss": 0.0463, "step": 683 }, { "epoch": 0.3285302593659942, "grad_norm": 0.6003237420367146, "learning_rate": 9.997281826004919e-06, "loss": 0.0463, "step": 684 }, { "epoch": 0.32901056676272816, "grad_norm": 0.707379073392421, "learning_rate": 9.997188912361522e-06, "loss": 0.0743, "step": 685 }, { "epoch": 0.32949087415946204, "grad_norm": 0.6053436647816546, "learning_rate": 9.997094437734558e-06, "loss": 0.0586, "step": 686 }, { "epoch": 0.329971181556196, "grad_norm": 0.6165862274607972, "learning_rate": 9.99699840215354e-06, "loss": 0.0615, "step": 687 }, { "epoch": 0.33045148895292986, "grad_norm": 0.7162733421968076, "learning_rate": 9.996900805648462e-06, "loss": 0.0664, "step": 688 }, { "epoch": 0.3309317963496638, "grad_norm": 0.9782090778876608, "learning_rate": 9.996801648249815e-06, "loss": 0.0694, "step": 689 }, { "epoch": 0.3314121037463977, "grad_norm": 0.9345924479490764, "learning_rate": 9.996700929988571e-06, "loss": 0.0703, "step": 690 }, { "epoch": 0.3318924111431316, "grad_norm": 0.4641992659191584, "learning_rate": 9.996598650896191e-06, "loss": 0.0581, "step": 691 }, { "epoch": 0.3323727185398655, "grad_norm": 0.730369918229153, "learning_rate": 9.996494811004626e-06, "loss": 0.0671, "step": 692 }, { "epoch": 0.33285302593659943, "grad_norm": 0.7429226769702214, "learning_rate": 9.996389410346312e-06, "loss": 0.0603, "step": 693 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8374173727738586, "learning_rate": 9.996282448954173e-06, "loss": 0.0686, "step": 694 }, { "epoch": 0.33381364073006725, "grad_norm": 0.7712716340055917, "learning_rate": 9.99617392686162e-06, "loss": 0.0606, "step": 695 }, { "epoch": 0.33429394812680113, "grad_norm": 0.5623336642825997, "learning_rate": 9.996063844102555e-06, "loss": 0.069, "step": 696 }, { "epoch": 0.33477425552353507, "grad_norm": 0.6995521178496045, "learning_rate": 9.995952200711361e-06, "loss": 0.0621, "step": 697 }, { "epoch": 0.33525456292026895, "grad_norm": 0.9184035660471785, "learning_rate": 9.995838996722916e-06, "loss": 0.0693, "step": 698 }, { "epoch": 0.3357348703170029, "grad_norm": 0.9083685686418375, "learning_rate": 9.995724232172578e-06, "loss": 0.0619, "step": 699 }, { "epoch": 0.33621517771373677, "grad_norm": 0.7795460332437362, "learning_rate": 9.995607907096198e-06, "loss": 0.0579, "step": 700 }, { "epoch": 0.3366954851104707, "grad_norm": 0.9597351909663773, "learning_rate": 9.995490021530116e-06, "loss": 0.0635, "step": 701 }, { "epoch": 0.3371757925072046, "grad_norm": 0.777710780224125, "learning_rate": 9.995370575511151e-06, "loss": 0.0644, "step": 702 }, { "epoch": 0.3376560999039385, "grad_norm": 0.8524311441817263, "learning_rate": 9.995249569076617e-06, "loss": 0.0649, "step": 703 }, { "epoch": 0.3381364073006724, "grad_norm": 0.9788665595689388, "learning_rate": 9.995127002264313e-06, "loss": 0.0483, "step": 704 }, { "epoch": 0.33861671469740634, "grad_norm": 0.9043257189131158, "learning_rate": 9.995002875112525e-06, "loss": 0.0802, "step": 705 }, { "epoch": 0.3390970220941403, "grad_norm": 1.0447948509065825, "learning_rate": 9.994877187660028e-06, "loss": 0.1116, "step": 706 }, { "epoch": 0.33957732949087416, "grad_norm": 0.6843801841988756, "learning_rate": 9.994749939946082e-06, "loss": 0.0445, "step": 707 }, { "epoch": 0.3400576368876081, "grad_norm": 0.8386099307338503, "learning_rate": 9.994621132010439e-06, "loss": 0.0761, "step": 708 }, { "epoch": 0.340537944284342, "grad_norm": 0.8431393706231558, "learning_rate": 9.994490763893328e-06, "loss": 0.0719, "step": 709 }, { "epoch": 0.3410182516810759, "grad_norm": 0.6771024567973309, "learning_rate": 9.994358835635477e-06, "loss": 0.0568, "step": 710 }, { "epoch": 0.3414985590778098, "grad_norm": 0.8961034176669026, "learning_rate": 9.9942253472781e-06, "loss": 0.0688, "step": 711 }, { "epoch": 0.34197886647454373, "grad_norm": 1.0453002901140704, "learning_rate": 9.99409029886289e-06, "loss": 0.1101, "step": 712 }, { "epoch": 0.3424591738712776, "grad_norm": 0.786975549589651, "learning_rate": 9.993953690432032e-06, "loss": 0.0554, "step": 713 }, { "epoch": 0.34293948126801155, "grad_norm": 0.7042757673435847, "learning_rate": 9.993815522028203e-06, "loss": 0.0617, "step": 714 }, { "epoch": 0.34341978866474543, "grad_norm": 0.9449988070860224, "learning_rate": 9.993675793694558e-06, "loss": 0.0615, "step": 715 }, { "epoch": 0.34390009606147937, "grad_norm": 1.2419899707910662, "learning_rate": 9.99353450547475e-06, "loss": 0.0724, "step": 716 }, { "epoch": 0.34438040345821325, "grad_norm": 1.1846845186388448, "learning_rate": 9.993391657412908e-06, "loss": 0.0892, "step": 717 }, { "epoch": 0.3448607108549472, "grad_norm": 0.9397604288615065, "learning_rate": 9.993247249553656e-06, "loss": 0.0561, "step": 718 }, { "epoch": 0.34534101825168106, "grad_norm": 0.6412293473440506, "learning_rate": 9.993101281942103e-06, "loss": 0.0506, "step": 719 }, { "epoch": 0.345821325648415, "grad_norm": 0.6410026176064426, "learning_rate": 9.992953754623847e-06, "loss": 0.0556, "step": 720 }, { "epoch": 0.3463016330451489, "grad_norm": 0.9400115622092615, "learning_rate": 9.992804667644969e-06, "loss": 0.0713, "step": 721 }, { "epoch": 0.3467819404418828, "grad_norm": 0.8307344024344647, "learning_rate": 9.99265402105204e-06, "loss": 0.0749, "step": 722 }, { "epoch": 0.3472622478386167, "grad_norm": 0.6264072104068256, "learning_rate": 9.992501814892118e-06, "loss": 0.0572, "step": 723 }, { "epoch": 0.34774255523535064, "grad_norm": 0.8873301693627772, "learning_rate": 9.99234804921275e-06, "loss": 0.0636, "step": 724 }, { "epoch": 0.3482228626320845, "grad_norm": 0.5551682372717668, "learning_rate": 9.992192724061965e-06, "loss": 0.0614, "step": 725 }, { "epoch": 0.34870317002881845, "grad_norm": 0.6741323301483185, "learning_rate": 9.992035839488283e-06, "loss": 0.0598, "step": 726 }, { "epoch": 0.34918347742555234, "grad_norm": 0.7478055633816404, "learning_rate": 9.991877395540714e-06, "loss": 0.0698, "step": 727 }, { "epoch": 0.34966378482228627, "grad_norm": 0.7204195910532974, "learning_rate": 9.991717392268747e-06, "loss": 0.0684, "step": 728 }, { "epoch": 0.35014409221902015, "grad_norm": 0.680949486584263, "learning_rate": 9.991555829722363e-06, "loss": 0.0723, "step": 729 }, { "epoch": 0.3506243996157541, "grad_norm": 0.6473589893605443, "learning_rate": 9.991392707952032e-06, "loss": 0.0595, "step": 730 }, { "epoch": 0.35110470701248797, "grad_norm": 0.9261456172897032, "learning_rate": 9.991228027008708e-06, "loss": 0.0842, "step": 731 }, { "epoch": 0.3515850144092219, "grad_norm": 1.0662113215148616, "learning_rate": 9.991061786943832e-06, "loss": 0.0724, "step": 732 }, { "epoch": 0.3520653218059558, "grad_norm": 0.9090716830219397, "learning_rate": 9.990893987809334e-06, "loss": 0.0574, "step": 733 }, { "epoch": 0.3525456292026897, "grad_norm": 0.5908937589147102, "learning_rate": 9.990724629657628e-06, "loss": 0.0514, "step": 734 }, { "epoch": 0.3530259365994236, "grad_norm": 1.0249840050472094, "learning_rate": 9.990553712541617e-06, "loss": 0.0838, "step": 735 }, { "epoch": 0.35350624399615754, "grad_norm": 0.7254263684691535, "learning_rate": 9.990381236514694e-06, "loss": 0.0568, "step": 736 }, { "epoch": 0.3539865513928914, "grad_norm": 0.7557297240778922, "learning_rate": 9.99020720163073e-06, "loss": 0.0613, "step": 737 }, { "epoch": 0.35446685878962536, "grad_norm": 0.7212710346277188, "learning_rate": 9.990031607944095e-06, "loss": 0.0615, "step": 738 }, { "epoch": 0.3549471661863593, "grad_norm": 0.5429849169379788, "learning_rate": 9.989854455509636e-06, "loss": 0.0508, "step": 739 }, { "epoch": 0.3554274735830932, "grad_norm": 0.790313789150935, "learning_rate": 9.98967574438269e-06, "loss": 0.0607, "step": 740 }, { "epoch": 0.3559077809798271, "grad_norm": 0.6588326074450415, "learning_rate": 9.989495474619084e-06, "loss": 0.0501, "step": 741 }, { "epoch": 0.356388088376561, "grad_norm": 1.0216588961310469, "learning_rate": 9.989313646275127e-06, "loss": 0.0817, "step": 742 }, { "epoch": 0.35686839577329493, "grad_norm": 1.2144514747608683, "learning_rate": 9.989130259407617e-06, "loss": 0.095, "step": 743 }, { "epoch": 0.3573487031700288, "grad_norm": 0.8230958178941028, "learning_rate": 9.988945314073842e-06, "loss": 0.0675, "step": 744 }, { "epoch": 0.35782901056676275, "grad_norm": 0.5442331145018258, "learning_rate": 9.988758810331572e-06, "loss": 0.0472, "step": 745 }, { "epoch": 0.35830931796349663, "grad_norm": 0.9420681725327574, "learning_rate": 9.988570748239062e-06, "loss": 0.0609, "step": 746 }, { "epoch": 0.35878962536023057, "grad_norm": 0.7063636013049791, "learning_rate": 9.988381127855063e-06, "loss": 0.0612, "step": 747 }, { "epoch": 0.35926993275696445, "grad_norm": 0.8868618317047221, "learning_rate": 9.988189949238804e-06, "loss": 0.0604, "step": 748 }, { "epoch": 0.3597502401536984, "grad_norm": 0.7054240383568051, "learning_rate": 9.987997212450007e-06, "loss": 0.0641, "step": 749 }, { "epoch": 0.36023054755043227, "grad_norm": 0.7475702380405793, "learning_rate": 9.987802917548874e-06, "loss": 0.0554, "step": 750 }, { "epoch": 0.3607108549471662, "grad_norm": 0.8420433761309497, "learning_rate": 9.9876070645961e-06, "loss": 0.0646, "step": 751 }, { "epoch": 0.3611911623439001, "grad_norm": 1.151439713731852, "learning_rate": 9.98740965365286e-06, "loss": 0.0894, "step": 752 }, { "epoch": 0.361671469740634, "grad_norm": 0.9861736223093036, "learning_rate": 9.987210684780826e-06, "loss": 0.0653, "step": 753 }, { "epoch": 0.3621517771373679, "grad_norm": 1.023603091918796, "learning_rate": 9.987010158042145e-06, "loss": 0.0716, "step": 754 }, { "epoch": 0.36263208453410184, "grad_norm": 0.7427517829264801, "learning_rate": 9.986808073499459e-06, "loss": 0.0588, "step": 755 }, { "epoch": 0.3631123919308357, "grad_norm": 1.0002779227618477, "learning_rate": 9.98660443121589e-06, "loss": 0.0601, "step": 756 }, { "epoch": 0.36359269932756966, "grad_norm": 0.7758536271834264, "learning_rate": 9.986399231255057e-06, "loss": 0.0629, "step": 757 }, { "epoch": 0.36407300672430354, "grad_norm": 0.5275682172040984, "learning_rate": 9.98619247368105e-06, "loss": 0.0591, "step": 758 }, { "epoch": 0.3645533141210375, "grad_norm": 0.8592188357318262, "learning_rate": 9.985984158558462e-06, "loss": 0.0686, "step": 759 }, { "epoch": 0.36503362151777136, "grad_norm": 0.6429739999465831, "learning_rate": 9.985774285952362e-06, "loss": 0.0875, "step": 760 }, { "epoch": 0.3655139289145053, "grad_norm": 0.9068989203363063, "learning_rate": 9.985562855928309e-06, "loss": 0.0728, "step": 761 }, { "epoch": 0.3659942363112392, "grad_norm": 0.8958890702630246, "learning_rate": 9.985349868552343e-06, "loss": 0.0762, "step": 762 }, { "epoch": 0.3664745437079731, "grad_norm": 0.9260397876251805, "learning_rate": 9.985135323891002e-06, "loss": 0.0639, "step": 763 }, { "epoch": 0.366954851104707, "grad_norm": 0.46145582448125044, "learning_rate": 9.984919222011301e-06, "loss": 0.0392, "step": 764 }, { "epoch": 0.36743515850144093, "grad_norm": 0.7368863607093822, "learning_rate": 9.984701562980745e-06, "loss": 0.0675, "step": 765 }, { "epoch": 0.3679154658981748, "grad_norm": 0.8374165642976674, "learning_rate": 9.984482346867325e-06, "loss": 0.0724, "step": 766 }, { "epoch": 0.36839577329490875, "grad_norm": 0.5004955641611937, "learning_rate": 9.984261573739515e-06, "loss": 0.0559, "step": 767 }, { "epoch": 0.3688760806916426, "grad_norm": 0.6896996348098048, "learning_rate": 9.984039243666284e-06, "loss": 0.0683, "step": 768 }, { "epoch": 0.36935638808837656, "grad_norm": 0.4917654383061967, "learning_rate": 9.983815356717075e-06, "loss": 0.0509, "step": 769 }, { "epoch": 0.36983669548511044, "grad_norm": 0.6841919810633339, "learning_rate": 9.983589912961828e-06, "loss": 0.0693, "step": 770 }, { "epoch": 0.3703170028818444, "grad_norm": 0.6557223578797912, "learning_rate": 9.983362912470967e-06, "loss": 0.064, "step": 771 }, { "epoch": 0.37079731027857826, "grad_norm": 0.6242477698953485, "learning_rate": 9.983134355315397e-06, "loss": 0.0587, "step": 772 }, { "epoch": 0.3712776176753122, "grad_norm": 0.5988595426851467, "learning_rate": 9.982904241566515e-06, "loss": 0.0634, "step": 773 }, { "epoch": 0.37175792507204614, "grad_norm": 0.6754592703701289, "learning_rate": 9.982672571296201e-06, "loss": 0.0629, "step": 774 }, { "epoch": 0.37223823246878, "grad_norm": 0.6078844744556268, "learning_rate": 9.982439344576824e-06, "loss": 0.0508, "step": 775 }, { "epoch": 0.37271853986551395, "grad_norm": 0.6857139355602675, "learning_rate": 9.982204561481237e-06, "loss": 0.0466, "step": 776 }, { "epoch": 0.37319884726224783, "grad_norm": 0.6033713609538075, "learning_rate": 9.981968222082778e-06, "loss": 0.0537, "step": 777 }, { "epoch": 0.37367915465898177, "grad_norm": 0.5844207725067214, "learning_rate": 9.981730326455275e-06, "loss": 0.0434, "step": 778 }, { "epoch": 0.37415946205571565, "grad_norm": 0.9540307943992555, "learning_rate": 9.98149087467304e-06, "loss": 0.0525, "step": 779 }, { "epoch": 0.3746397694524496, "grad_norm": 0.7214661073713216, "learning_rate": 9.98124986681087e-06, "loss": 0.0743, "step": 780 }, { "epoch": 0.37512007684918347, "grad_norm": 0.6813082935520749, "learning_rate": 9.981007302944048e-06, "loss": 0.0826, "step": 781 }, { "epoch": 0.3756003842459174, "grad_norm": 0.5582899213242017, "learning_rate": 9.980763183148347e-06, "loss": 0.0603, "step": 782 }, { "epoch": 0.3760806916426513, "grad_norm": 0.5713120406122318, "learning_rate": 9.980517507500023e-06, "loss": 0.0418, "step": 783 }, { "epoch": 0.3765609990393852, "grad_norm": 0.5631392163416006, "learning_rate": 9.980270276075816e-06, "loss": 0.076, "step": 784 }, { "epoch": 0.3770413064361191, "grad_norm": 0.6842884389848988, "learning_rate": 9.980021488952957e-06, "loss": 0.0555, "step": 785 }, { "epoch": 0.37752161383285304, "grad_norm": 0.6012427638288268, "learning_rate": 9.979771146209159e-06, "loss": 0.0581, "step": 786 }, { "epoch": 0.3780019212295869, "grad_norm": 0.6380907947348216, "learning_rate": 9.97951924792262e-06, "loss": 0.0558, "step": 787 }, { "epoch": 0.37848222862632086, "grad_norm": 0.4499455933552752, "learning_rate": 9.979265794172029e-06, "loss": 0.045, "step": 788 }, { "epoch": 0.37896253602305474, "grad_norm": 0.9207107334074435, "learning_rate": 9.979010785036557e-06, "loss": 0.0746, "step": 789 }, { "epoch": 0.3794428434197887, "grad_norm": 0.6001083473477673, "learning_rate": 9.978754220595861e-06, "loss": 0.0578, "step": 790 }, { "epoch": 0.37992315081652256, "grad_norm": 0.5596962433628107, "learning_rate": 9.978496100930086e-06, "loss": 0.0584, "step": 791 }, { "epoch": 0.3804034582132565, "grad_norm": 0.5021012713607019, "learning_rate": 9.978236426119862e-06, "loss": 0.0538, "step": 792 }, { "epoch": 0.3808837656099904, "grad_norm": 0.5102988447573241, "learning_rate": 9.977975196246302e-06, "loss": 0.0498, "step": 793 }, { "epoch": 0.3813640730067243, "grad_norm": 0.6861671569949817, "learning_rate": 9.97771241139101e-06, "loss": 0.0644, "step": 794 }, { "epoch": 0.3818443804034582, "grad_norm": 0.6357301541893875, "learning_rate": 9.977448071636068e-06, "loss": 0.0601, "step": 795 }, { "epoch": 0.38232468780019213, "grad_norm": 0.4024807592618827, "learning_rate": 9.977182177064053e-06, "loss": 0.0379, "step": 796 }, { "epoch": 0.382804995196926, "grad_norm": 1.1288610789705666, "learning_rate": 9.97691472775802e-06, "loss": 0.071, "step": 797 }, { "epoch": 0.38328530259365995, "grad_norm": 0.6469760760858952, "learning_rate": 9.976645723801515e-06, "loss": 0.0654, "step": 798 }, { "epoch": 0.38376560999039383, "grad_norm": 0.5560215918537325, "learning_rate": 9.976375165278567e-06, "loss": 0.0512, "step": 799 }, { "epoch": 0.38424591738712777, "grad_norm": 0.7975603385140326, "learning_rate": 9.976103052273689e-06, "loss": 0.0883, "step": 800 }, { "epoch": 0.38472622478386165, "grad_norm": 0.7728034217886376, "learning_rate": 9.975829384871884e-06, "loss": 0.072, "step": 801 }, { "epoch": 0.3852065321805956, "grad_norm": 0.7767874847251652, "learning_rate": 9.975554163158636e-06, "loss": 0.0464, "step": 802 }, { "epoch": 0.38568683957732947, "grad_norm": 0.6266192997429625, "learning_rate": 9.975277387219919e-06, "loss": 0.0528, "step": 803 }, { "epoch": 0.3861671469740634, "grad_norm": 0.7250311644638058, "learning_rate": 9.97499905714219e-06, "loss": 0.0686, "step": 804 }, { "epoch": 0.3866474543707973, "grad_norm": 1.1253149275559908, "learning_rate": 9.974719173012388e-06, "loss": 0.049, "step": 805 }, { "epoch": 0.3871277617675312, "grad_norm": 0.6222049434177256, "learning_rate": 9.974437734917945e-06, "loss": 0.0653, "step": 806 }, { "epoch": 0.38760806916426516, "grad_norm": 0.6636209039456584, "learning_rate": 9.974154742946775e-06, "loss": 0.0619, "step": 807 }, { "epoch": 0.38808837656099904, "grad_norm": 1.347074867858616, "learning_rate": 9.973870197187272e-06, "loss": 0.0659, "step": 808 }, { "epoch": 0.388568683957733, "grad_norm": 0.685156022786215, "learning_rate": 9.973584097728325e-06, "loss": 0.0545, "step": 809 }, { "epoch": 0.38904899135446686, "grad_norm": 0.8480519439599661, "learning_rate": 9.973296444659301e-06, "loss": 0.0659, "step": 810 }, { "epoch": 0.3895292987512008, "grad_norm": 0.4931591852346543, "learning_rate": 9.973007238070057e-06, "loss": 0.0543, "step": 811 }, { "epoch": 0.3900096061479347, "grad_norm": 0.410538917518561, "learning_rate": 9.97271647805093e-06, "loss": 0.0356, "step": 812 }, { "epoch": 0.3904899135446686, "grad_norm": 0.6183835853418933, "learning_rate": 9.972424164692748e-06, "loss": 0.0548, "step": 813 }, { "epoch": 0.3909702209414025, "grad_norm": 0.5518914943760818, "learning_rate": 9.972130298086821e-06, "loss": 0.0615, "step": 814 }, { "epoch": 0.3914505283381364, "grad_norm": 0.48864613714652044, "learning_rate": 9.971834878324944e-06, "loss": 0.0505, "step": 815 }, { "epoch": 0.3919308357348703, "grad_norm": 0.5320681967319949, "learning_rate": 9.971537905499397e-06, "loss": 0.0501, "step": 816 }, { "epoch": 0.39241114313160425, "grad_norm": 0.45876250806739416, "learning_rate": 9.971239379702951e-06, "loss": 0.0385, "step": 817 }, { "epoch": 0.3928914505283381, "grad_norm": 0.8069704061567374, "learning_rate": 9.970939301028853e-06, "loss": 0.0805, "step": 818 }, { "epoch": 0.39337175792507206, "grad_norm": 0.8977429261025726, "learning_rate": 9.970637669570838e-06, "loss": 0.0594, "step": 819 }, { "epoch": 0.39385206532180594, "grad_norm": 0.509233161828096, "learning_rate": 9.97033448542313e-06, "loss": 0.0577, "step": 820 }, { "epoch": 0.3943323727185399, "grad_norm": 0.5471204223282794, "learning_rate": 9.970029748680437e-06, "loss": 0.0433, "step": 821 }, { "epoch": 0.39481268011527376, "grad_norm": 0.8632034689344821, "learning_rate": 9.969723459437945e-06, "loss": 0.0597, "step": 822 }, { "epoch": 0.3952929875120077, "grad_norm": 0.9564865645949854, "learning_rate": 9.969415617791336e-06, "loss": 0.0627, "step": 823 }, { "epoch": 0.3957732949087416, "grad_norm": 0.7450452763817546, "learning_rate": 9.969106223836766e-06, "loss": 0.0547, "step": 824 }, { "epoch": 0.3962536023054755, "grad_norm": 0.5891613173133876, "learning_rate": 9.968795277670886e-06, "loss": 0.0482, "step": 825 }, { "epoch": 0.3967339097022094, "grad_norm": 0.5741708509441278, "learning_rate": 9.968482779390824e-06, "loss": 0.0699, "step": 826 }, { "epoch": 0.39721421709894333, "grad_norm": 0.5752534750406278, "learning_rate": 9.968168729094197e-06, "loss": 0.0513, "step": 827 }, { "epoch": 0.3976945244956772, "grad_norm": 0.5909076992409328, "learning_rate": 9.967853126879103e-06, "loss": 0.0584, "step": 828 }, { "epoch": 0.39817483189241115, "grad_norm": 0.5478980970759338, "learning_rate": 9.967535972844131e-06, "loss": 0.0578, "step": 829 }, { "epoch": 0.39865513928914503, "grad_norm": 0.43802378334815034, "learning_rate": 9.96721726708835e-06, "loss": 0.047, "step": 830 }, { "epoch": 0.39913544668587897, "grad_norm": 0.5964278773959507, "learning_rate": 9.966897009711314e-06, "loss": 0.0553, "step": 831 }, { "epoch": 0.39961575408261285, "grad_norm": 0.8075938997085979, "learning_rate": 9.966575200813064e-06, "loss": 0.0671, "step": 832 }, { "epoch": 0.4000960614793468, "grad_norm": 0.5724293589628411, "learning_rate": 9.966251840494123e-06, "loss": 0.0565, "step": 833 }, { "epoch": 0.40057636887608067, "grad_norm": 0.8710191021054293, "learning_rate": 9.965926928855498e-06, "loss": 0.0723, "step": 834 }, { "epoch": 0.4010566762728146, "grad_norm": 0.5363225318409294, "learning_rate": 9.965600465998686e-06, "loss": 0.0458, "step": 835 }, { "epoch": 0.4015369836695485, "grad_norm": 0.7851083772033061, "learning_rate": 9.965272452025666e-06, "loss": 0.044, "step": 836 }, { "epoch": 0.4020172910662824, "grad_norm": 0.7693177422501187, "learning_rate": 9.964942887038893e-06, "loss": 0.0475, "step": 837 }, { "epoch": 0.4024975984630163, "grad_norm": 0.7987401635767456, "learning_rate": 9.964611771141322e-06, "loss": 0.0651, "step": 838 }, { "epoch": 0.40297790585975024, "grad_norm": 0.8276640756002739, "learning_rate": 9.96427910443638e-06, "loss": 0.0785, "step": 839 }, { "epoch": 0.4034582132564842, "grad_norm": 0.6809367893973843, "learning_rate": 9.963944887027985e-06, "loss": 0.0715, "step": 840 }, { "epoch": 0.40393852065321806, "grad_norm": 0.6804973064252685, "learning_rate": 9.963609119020538e-06, "loss": 0.0585, "step": 841 }, { "epoch": 0.404418828049952, "grad_norm": 0.5534861907234978, "learning_rate": 9.963271800518921e-06, "loss": 0.0535, "step": 842 }, { "epoch": 0.4048991354466859, "grad_norm": 1.0547311716146044, "learning_rate": 9.962932931628504e-06, "loss": 0.0651, "step": 843 }, { "epoch": 0.4053794428434198, "grad_norm": 0.6295639846864554, "learning_rate": 9.96259251245514e-06, "loss": 0.0568, "step": 844 }, { "epoch": 0.4058597502401537, "grad_norm": 0.5719452618102057, "learning_rate": 9.962250543105167e-06, "loss": 0.0418, "step": 845 }, { "epoch": 0.40634005763688763, "grad_norm": 0.7395158167833543, "learning_rate": 9.961907023685407e-06, "loss": 0.0567, "step": 846 }, { "epoch": 0.4068203650336215, "grad_norm": 0.5391369878497289, "learning_rate": 9.961561954303164e-06, "loss": 0.0506, "step": 847 }, { "epoch": 0.40730067243035545, "grad_norm": 0.8066572923047143, "learning_rate": 9.961215335066232e-06, "loss": 0.0883, "step": 848 }, { "epoch": 0.40778097982708933, "grad_norm": 0.5151043209208244, "learning_rate": 9.960867166082884e-06, "loss": 0.044, "step": 849 }, { "epoch": 0.40826128722382327, "grad_norm": 0.6317828023573119, "learning_rate": 9.960517447461875e-06, "loss": 0.0468, "step": 850 }, { "epoch": 0.40874159462055715, "grad_norm": 0.5968432172723486, "learning_rate": 9.96016617931245e-06, "loss": 0.05, "step": 851 }, { "epoch": 0.4092219020172911, "grad_norm": 0.48351796440258127, "learning_rate": 9.959813361744337e-06, "loss": 0.0422, "step": 852 }, { "epoch": 0.40970220941402496, "grad_norm": 0.45514792871791687, "learning_rate": 9.959458994867744e-06, "loss": 0.052, "step": 853 }, { "epoch": 0.4101825168107589, "grad_norm": 0.8035467442104323, "learning_rate": 9.959103078793364e-06, "loss": 0.0897, "step": 854 }, { "epoch": 0.4106628242074928, "grad_norm": 0.7097960300705127, "learning_rate": 9.95874561363238e-06, "loss": 0.0551, "step": 855 }, { "epoch": 0.4111431316042267, "grad_norm": 0.40391671652511296, "learning_rate": 9.95838659949645e-06, "loss": 0.0414, "step": 856 }, { "epoch": 0.4116234390009606, "grad_norm": 0.6170051420564876, "learning_rate": 9.958026036497723e-06, "loss": 0.0559, "step": 857 }, { "epoch": 0.41210374639769454, "grad_norm": 0.7115787557160136, "learning_rate": 9.957663924748828e-06, "loss": 0.0565, "step": 858 }, { "epoch": 0.4125840537944284, "grad_norm": 0.8705757145217116, "learning_rate": 9.957300264362878e-06, "loss": 0.0674, "step": 859 }, { "epoch": 0.41306436119116235, "grad_norm": 0.5500919289906383, "learning_rate": 9.95693505545347e-06, "loss": 0.0621, "step": 860 }, { "epoch": 0.41354466858789624, "grad_norm": 0.660836487601902, "learning_rate": 9.956568298134687e-06, "loss": 0.0632, "step": 861 }, { "epoch": 0.4140249759846302, "grad_norm": 0.9931232470867721, "learning_rate": 9.956199992521092e-06, "loss": 0.0653, "step": 862 }, { "epoch": 0.41450528338136405, "grad_norm": 0.7024061823998546, "learning_rate": 9.955830138727736e-06, "loss": 0.0593, "step": 863 }, { "epoch": 0.414985590778098, "grad_norm": 0.4764259054817133, "learning_rate": 9.955458736870148e-06, "loss": 0.0526, "step": 864 }, { "epoch": 0.41546589817483187, "grad_norm": 0.7066580129944023, "learning_rate": 9.955085787064344e-06, "loss": 0.0541, "step": 865 }, { "epoch": 0.4159462055715658, "grad_norm": 0.665310242989133, "learning_rate": 9.954711289426826e-06, "loss": 0.061, "step": 866 }, { "epoch": 0.4164265129682997, "grad_norm": 0.5484058455441252, "learning_rate": 9.954335244074575e-06, "loss": 0.0459, "step": 867 }, { "epoch": 0.4169068203650336, "grad_norm": 0.5666272682660084, "learning_rate": 9.953957651125056e-06, "loss": 0.0557, "step": 868 }, { "epoch": 0.4173871277617675, "grad_norm": 0.8668633540680836, "learning_rate": 9.95357851069622e-06, "loss": 0.0768, "step": 869 }, { "epoch": 0.41786743515850144, "grad_norm": 0.5410731748495422, "learning_rate": 9.9531978229065e-06, "loss": 0.0627, "step": 870 }, { "epoch": 0.4183477425552353, "grad_norm": 0.5775107068931661, "learning_rate": 9.952815587874811e-06, "loss": 0.0496, "step": 871 }, { "epoch": 0.41882804995196926, "grad_norm": 0.690239624690725, "learning_rate": 9.952431805720555e-06, "loss": 0.0751, "step": 872 }, { "epoch": 0.41930835734870314, "grad_norm": 0.49376413841588845, "learning_rate": 9.952046476563614e-06, "loss": 0.0552, "step": 873 }, { "epoch": 0.4197886647454371, "grad_norm": 0.6948681019679253, "learning_rate": 9.951659600524353e-06, "loss": 0.0527, "step": 874 }, { "epoch": 0.420268972142171, "grad_norm": 0.49290241630164605, "learning_rate": 9.951271177723623e-06, "loss": 0.0506, "step": 875 }, { "epoch": 0.4207492795389049, "grad_norm": 0.8721248981892464, "learning_rate": 9.950881208282755e-06, "loss": 0.0569, "step": 876 }, { "epoch": 0.42122958693563883, "grad_norm": 0.5497841692019572, "learning_rate": 9.950489692323564e-06, "loss": 0.0633, "step": 877 }, { "epoch": 0.4217098943323727, "grad_norm": 0.6397458240250244, "learning_rate": 9.950096629968353e-06, "loss": 0.0537, "step": 878 }, { "epoch": 0.42219020172910665, "grad_norm": 0.5682428472860261, "learning_rate": 9.949702021339897e-06, "loss": 0.0558, "step": 879 }, { "epoch": 0.42267050912584053, "grad_norm": 1.0280188654825122, "learning_rate": 9.949305866561468e-06, "loss": 0.0732, "step": 880 }, { "epoch": 0.42315081652257447, "grad_norm": 0.6451938425896641, "learning_rate": 9.94890816575681e-06, "loss": 0.0668, "step": 881 }, { "epoch": 0.42363112391930835, "grad_norm": 0.45108905962672585, "learning_rate": 9.948508919050153e-06, "loss": 0.0309, "step": 882 }, { "epoch": 0.4241114313160423, "grad_norm": 0.7378292399931741, "learning_rate": 9.948108126566213e-06, "loss": 0.0805, "step": 883 }, { "epoch": 0.42459173871277617, "grad_norm": 0.5519105751376474, "learning_rate": 9.947705788430185e-06, "loss": 0.0521, "step": 884 }, { "epoch": 0.4250720461095101, "grad_norm": 0.6856313077407358, "learning_rate": 9.94730190476775e-06, "loss": 0.0644, "step": 885 }, { "epoch": 0.425552353506244, "grad_norm": 0.5969546888876337, "learning_rate": 9.946896475705067e-06, "loss": 0.0434, "step": 886 }, { "epoch": 0.4260326609029779, "grad_norm": 0.45619128279304955, "learning_rate": 9.946489501368783e-06, "loss": 0.051, "step": 887 }, { "epoch": 0.4265129682997118, "grad_norm": 0.6592474450888789, "learning_rate": 9.946080981886025e-06, "loss": 0.0482, "step": 888 }, { "epoch": 0.42699327569644574, "grad_norm": 0.8000680158848036, "learning_rate": 9.945670917384404e-06, "loss": 0.0779, "step": 889 }, { "epoch": 0.4274735830931796, "grad_norm": 0.5334860843524303, "learning_rate": 9.94525930799201e-06, "loss": 0.0489, "step": 890 }, { "epoch": 0.42795389048991356, "grad_norm": 0.5254575930785399, "learning_rate": 9.944846153837423e-06, "loss": 0.0397, "step": 891 }, { "epoch": 0.42843419788664744, "grad_norm": 0.5660738613547326, "learning_rate": 9.944431455049697e-06, "loss": 0.0529, "step": 892 }, { "epoch": 0.4289145052833814, "grad_norm": 0.7532191005217946, "learning_rate": 9.944015211758375e-06, "loss": 0.0567, "step": 893 }, { "epoch": 0.42939481268011526, "grad_norm": 0.6751729764036046, "learning_rate": 9.943597424093477e-06, "loss": 0.0536, "step": 894 }, { "epoch": 0.4298751200768492, "grad_norm": 0.7399414072106839, "learning_rate": 9.943178092185511e-06, "loss": 0.0627, "step": 895 }, { "epoch": 0.4303554274735831, "grad_norm": 0.5319921340444789, "learning_rate": 9.942757216165464e-06, "loss": 0.0526, "step": 896 }, { "epoch": 0.430835734870317, "grad_norm": 0.9231172091302492, "learning_rate": 9.942334796164805e-06, "loss": 0.0611, "step": 897 }, { "epoch": 0.4313160422670509, "grad_norm": 0.7768175094389944, "learning_rate": 9.941910832315488e-06, "loss": 0.0639, "step": 898 }, { "epoch": 0.43179634966378483, "grad_norm": 0.4825476064702085, "learning_rate": 9.941485324749947e-06, "loss": 0.0518, "step": 899 }, { "epoch": 0.4322766570605187, "grad_norm": 0.7021828010660213, "learning_rate": 9.941058273601097e-06, "loss": 0.0593, "step": 900 }, { "epoch": 0.43275696445725265, "grad_norm": 0.5871945041167091, "learning_rate": 9.94062967900234e-06, "loss": 0.0574, "step": 901 }, { "epoch": 0.4332372718539865, "grad_norm": 0.7091892488086937, "learning_rate": 9.940199541087554e-06, "loss": 0.0709, "step": 902 }, { "epoch": 0.43371757925072046, "grad_norm": 0.5164796945559618, "learning_rate": 9.939767859991104e-06, "loss": 0.0567, "step": 903 }, { "epoch": 0.43419788664745435, "grad_norm": 1.0053328543134505, "learning_rate": 9.939334635847834e-06, "loss": 0.0623, "step": 904 }, { "epoch": 0.4346781940441883, "grad_norm": 1.014246583976263, "learning_rate": 9.938899868793074e-06, "loss": 0.0678, "step": 905 }, { "epoch": 0.43515850144092216, "grad_norm": 0.620672410400972, "learning_rate": 9.93846355896263e-06, "loss": 0.0499, "step": 906 }, { "epoch": 0.4356388088376561, "grad_norm": 0.6018522052869041, "learning_rate": 9.938025706492796e-06, "loss": 0.0633, "step": 907 }, { "epoch": 0.43611911623439004, "grad_norm": 0.7802965838379325, "learning_rate": 9.937586311520342e-06, "loss": 0.0542, "step": 908 }, { "epoch": 0.4365994236311239, "grad_norm": 0.7890750457426956, "learning_rate": 9.937145374182523e-06, "loss": 0.0674, "step": 909 }, { "epoch": 0.43707973102785785, "grad_norm": 0.6932375338768142, "learning_rate": 9.936702894617081e-06, "loss": 0.0705, "step": 910 }, { "epoch": 0.43756003842459174, "grad_norm": 0.6257587333406103, "learning_rate": 9.936258872962229e-06, "loss": 0.0628, "step": 911 }, { "epoch": 0.43804034582132567, "grad_norm": 0.5607233207353541, "learning_rate": 9.935813309356666e-06, "loss": 0.0544, "step": 912 }, { "epoch": 0.43852065321805955, "grad_norm": 0.6321356409882269, "learning_rate": 9.935366203939579e-06, "loss": 0.0491, "step": 913 }, { "epoch": 0.4390009606147935, "grad_norm": 0.6757767770961596, "learning_rate": 9.934917556850625e-06, "loss": 0.0434, "step": 914 }, { "epoch": 0.43948126801152737, "grad_norm": 0.7906338628745093, "learning_rate": 9.934467368229955e-06, "loss": 0.0638, "step": 915 }, { "epoch": 0.4399615754082613, "grad_norm": 0.4652920811833901, "learning_rate": 9.934015638218193e-06, "loss": 0.0519, "step": 916 }, { "epoch": 0.4404418828049952, "grad_norm": 0.5470520415253206, "learning_rate": 9.933562366956445e-06, "loss": 0.0574, "step": 917 }, { "epoch": 0.4409221902017291, "grad_norm": 0.7607179274563025, "learning_rate": 9.933107554586303e-06, "loss": 0.0675, "step": 918 }, { "epoch": 0.441402497598463, "grad_norm": 0.5532199345436009, "learning_rate": 9.93265120124984e-06, "loss": 0.0446, "step": 919 }, { "epoch": 0.44188280499519694, "grad_norm": 0.501284318665828, "learning_rate": 9.932193307089602e-06, "loss": 0.0518, "step": 920 }, { "epoch": 0.4423631123919308, "grad_norm": 0.6013902688897093, "learning_rate": 9.931733872248626e-06, "loss": 0.0584, "step": 921 }, { "epoch": 0.44284341978866476, "grad_norm": 0.5972118740317204, "learning_rate": 9.931272896870427e-06, "loss": 0.0476, "step": 922 }, { "epoch": 0.44332372718539864, "grad_norm": 2.253298791198679, "learning_rate": 9.930810381098999e-06, "loss": 0.0561, "step": 923 }, { "epoch": 0.4438040345821326, "grad_norm": 0.6102253348891266, "learning_rate": 9.93034632507882e-06, "loss": 0.0482, "step": 924 }, { "epoch": 0.44428434197886646, "grad_norm": 0.4713494577341782, "learning_rate": 9.929880728954853e-06, "loss": 0.0371, "step": 925 }, { "epoch": 0.4447646493756004, "grad_norm": 0.6627531547942727, "learning_rate": 9.92941359287253e-06, "loss": 0.0509, "step": 926 }, { "epoch": 0.4452449567723343, "grad_norm": 0.5428810469885857, "learning_rate": 9.928944916977775e-06, "loss": 0.051, "step": 927 }, { "epoch": 0.4457252641690682, "grad_norm": 0.6643297934290499, "learning_rate": 9.92847470141699e-06, "loss": 0.0765, "step": 928 }, { "epoch": 0.4462055715658021, "grad_norm": 0.5681242371551549, "learning_rate": 9.928002946337055e-06, "loss": 0.0487, "step": 929 }, { "epoch": 0.44668587896253603, "grad_norm": 0.45349350065521055, "learning_rate": 9.927529651885334e-06, "loss": 0.0548, "step": 930 }, { "epoch": 0.4471661863592699, "grad_norm": 0.42679372029366797, "learning_rate": 9.92705481820967e-06, "loss": 0.0529, "step": 931 }, { "epoch": 0.44764649375600385, "grad_norm": 0.4695476492659101, "learning_rate": 9.926578445458393e-06, "loss": 0.0414, "step": 932 }, { "epoch": 0.44812680115273773, "grad_norm": 0.5300464019739584, "learning_rate": 9.926100533780304e-06, "loss": 0.0439, "step": 933 }, { "epoch": 0.44860710854947167, "grad_norm": 0.4180296006214648, "learning_rate": 9.92562108332469e-06, "loss": 0.0391, "step": 934 }, { "epoch": 0.44908741594620555, "grad_norm": 0.40241583405442505, "learning_rate": 9.92514009424132e-06, "loss": 0.0523, "step": 935 }, { "epoch": 0.4495677233429395, "grad_norm": 0.4954138376589334, "learning_rate": 9.924657566680438e-06, "loss": 0.058, "step": 936 }, { "epoch": 0.45004803073967337, "grad_norm": 0.6273768947654529, "learning_rate": 9.924173500792775e-06, "loss": 0.055, "step": 937 }, { "epoch": 0.4505283381364073, "grad_norm": 0.3724637804837822, "learning_rate": 9.92368789672954e-06, "loss": 0.0334, "step": 938 }, { "epoch": 0.4510086455331412, "grad_norm": 0.5849978243981226, "learning_rate": 9.923200754642422e-06, "loss": 0.0443, "step": 939 }, { "epoch": 0.4514889529298751, "grad_norm": 0.49650880356005744, "learning_rate": 9.92271207468359e-06, "loss": 0.0436, "step": 940 }, { "epoch": 0.45196926032660906, "grad_norm": 0.44519643938812636, "learning_rate": 9.922221857005693e-06, "loss": 0.0523, "step": 941 }, { "epoch": 0.45244956772334294, "grad_norm": 0.983957220828581, "learning_rate": 9.921730101761865e-06, "loss": 0.0945, "step": 942 }, { "epoch": 0.4529298751200769, "grad_norm": 0.6035634372006827, "learning_rate": 9.921236809105711e-06, "loss": 0.0522, "step": 943 }, { "epoch": 0.45341018251681076, "grad_norm": 0.9095761086634716, "learning_rate": 9.92074197919133e-06, "loss": 0.0482, "step": 944 }, { "epoch": 0.4538904899135447, "grad_norm": 0.7038000923358203, "learning_rate": 9.920245612173288e-06, "loss": 0.06, "step": 945 }, { "epoch": 0.4543707973102786, "grad_norm": 0.5317259635381724, "learning_rate": 9.919747708206635e-06, "loss": 0.0513, "step": 946 }, { "epoch": 0.4548511047070125, "grad_norm": 1.612873760960674, "learning_rate": 9.919248267446904e-06, "loss": 0.0555, "step": 947 }, { "epoch": 0.4553314121037464, "grad_norm": 0.46423671806106315, "learning_rate": 9.918747290050108e-06, "loss": 0.0368, "step": 948 }, { "epoch": 0.45581171950048033, "grad_norm": 0.7137648813885012, "learning_rate": 9.918244776172739e-06, "loss": 0.058, "step": 949 }, { "epoch": 0.4562920268972142, "grad_norm": 0.677167631389359, "learning_rate": 9.917740725971765e-06, "loss": 0.0476, "step": 950 }, { "epoch": 0.45677233429394815, "grad_norm": 0.6324916873646828, "learning_rate": 9.91723513960464e-06, "loss": 0.0524, "step": 951 }, { "epoch": 0.457252641690682, "grad_norm": 0.5324488356285513, "learning_rate": 9.916728017229293e-06, "loss": 0.0398, "step": 952 }, { "epoch": 0.45773294908741596, "grad_norm": 0.6324777151535353, "learning_rate": 9.916219359004137e-06, "loss": 0.0571, "step": 953 }, { "epoch": 0.45821325648414984, "grad_norm": 0.6435220215655985, "learning_rate": 9.915709165088063e-06, "loss": 0.0696, "step": 954 }, { "epoch": 0.4586935638808838, "grad_norm": 0.8164681729134681, "learning_rate": 9.91519743564044e-06, "loss": 0.0611, "step": 955 }, { "epoch": 0.45917387127761766, "grad_norm": 0.5313310873883075, "learning_rate": 9.914684170821119e-06, "loss": 0.0476, "step": 956 }, { "epoch": 0.4596541786743516, "grad_norm": 0.5973052937068528, "learning_rate": 9.91416937079043e-06, "loss": 0.0757, "step": 957 }, { "epoch": 0.4601344860710855, "grad_norm": 0.4440073895198674, "learning_rate": 9.91365303570918e-06, "loss": 0.0435, "step": 958 }, { "epoch": 0.4606147934678194, "grad_norm": 0.5635682614371156, "learning_rate": 9.913135165738661e-06, "loss": 0.0601, "step": 959 }, { "epoch": 0.4610951008645533, "grad_norm": 0.6205543687338353, "learning_rate": 9.91261576104064e-06, "loss": 0.0607, "step": 960 }, { "epoch": 0.46157540826128723, "grad_norm": 0.8474866942319256, "learning_rate": 9.912094821777362e-06, "loss": 0.0508, "step": 961 }, { "epoch": 0.4620557156580211, "grad_norm": 0.4415856929796406, "learning_rate": 9.91157234811156e-06, "loss": 0.0426, "step": 962 }, { "epoch": 0.46253602305475505, "grad_norm": 0.4965072565036624, "learning_rate": 9.911048340206435e-06, "loss": 0.0485, "step": 963 }, { "epoch": 0.46301633045148893, "grad_norm": 0.7207884825617963, "learning_rate": 9.910522798225673e-06, "loss": 0.0767, "step": 964 }, { "epoch": 0.46349663784822287, "grad_norm": 0.6441736297504665, "learning_rate": 9.909995722333442e-06, "loss": 0.0582, "step": 965 }, { "epoch": 0.46397694524495675, "grad_norm": 0.6229851666491163, "learning_rate": 9.909467112694385e-06, "loss": 0.0649, "step": 966 }, { "epoch": 0.4644572526416907, "grad_norm": 0.7004494943555968, "learning_rate": 9.908936969473621e-06, "loss": 0.0479, "step": 967 }, { "epoch": 0.46493756003842457, "grad_norm": 0.7544245132592255, "learning_rate": 9.908405292836758e-06, "loss": 0.0604, "step": 968 }, { "epoch": 0.4654178674351585, "grad_norm": 0.7931595109408737, "learning_rate": 9.907872082949873e-06, "loss": 0.0822, "step": 969 }, { "epoch": 0.4658981748318924, "grad_norm": 0.6826313680933557, "learning_rate": 9.907337339979525e-06, "loss": 0.0481, "step": 970 }, { "epoch": 0.4663784822286263, "grad_norm": 0.5041754294571876, "learning_rate": 9.90680106409276e-06, "loss": 0.0417, "step": 971 }, { "epoch": 0.4668587896253602, "grad_norm": 0.8823164058501882, "learning_rate": 9.906263255457087e-06, "loss": 0.0652, "step": 972 }, { "epoch": 0.46733909702209414, "grad_norm": 0.7871751423538097, "learning_rate": 9.905723914240507e-06, "loss": 0.0654, "step": 973 }, { "epoch": 0.4678194044188281, "grad_norm": 0.8027977963955859, "learning_rate": 9.905183040611498e-06, "loss": 0.0545, "step": 974 }, { "epoch": 0.46829971181556196, "grad_norm": 0.8567181353492188, "learning_rate": 9.904640634739007e-06, "loss": 0.0728, "step": 975 }, { "epoch": 0.4687800192122959, "grad_norm": 0.4735301698046212, "learning_rate": 9.904096696792472e-06, "loss": 0.0402, "step": 976 }, { "epoch": 0.4692603266090298, "grad_norm": 0.9898206363808187, "learning_rate": 9.903551226941801e-06, "loss": 0.078, "step": 977 }, { "epoch": 0.4697406340057637, "grad_norm": 0.6741220652957427, "learning_rate": 9.903004225357387e-06, "loss": 0.0793, "step": 978 }, { "epoch": 0.4702209414024976, "grad_norm": 0.44391076772330806, "learning_rate": 9.902455692210094e-06, "loss": 0.0519, "step": 979 }, { "epoch": 0.47070124879923153, "grad_norm": 0.8124544269575636, "learning_rate": 9.901905627671273e-06, "loss": 0.0534, "step": 980 }, { "epoch": 0.4711815561959654, "grad_norm": 0.7481718577639421, "learning_rate": 9.901354031912746e-06, "loss": 0.0819, "step": 981 }, { "epoch": 0.47166186359269935, "grad_norm": 1.8659551510872119, "learning_rate": 9.900800905106817e-06, "loss": 0.0675, "step": 982 }, { "epoch": 0.47214217098943323, "grad_norm": 0.695535075758422, "learning_rate": 9.900246247426269e-06, "loss": 0.0622, "step": 983 }, { "epoch": 0.47262247838616717, "grad_norm": 0.49991906676132497, "learning_rate": 9.899690059044358e-06, "loss": 0.0573, "step": 984 }, { "epoch": 0.47310278578290105, "grad_norm": 0.8200411105348173, "learning_rate": 9.899132340134825e-06, "loss": 0.0867, "step": 985 }, { "epoch": 0.473583093179635, "grad_norm": 0.6775421926604714, "learning_rate": 9.898573090871885e-06, "loss": 0.05, "step": 986 }, { "epoch": 0.47406340057636887, "grad_norm": 0.5492047748590122, "learning_rate": 9.898012311430232e-06, "loss": 0.0538, "step": 987 }, { "epoch": 0.4745437079731028, "grad_norm": 0.6656920927770863, "learning_rate": 9.897450001985038e-06, "loss": 0.0677, "step": 988 }, { "epoch": 0.4750240153698367, "grad_norm": 1.072363462917926, "learning_rate": 9.896886162711955e-06, "loss": 0.0523, "step": 989 }, { "epoch": 0.4755043227665706, "grad_norm": 0.627671736207023, "learning_rate": 9.896320793787106e-06, "loss": 0.0499, "step": 990 }, { "epoch": 0.4759846301633045, "grad_norm": 0.8567917544335966, "learning_rate": 9.895753895387101e-06, "loss": 0.0564, "step": 991 }, { "epoch": 0.47646493756003844, "grad_norm": 1.0192507334666507, "learning_rate": 9.895185467689022e-06, "loss": 0.0674, "step": 992 }, { "epoch": 0.4769452449567723, "grad_norm": 0.7961715978827638, "learning_rate": 9.894615510870429e-06, "loss": 0.0831, "step": 993 }, { "epoch": 0.47742555235350626, "grad_norm": 0.5483648725578345, "learning_rate": 9.894044025109363e-06, "loss": 0.0485, "step": 994 }, { "epoch": 0.47790585975024014, "grad_norm": 0.8078314585601833, "learning_rate": 9.893471010584337e-06, "loss": 0.0625, "step": 995 }, { "epoch": 0.4783861671469741, "grad_norm": 0.6306541062019995, "learning_rate": 9.892896467474348e-06, "loss": 0.0678, "step": 996 }, { "epoch": 0.47886647454370795, "grad_norm": 0.4985055781130739, "learning_rate": 9.892320395958865e-06, "loss": 0.0508, "step": 997 }, { "epoch": 0.4793467819404419, "grad_norm": 0.5440005750071006, "learning_rate": 9.89174279621784e-06, "loss": 0.0441, "step": 998 }, { "epoch": 0.47982708933717577, "grad_norm": 0.7286303002598773, "learning_rate": 9.891163668431696e-06, "loss": 0.0555, "step": 999 }, { "epoch": 0.4803073967339097, "grad_norm": 0.5690411831803118, "learning_rate": 9.890583012781338e-06, "loss": 0.0458, "step": 1000 }, { "epoch": 0.4807877041306436, "grad_norm": 0.46362095698553785, "learning_rate": 9.890000829448145e-06, "loss": 0.0385, "step": 1001 }, { "epoch": 0.4812680115273775, "grad_norm": 0.6543145086812504, "learning_rate": 9.889417118613978e-06, "loss": 0.0683, "step": 1002 }, { "epoch": 0.4817483189241114, "grad_norm": 0.556537310472474, "learning_rate": 9.888831880461171e-06, "loss": 0.0454, "step": 1003 }, { "epoch": 0.48222862632084534, "grad_norm": 0.6944965802448689, "learning_rate": 9.888245115172535e-06, "loss": 0.0499, "step": 1004 }, { "epoch": 0.4827089337175792, "grad_norm": 1.9875426185604483, "learning_rate": 9.88765682293136e-06, "loss": 0.0623, "step": 1005 }, { "epoch": 0.48318924111431316, "grad_norm": 0.5589465167484113, "learning_rate": 9.887067003921412e-06, "loss": 0.0549, "step": 1006 }, { "epoch": 0.48366954851104704, "grad_norm": 0.5825022233011566, "learning_rate": 9.886475658326935e-06, "loss": 0.0415, "step": 1007 }, { "epoch": 0.484149855907781, "grad_norm": 0.8085519967345373, "learning_rate": 9.885882786332647e-06, "loss": 0.0618, "step": 1008 }, { "epoch": 0.4846301633045149, "grad_norm": 0.6678730905959983, "learning_rate": 9.885288388123748e-06, "loss": 0.0466, "step": 1009 }, { "epoch": 0.4851104707012488, "grad_norm": 0.6528204330501064, "learning_rate": 9.88469246388591e-06, "loss": 0.0462, "step": 1010 }, { "epoch": 0.48559077809798273, "grad_norm": 0.50393839362882, "learning_rate": 9.884095013805282e-06, "loss": 0.0418, "step": 1011 }, { "epoch": 0.4860710854947166, "grad_norm": 0.6274523697170888, "learning_rate": 9.88349603806849e-06, "loss": 0.0687, "step": 1012 }, { "epoch": 0.48655139289145055, "grad_norm": 0.6366989175368812, "learning_rate": 9.882895536862643e-06, "loss": 0.0485, "step": 1013 }, { "epoch": 0.48703170028818443, "grad_norm": 0.48888844573115503, "learning_rate": 9.882293510375314e-06, "loss": 0.0512, "step": 1014 }, { "epoch": 0.48751200768491837, "grad_norm": 0.5932068169061537, "learning_rate": 9.881689958794564e-06, "loss": 0.0488, "step": 1015 }, { "epoch": 0.48799231508165225, "grad_norm": 1.2627441745695316, "learning_rate": 9.881084882308924e-06, "loss": 0.0714, "step": 1016 }, { "epoch": 0.4884726224783862, "grad_norm": 0.9627073875348847, "learning_rate": 9.880478281107404e-06, "loss": 0.0577, "step": 1017 }, { "epoch": 0.48895292987512007, "grad_norm": 0.7088884846326223, "learning_rate": 9.87987015537949e-06, "loss": 0.0874, "step": 1018 }, { "epoch": 0.489433237271854, "grad_norm": 0.7529185355526793, "learning_rate": 9.879260505315143e-06, "loss": 0.0497, "step": 1019 }, { "epoch": 0.4899135446685879, "grad_norm": 0.6165894758520094, "learning_rate": 9.878649331104798e-06, "loss": 0.0604, "step": 1020 }, { "epoch": 0.4903938520653218, "grad_norm": 0.5042776271300216, "learning_rate": 9.878036632939374e-06, "loss": 0.047, "step": 1021 }, { "epoch": 0.4908741594620557, "grad_norm": 0.44643658788918444, "learning_rate": 9.877422411010257e-06, "loss": 0.0498, "step": 1022 }, { "epoch": 0.49135446685878964, "grad_norm": 0.5634531390244911, "learning_rate": 9.876806665509314e-06, "loss": 0.0569, "step": 1023 }, { "epoch": 0.4918347742555235, "grad_norm": 0.5619776061410058, "learning_rate": 9.876189396628889e-06, "loss": 0.0446, "step": 1024 }, { "epoch": 0.49231508165225746, "grad_norm": 0.9390023063566659, "learning_rate": 9.875570604561796e-06, "loss": 0.0792, "step": 1025 }, { "epoch": 0.49279538904899134, "grad_norm": 0.5013365650894346, "learning_rate": 9.874950289501332e-06, "loss": 0.0519, "step": 1026 }, { "epoch": 0.4932756964457253, "grad_norm": 0.46104032503551207, "learning_rate": 9.874328451641264e-06, "loss": 0.0458, "step": 1027 }, { "epoch": 0.49375600384245916, "grad_norm": 0.46235258731993134, "learning_rate": 9.873705091175838e-06, "loss": 0.0417, "step": 1028 }, { "epoch": 0.4942363112391931, "grad_norm": 0.49971536792567545, "learning_rate": 9.873080208299773e-06, "loss": 0.045, "step": 1029 }, { "epoch": 0.494716618635927, "grad_norm": 0.5340542754950781, "learning_rate": 9.872453803208268e-06, "loss": 0.0455, "step": 1030 }, { "epoch": 0.4951969260326609, "grad_norm": 0.5727266302637889, "learning_rate": 9.871825876096992e-06, "loss": 0.0461, "step": 1031 }, { "epoch": 0.4956772334293948, "grad_norm": 0.6471275463338492, "learning_rate": 9.871196427162094e-06, "loss": 0.0531, "step": 1032 }, { "epoch": 0.49615754082612873, "grad_norm": 0.8462618075680615, "learning_rate": 9.870565456600194e-06, "loss": 0.0913, "step": 1033 }, { "epoch": 0.4966378482228626, "grad_norm": 0.6248139766098908, "learning_rate": 9.869932964608392e-06, "loss": 0.0488, "step": 1034 }, { "epoch": 0.49711815561959655, "grad_norm": 0.5031224522406227, "learning_rate": 9.86929895138426e-06, "loss": 0.0463, "step": 1035 }, { "epoch": 0.49759846301633043, "grad_norm": 0.6015714100591678, "learning_rate": 9.868663417125849e-06, "loss": 0.0543, "step": 1036 }, { "epoch": 0.49807877041306436, "grad_norm": 0.7015166961244664, "learning_rate": 9.868026362031676e-06, "loss": 0.0577, "step": 1037 }, { "epoch": 0.49855907780979825, "grad_norm": 0.5811321733944121, "learning_rate": 9.867387786300743e-06, "loss": 0.046, "step": 1038 }, { "epoch": 0.4990393852065322, "grad_norm": 0.9085434522653888, "learning_rate": 9.866747690132527e-06, "loss": 0.073, "step": 1039 }, { "epoch": 0.49951969260326606, "grad_norm": 0.6591822101096578, "learning_rate": 9.866106073726971e-06, "loss": 0.0567, "step": 1040 }, { "epoch": 0.5, "grad_norm": 0.835558730775927, "learning_rate": 9.865462937284501e-06, "loss": 0.0535, "step": 1041 }, { "epoch": 0.5004803073967339, "grad_norm": 0.5892727698062462, "learning_rate": 9.864818281006013e-06, "loss": 0.0439, "step": 1042 }, { "epoch": 0.5009606147934679, "grad_norm": 0.8395833865727974, "learning_rate": 9.86417210509288e-06, "loss": 0.0718, "step": 1043 }, { "epoch": 0.5014409221902018, "grad_norm": 1.1531297612195082, "learning_rate": 9.86352440974695e-06, "loss": 0.0701, "step": 1044 }, { "epoch": 0.5019212295869356, "grad_norm": 0.5682920886173015, "learning_rate": 9.862875195170547e-06, "loss": 0.0478, "step": 1045 }, { "epoch": 0.5024015369836695, "grad_norm": 0.5479631332409276, "learning_rate": 9.862224461566467e-06, "loss": 0.0481, "step": 1046 }, { "epoch": 0.5028818443804035, "grad_norm": 0.6686914490763103, "learning_rate": 9.861572209137978e-06, "loss": 0.054, "step": 1047 }, { "epoch": 0.5033621517771374, "grad_norm": 0.5548966562906734, "learning_rate": 9.860918438088828e-06, "loss": 0.058, "step": 1048 }, { "epoch": 0.5038424591738713, "grad_norm": 0.6221834543978872, "learning_rate": 9.860263148623238e-06, "loss": 0.0663, "step": 1049 }, { "epoch": 0.5043227665706052, "grad_norm": 0.4096266159270642, "learning_rate": 9.859606340945904e-06, "loss": 0.0339, "step": 1050 }, { "epoch": 0.5048030739673391, "grad_norm": 1.091503069790824, "learning_rate": 9.858948015261988e-06, "loss": 0.0628, "step": 1051 }, { "epoch": 0.505283381364073, "grad_norm": 0.6683717006752189, "learning_rate": 9.858288171777137e-06, "loss": 0.0588, "step": 1052 }, { "epoch": 0.5057636887608069, "grad_norm": 0.6905695109286866, "learning_rate": 9.857626810697468e-06, "loss": 0.0633, "step": 1053 }, { "epoch": 0.5062439961575408, "grad_norm": 0.5075936442073656, "learning_rate": 9.85696393222957e-06, "loss": 0.0475, "step": 1054 }, { "epoch": 0.5067243035542748, "grad_norm": 0.59460142658833, "learning_rate": 9.856299536580511e-06, "loss": 0.0494, "step": 1055 }, { "epoch": 0.5072046109510087, "grad_norm": 1.6512164384609662, "learning_rate": 9.855633623957828e-06, "loss": 0.0727, "step": 1056 }, { "epoch": 0.5076849183477425, "grad_norm": 0.5580012014124549, "learning_rate": 9.854966194569533e-06, "loss": 0.0441, "step": 1057 }, { "epoch": 0.5081652257444764, "grad_norm": 0.5763621029835471, "learning_rate": 9.854297248624113e-06, "loss": 0.0477, "step": 1058 }, { "epoch": 0.5086455331412104, "grad_norm": 0.8992511423911591, "learning_rate": 9.853626786330529e-06, "loss": 0.0812, "step": 1059 }, { "epoch": 0.5091258405379443, "grad_norm": 0.5368922969181873, "learning_rate": 9.852954807898212e-06, "loss": 0.0467, "step": 1060 }, { "epoch": 0.5096061479346782, "grad_norm": 0.44033960156979046, "learning_rate": 9.852281313537074e-06, "loss": 0.0426, "step": 1061 }, { "epoch": 0.5100864553314121, "grad_norm": 0.46569533955278264, "learning_rate": 9.851606303457492e-06, "loss": 0.0423, "step": 1062 }, { "epoch": 0.510566762728146, "grad_norm": 0.6709478469641016, "learning_rate": 9.850929777870324e-06, "loss": 0.0635, "step": 1063 }, { "epoch": 0.5110470701248799, "grad_norm": 0.7589418223076211, "learning_rate": 9.850251736986895e-06, "loss": 0.0485, "step": 1064 }, { "epoch": 0.5115273775216138, "grad_norm": 0.5039707034476073, "learning_rate": 9.849572181019008e-06, "loss": 0.0396, "step": 1065 }, { "epoch": 0.5120076849183477, "grad_norm": 0.665169242046722, "learning_rate": 9.848891110178936e-06, "loss": 0.056, "step": 1066 }, { "epoch": 0.5124879923150817, "grad_norm": 0.7006135018748972, "learning_rate": 9.848208524679426e-06, "loss": 0.0545, "step": 1067 }, { "epoch": 0.5129682997118156, "grad_norm": 1.371433470057797, "learning_rate": 9.847524424733701e-06, "loss": 0.0592, "step": 1068 }, { "epoch": 0.5134486071085494, "grad_norm": 0.5371431774356002, "learning_rate": 9.846838810555454e-06, "loss": 0.0481, "step": 1069 }, { "epoch": 0.5139289145052833, "grad_norm": 0.9546711896225505, "learning_rate": 9.846151682358853e-06, "loss": 0.0605, "step": 1070 }, { "epoch": 0.5144092219020173, "grad_norm": 0.4469628087936052, "learning_rate": 9.845463040358538e-06, "loss": 0.0461, "step": 1071 }, { "epoch": 0.5148895292987512, "grad_norm": 0.7337259088697539, "learning_rate": 9.84477288476962e-06, "loss": 0.0344, "step": 1072 }, { "epoch": 0.5153698366954851, "grad_norm": 0.6176921441355193, "learning_rate": 9.844081215807684e-06, "loss": 0.0554, "step": 1073 }, { "epoch": 0.515850144092219, "grad_norm": 0.7254549945035398, "learning_rate": 9.843388033688789e-06, "loss": 0.0406, "step": 1074 }, { "epoch": 0.516330451488953, "grad_norm": 0.6081358712579048, "learning_rate": 9.842693338629468e-06, "loss": 0.0607, "step": 1075 }, { "epoch": 0.5168107588856868, "grad_norm": 0.5697699264079668, "learning_rate": 9.84199713084672e-06, "loss": 0.0693, "step": 1076 }, { "epoch": 0.5172910662824207, "grad_norm": 0.3687520833244058, "learning_rate": 9.841299410558026e-06, "loss": 0.035, "step": 1077 }, { "epoch": 0.5177713736791547, "grad_norm": 0.6188868823780803, "learning_rate": 9.840600177981331e-06, "loss": 0.0378, "step": 1078 }, { "epoch": 0.5182516810758886, "grad_norm": 0.44919065810406017, "learning_rate": 9.839899433335059e-06, "loss": 0.0569, "step": 1079 }, { "epoch": 0.5187319884726225, "grad_norm": 0.525069381664302, "learning_rate": 9.839197176838102e-06, "loss": 0.0534, "step": 1080 }, { "epoch": 0.5192122958693564, "grad_norm": 0.5212151097899781, "learning_rate": 9.838493408709823e-06, "loss": 0.051, "step": 1081 }, { "epoch": 0.5196926032660903, "grad_norm": 0.6605149345379034, "learning_rate": 9.837788129170063e-06, "loss": 0.076, "step": 1082 }, { "epoch": 0.5201729106628242, "grad_norm": 0.7131108392835402, "learning_rate": 9.83708133843913e-06, "loss": 0.0723, "step": 1083 }, { "epoch": 0.5206532180595581, "grad_norm": 0.5238463836985972, "learning_rate": 9.836373036737805e-06, "loss": 0.0476, "step": 1084 }, { "epoch": 0.521133525456292, "grad_norm": 0.4394705568289204, "learning_rate": 9.835663224287343e-06, "loss": 0.052, "step": 1085 }, { "epoch": 0.521613832853026, "grad_norm": 0.6627060321697689, "learning_rate": 9.834951901309473e-06, "loss": 0.0514, "step": 1086 }, { "epoch": 0.5220941402497599, "grad_norm": 0.6047813561651657, "learning_rate": 9.834239068026388e-06, "loss": 0.0662, "step": 1087 }, { "epoch": 0.5225744476464937, "grad_norm": 0.608811144599817, "learning_rate": 9.83352472466076e-06, "loss": 0.046, "step": 1088 }, { "epoch": 0.5230547550432276, "grad_norm": 1.158801190471405, "learning_rate": 9.832808871435728e-06, "loss": 0.0524, "step": 1089 }, { "epoch": 0.5235350624399616, "grad_norm": 0.46635581913051444, "learning_rate": 9.832091508574906e-06, "loss": 0.0449, "step": 1090 }, { "epoch": 0.5240153698366955, "grad_norm": 0.48367373014821347, "learning_rate": 9.831372636302379e-06, "loss": 0.0458, "step": 1091 }, { "epoch": 0.5244956772334294, "grad_norm": 0.361218140616094, "learning_rate": 9.8306522548427e-06, "loss": 0.04, "step": 1092 }, { "epoch": 0.5249759846301633, "grad_norm": 0.655690358593338, "learning_rate": 9.829930364420902e-06, "loss": 0.0554, "step": 1093 }, { "epoch": 0.5254562920268973, "grad_norm": 0.7505215320060055, "learning_rate": 9.829206965262477e-06, "loss": 0.0743, "step": 1094 }, { "epoch": 0.5259365994236311, "grad_norm": 0.5750818638699632, "learning_rate": 9.828482057593397e-06, "loss": 0.0576, "step": 1095 }, { "epoch": 0.526416906820365, "grad_norm": 0.47388206139707323, "learning_rate": 9.827755641640105e-06, "loss": 0.0379, "step": 1096 }, { "epoch": 0.5268972142170989, "grad_norm": 0.5077311880425335, "learning_rate": 9.827027717629511e-06, "loss": 0.0623, "step": 1097 }, { "epoch": 0.5273775216138329, "grad_norm": 0.45204043027410235, "learning_rate": 9.826298285789002e-06, "loss": 0.0398, "step": 1098 }, { "epoch": 0.5278578290105668, "grad_norm": 0.7965027074583484, "learning_rate": 9.825567346346427e-06, "loss": 0.0678, "step": 1099 }, { "epoch": 0.5283381364073007, "grad_norm": 0.6917329847032074, "learning_rate": 9.824834899530116e-06, "loss": 0.0619, "step": 1100 }, { "epoch": 0.5288184438040345, "grad_norm": 0.5524043158252445, "learning_rate": 9.824100945568862e-06, "loss": 0.0465, "step": 1101 }, { "epoch": 0.5292987512007685, "grad_norm": 0.5638562567517995, "learning_rate": 9.823365484691933e-06, "loss": 0.0466, "step": 1102 }, { "epoch": 0.5297790585975024, "grad_norm": 0.8316442556169921, "learning_rate": 9.822628517129067e-06, "loss": 0.0881, "step": 1103 }, { "epoch": 0.5302593659942363, "grad_norm": 0.5548828049736547, "learning_rate": 9.821890043110471e-06, "loss": 0.0518, "step": 1104 }, { "epoch": 0.5307396733909702, "grad_norm": 0.41550027200381046, "learning_rate": 9.821150062866826e-06, "loss": 0.0408, "step": 1105 }, { "epoch": 0.5312199807877042, "grad_norm": 0.5895290692704226, "learning_rate": 9.82040857662928e-06, "loss": 0.051, "step": 1106 }, { "epoch": 0.531700288184438, "grad_norm": 0.5248740479596763, "learning_rate": 9.819665584629453e-06, "loss": 0.0561, "step": 1107 }, { "epoch": 0.5321805955811719, "grad_norm": 0.6505572311350228, "learning_rate": 9.818921087099435e-06, "loss": 0.059, "step": 1108 }, { "epoch": 0.5326609029779059, "grad_norm": 0.6556567838840428, "learning_rate": 9.818175084271786e-06, "loss": 0.0559, "step": 1109 }, { "epoch": 0.5331412103746398, "grad_norm": 0.5382916731099718, "learning_rate": 9.817427576379536e-06, "loss": 0.0485, "step": 1110 }, { "epoch": 0.5336215177713737, "grad_norm": 0.3758870057448877, "learning_rate": 9.81667856365619e-06, "loss": 0.0496, "step": 1111 }, { "epoch": 0.5341018251681076, "grad_norm": 0.6578831277841793, "learning_rate": 9.815928046335713e-06, "loss": 0.0557, "step": 1112 }, { "epoch": 0.5345821325648416, "grad_norm": 1.060321357406775, "learning_rate": 9.81517602465255e-06, "loss": 0.0532, "step": 1113 }, { "epoch": 0.5350624399615754, "grad_norm": 0.5103225144908645, "learning_rate": 9.81442249884161e-06, "loss": 0.035, "step": 1114 }, { "epoch": 0.5355427473583093, "grad_norm": 0.37825548058600067, "learning_rate": 9.813667469138273e-06, "loss": 0.0429, "step": 1115 }, { "epoch": 0.5360230547550432, "grad_norm": 0.805237319756677, "learning_rate": 9.812910935778393e-06, "loss": 0.068, "step": 1116 }, { "epoch": 0.5365033621517772, "grad_norm": 0.42363680651332974, "learning_rate": 9.812152898998286e-06, "loss": 0.0469, "step": 1117 }, { "epoch": 0.5369836695485111, "grad_norm": 0.6106525825949898, "learning_rate": 9.811393359034742e-06, "loss": 0.0542, "step": 1118 }, { "epoch": 0.537463976945245, "grad_norm": 0.8001537869444254, "learning_rate": 9.810632316125023e-06, "loss": 0.0565, "step": 1119 }, { "epoch": 0.5379442843419788, "grad_norm": 0.7262847802024673, "learning_rate": 9.809869770506855e-06, "loss": 0.0695, "step": 1120 }, { "epoch": 0.5384245917387128, "grad_norm": 1.1087961916090308, "learning_rate": 9.80910572241844e-06, "loss": 0.0657, "step": 1121 }, { "epoch": 0.5389048991354467, "grad_norm": 0.41628123944641576, "learning_rate": 9.80834017209844e-06, "loss": 0.0463, "step": 1122 }, { "epoch": 0.5393852065321806, "grad_norm": 0.5808409195734693, "learning_rate": 9.807573119785995e-06, "loss": 0.0606, "step": 1123 }, { "epoch": 0.5398655139289145, "grad_norm": 0.49580631030323374, "learning_rate": 9.806804565720712e-06, "loss": 0.0302, "step": 1124 }, { "epoch": 0.5403458213256485, "grad_norm": 1.0814853755104163, "learning_rate": 9.806034510142664e-06, "loss": 0.0675, "step": 1125 }, { "epoch": 0.5408261287223823, "grad_norm": 0.549003216277453, "learning_rate": 9.805262953292395e-06, "loss": 0.0533, "step": 1126 }, { "epoch": 0.5413064361191162, "grad_norm": 0.5609756437216896, "learning_rate": 9.80448989541092e-06, "loss": 0.057, "step": 1127 }, { "epoch": 0.5417867435158501, "grad_norm": 0.9338428086220139, "learning_rate": 9.803715336739717e-06, "loss": 0.0834, "step": 1128 }, { "epoch": 0.5422670509125841, "grad_norm": 0.5122018899106414, "learning_rate": 9.802939277520742e-06, "loss": 0.0484, "step": 1129 }, { "epoch": 0.542747358309318, "grad_norm": 0.5809571628506907, "learning_rate": 9.80216171799641e-06, "loss": 0.0446, "step": 1130 }, { "epoch": 0.5432276657060519, "grad_norm": 0.5453610787253219, "learning_rate": 9.801382658409611e-06, "loss": 0.0605, "step": 1131 }, { "epoch": 0.5437079731027857, "grad_norm": 0.45815751598603865, "learning_rate": 9.800602099003702e-06, "loss": 0.0441, "step": 1132 }, { "epoch": 0.5441882804995197, "grad_norm": 0.618974171564184, "learning_rate": 9.799820040022507e-06, "loss": 0.0565, "step": 1133 }, { "epoch": 0.5446685878962536, "grad_norm": 0.577552914326065, "learning_rate": 9.79903648171032e-06, "loss": 0.0625, "step": 1134 }, { "epoch": 0.5451488952929875, "grad_norm": 0.674317185574733, "learning_rate": 9.798251424311904e-06, "loss": 0.0601, "step": 1135 }, { "epoch": 0.5456292026897214, "grad_norm": 0.5368692747640629, "learning_rate": 9.797464868072489e-06, "loss": 0.0425, "step": 1136 }, { "epoch": 0.5461095100864554, "grad_norm": 0.8059063068692716, "learning_rate": 9.79667681323777e-06, "loss": 0.0607, "step": 1137 }, { "epoch": 0.5465898174831892, "grad_norm": 0.44591141730781386, "learning_rate": 9.795887260053918e-06, "loss": 0.0458, "step": 1138 }, { "epoch": 0.5470701248799231, "grad_norm": 0.49579824081441304, "learning_rate": 9.795096208767565e-06, "loss": 0.0417, "step": 1139 }, { "epoch": 0.547550432276657, "grad_norm": 0.9974120235484584, "learning_rate": 9.794303659625815e-06, "loss": 0.067, "step": 1140 }, { "epoch": 0.548030739673391, "grad_norm": 0.6293616673344423, "learning_rate": 9.793509612876237e-06, "loss": 0.0574, "step": 1141 }, { "epoch": 0.5485110470701249, "grad_norm": 0.5069087987447661, "learning_rate": 9.792714068766872e-06, "loss": 0.0507, "step": 1142 }, { "epoch": 0.5489913544668588, "grad_norm": 0.44270498114612894, "learning_rate": 9.791917027546223e-06, "loss": 0.0511, "step": 1143 }, { "epoch": 0.5494716618635928, "grad_norm": 0.5614291657476101, "learning_rate": 9.791118489463265e-06, "loss": 0.0482, "step": 1144 }, { "epoch": 0.5499519692603266, "grad_norm": 0.6022614849565968, "learning_rate": 9.790318454767438e-06, "loss": 0.0433, "step": 1145 }, { "epoch": 0.5504322766570605, "grad_norm": 0.4960237823747353, "learning_rate": 9.78951692370865e-06, "loss": 0.0618, "step": 1146 }, { "epoch": 0.5509125840537944, "grad_norm": 0.7042914518317633, "learning_rate": 9.78871389653728e-06, "loss": 0.058, "step": 1147 }, { "epoch": 0.5513928914505284, "grad_norm": 0.6312566271413287, "learning_rate": 9.787909373504172e-06, "loss": 0.0429, "step": 1148 }, { "epoch": 0.5518731988472623, "grad_norm": 0.39432463738714213, "learning_rate": 9.787103354860633e-06, "loss": 0.0346, "step": 1149 }, { "epoch": 0.5523535062439962, "grad_norm": 0.9311799513518504, "learning_rate": 9.786295840858444e-06, "loss": 0.0658, "step": 1150 }, { "epoch": 0.55283381364073, "grad_norm": 0.5007348827746324, "learning_rate": 9.785486831749847e-06, "loss": 0.0452, "step": 1151 }, { "epoch": 0.553314121037464, "grad_norm": 0.49962024985272985, "learning_rate": 9.784676327787557e-06, "loss": 0.0568, "step": 1152 }, { "epoch": 0.5537944284341979, "grad_norm": 0.3691640957902528, "learning_rate": 9.783864329224752e-06, "loss": 0.0451, "step": 1153 }, { "epoch": 0.5542747358309318, "grad_norm": 0.413567800244177, "learning_rate": 9.783050836315078e-06, "loss": 0.0472, "step": 1154 }, { "epoch": 0.5547550432276657, "grad_norm": 0.39518046681000146, "learning_rate": 9.782235849312647e-06, "loss": 0.041, "step": 1155 }, { "epoch": 0.5552353506243997, "grad_norm": 0.49523548669900735, "learning_rate": 9.781419368472039e-06, "loss": 0.0441, "step": 1156 }, { "epoch": 0.5557156580211335, "grad_norm": 0.6024044401609281, "learning_rate": 9.7806013940483e-06, "loss": 0.0536, "step": 1157 }, { "epoch": 0.5561959654178674, "grad_norm": 1.2088324636184875, "learning_rate": 9.779781926296942e-06, "loss": 0.0708, "step": 1158 }, { "epoch": 0.5566762728146013, "grad_norm": 0.6166764403933446, "learning_rate": 9.778960965473945e-06, "loss": 0.0482, "step": 1159 }, { "epoch": 0.5571565802113353, "grad_norm": 0.9288628974217886, "learning_rate": 9.778138511835753e-06, "loss": 0.0672, "step": 1160 }, { "epoch": 0.5576368876080692, "grad_norm": 1.2330701366219725, "learning_rate": 9.77731456563928e-06, "loss": 0.0558, "step": 1161 }, { "epoch": 0.5581171950048031, "grad_norm": 0.4791648541731939, "learning_rate": 9.776489127141902e-06, "loss": 0.043, "step": 1162 }, { "epoch": 0.5585975024015369, "grad_norm": 0.5709504036661929, "learning_rate": 9.775662196601464e-06, "loss": 0.0481, "step": 1163 }, { "epoch": 0.5590778097982709, "grad_norm": 0.6468021810734045, "learning_rate": 9.774833774276278e-06, "loss": 0.0541, "step": 1164 }, { "epoch": 0.5595581171950048, "grad_norm": 0.46892414876617317, "learning_rate": 9.774003860425116e-06, "loss": 0.0408, "step": 1165 }, { "epoch": 0.5600384245917387, "grad_norm": 0.7110924147202355, "learning_rate": 9.773172455307223e-06, "loss": 0.0869, "step": 1166 }, { "epoch": 0.5605187319884726, "grad_norm": 0.5925406633330682, "learning_rate": 9.772339559182307e-06, "loss": 0.038, "step": 1167 }, { "epoch": 0.5609990393852066, "grad_norm": 0.4344853797907911, "learning_rate": 9.77150517231054e-06, "loss": 0.0448, "step": 1168 }, { "epoch": 0.5614793467819404, "grad_norm": 1.5378334777798277, "learning_rate": 9.770669294952562e-06, "loss": 0.0455, "step": 1169 }, { "epoch": 0.5619596541786743, "grad_norm": 0.661380367146787, "learning_rate": 9.76983192736948e-06, "loss": 0.0646, "step": 1170 }, { "epoch": 0.5624399615754082, "grad_norm": 0.6292936111066385, "learning_rate": 9.768993069822862e-06, "loss": 0.0543, "step": 1171 }, { "epoch": 0.5629202689721422, "grad_norm": 0.5168700137660031, "learning_rate": 9.768152722574747e-06, "loss": 0.0482, "step": 1172 }, { "epoch": 0.5634005763688761, "grad_norm": 0.48793915527548615, "learning_rate": 9.76731088588763e-06, "loss": 0.052, "step": 1173 }, { "epoch": 0.56388088376561, "grad_norm": 0.8671102690781609, "learning_rate": 9.766467560024485e-06, "loss": 0.07, "step": 1174 }, { "epoch": 0.5643611911623438, "grad_norm": 0.5202999857616358, "learning_rate": 9.765622745248739e-06, "loss": 0.0541, "step": 1175 }, { "epoch": 0.5648414985590778, "grad_norm": 1.2213889314313011, "learning_rate": 9.76477644182429e-06, "loss": 0.045, "step": 1176 }, { "epoch": 0.5653218059558117, "grad_norm": 0.5437848268666584, "learning_rate": 9.7639286500155e-06, "loss": 0.0434, "step": 1177 }, { "epoch": 0.5658021133525456, "grad_norm": 0.46403176390550605, "learning_rate": 9.763079370087196e-06, "loss": 0.0352, "step": 1178 }, { "epoch": 0.5662824207492796, "grad_norm": 0.7171751460163477, "learning_rate": 9.762228602304667e-06, "loss": 0.0578, "step": 1179 }, { "epoch": 0.5667627281460135, "grad_norm": 0.48339364673648727, "learning_rate": 9.761376346933672e-06, "loss": 0.0464, "step": 1180 }, { "epoch": 0.5672430355427474, "grad_norm": 0.4414131140482316, "learning_rate": 9.760522604240434e-06, "loss": 0.0356, "step": 1181 }, { "epoch": 0.5677233429394812, "grad_norm": 0.995263931495515, "learning_rate": 9.759667374491632e-06, "loss": 0.064, "step": 1182 }, { "epoch": 0.5682036503362152, "grad_norm": 0.6310705676300887, "learning_rate": 9.758810657954424e-06, "loss": 0.0643, "step": 1183 }, { "epoch": 0.5686839577329491, "grad_norm": 0.8865712295477853, "learning_rate": 9.757952454896418e-06, "loss": 0.0707, "step": 1184 }, { "epoch": 0.569164265129683, "grad_norm": 0.44330618752194106, "learning_rate": 9.757092765585695e-06, "loss": 0.0427, "step": 1185 }, { "epoch": 0.5696445725264169, "grad_norm": 0.5101911660662402, "learning_rate": 9.7562315902908e-06, "loss": 0.0612, "step": 1186 }, { "epoch": 0.5701248799231509, "grad_norm": 0.3634609856460933, "learning_rate": 9.755368929280738e-06, "loss": 0.0359, "step": 1187 }, { "epoch": 0.5706051873198847, "grad_norm": 0.8929703488376083, "learning_rate": 9.754504782824982e-06, "loss": 0.0608, "step": 1188 }, { "epoch": 0.5710854947166186, "grad_norm": 0.7621720458573135, "learning_rate": 9.753639151193468e-06, "loss": 0.0747, "step": 1189 }, { "epoch": 0.5715658021133525, "grad_norm": 0.4562290573979324, "learning_rate": 9.752772034656593e-06, "loss": 0.0453, "step": 1190 }, { "epoch": 0.5720461095100865, "grad_norm": 0.454374886984673, "learning_rate": 9.75190343348522e-06, "loss": 0.046, "step": 1191 }, { "epoch": 0.5725264169068204, "grad_norm": 0.4985067047730674, "learning_rate": 9.75103334795068e-06, "loss": 0.0465, "step": 1192 }, { "epoch": 0.5730067243035543, "grad_norm": 0.62518531188208, "learning_rate": 9.750161778324759e-06, "loss": 0.0546, "step": 1193 }, { "epoch": 0.5734870317002881, "grad_norm": 0.5218968728591356, "learning_rate": 9.749288724879716e-06, "loss": 0.0451, "step": 1194 }, { "epoch": 0.5739673390970221, "grad_norm": 0.520699846944625, "learning_rate": 9.748414187888262e-06, "loss": 0.0511, "step": 1195 }, { "epoch": 0.574447646493756, "grad_norm": 0.7212870071975475, "learning_rate": 9.747538167623585e-06, "loss": 0.0539, "step": 1196 }, { "epoch": 0.5749279538904899, "grad_norm": 0.41354709356039293, "learning_rate": 9.746660664359326e-06, "loss": 0.0348, "step": 1197 }, { "epoch": 0.5754082612872238, "grad_norm": 0.47323694276958017, "learning_rate": 9.745781678369594e-06, "loss": 0.0342, "step": 1198 }, { "epoch": 0.5758885686839578, "grad_norm": 0.7798319365982702, "learning_rate": 9.744901209928959e-06, "loss": 0.0386, "step": 1199 }, { "epoch": 0.5763688760806917, "grad_norm": 0.41388948108418533, "learning_rate": 9.744019259312454e-06, "loss": 0.0404, "step": 1200 }, { "epoch": 0.5768491834774255, "grad_norm": 0.45590776454332793, "learning_rate": 9.74313582679558e-06, "loss": 0.059, "step": 1201 }, { "epoch": 0.5773294908741594, "grad_norm": 0.8624611717777299, "learning_rate": 9.742250912654292e-06, "loss": 0.0573, "step": 1202 }, { "epoch": 0.5778097982708934, "grad_norm": 0.4105792475406539, "learning_rate": 9.741364517165017e-06, "loss": 0.0474, "step": 1203 }, { "epoch": 0.5782901056676273, "grad_norm": 0.4030691281347931, "learning_rate": 9.740476640604637e-06, "loss": 0.0533, "step": 1204 }, { "epoch": 0.5787704130643612, "grad_norm": 0.8208559639247031, "learning_rate": 9.7395872832505e-06, "loss": 0.0491, "step": 1205 }, { "epoch": 0.579250720461095, "grad_norm": 0.6089308840844816, "learning_rate": 9.73869644538042e-06, "loss": 0.069, "step": 1206 }, { "epoch": 0.579731027857829, "grad_norm": 0.6462057166604767, "learning_rate": 9.737804127272668e-06, "loss": 0.051, "step": 1207 }, { "epoch": 0.5802113352545629, "grad_norm": 0.4269852827384148, "learning_rate": 9.73691032920598e-06, "loss": 0.0434, "step": 1208 }, { "epoch": 0.5806916426512968, "grad_norm": 0.4245254269716441, "learning_rate": 9.736015051459551e-06, "loss": 0.0372, "step": 1209 }, { "epoch": 0.5811719500480308, "grad_norm": 0.3532506765163264, "learning_rate": 9.735118294313045e-06, "loss": 0.0466, "step": 1210 }, { "epoch": 0.5816522574447647, "grad_norm": 0.34262000331545467, "learning_rate": 9.734220058046582e-06, "loss": 0.0356, "step": 1211 }, { "epoch": 0.5821325648414986, "grad_norm": 0.5754600784602188, "learning_rate": 9.733320342940747e-06, "loss": 0.0543, "step": 1212 }, { "epoch": 0.5826128722382324, "grad_norm": 0.5170774377794336, "learning_rate": 9.732419149276586e-06, "loss": 0.0558, "step": 1213 }, { "epoch": 0.5830931796349664, "grad_norm": 0.5109210528826162, "learning_rate": 9.731516477335607e-06, "loss": 0.062, "step": 1214 }, { "epoch": 0.5835734870317003, "grad_norm": 0.4985819768058242, "learning_rate": 9.73061232739978e-06, "loss": 0.0529, "step": 1215 }, { "epoch": 0.5840537944284342, "grad_norm": 0.5498108762960605, "learning_rate": 9.729706699751535e-06, "loss": 0.0517, "step": 1216 }, { "epoch": 0.5845341018251681, "grad_norm": 0.7920509732214022, "learning_rate": 9.728799594673766e-06, "loss": 0.0478, "step": 1217 }, { "epoch": 0.5850144092219021, "grad_norm": 0.5652672810126951, "learning_rate": 9.727891012449827e-06, "loss": 0.0525, "step": 1218 }, { "epoch": 0.585494716618636, "grad_norm": 0.5691398549888063, "learning_rate": 9.726980953363536e-06, "loss": 0.0496, "step": 1219 }, { "epoch": 0.5859750240153698, "grad_norm": 0.5253745870416723, "learning_rate": 9.726069417699167e-06, "loss": 0.0483, "step": 1220 }, { "epoch": 0.5864553314121037, "grad_norm": 0.6773929868480058, "learning_rate": 9.725156405741461e-06, "loss": 0.0455, "step": 1221 }, { "epoch": 0.5869356388088377, "grad_norm": 0.8365763274612497, "learning_rate": 9.724241917775616e-06, "loss": 0.0619, "step": 1222 }, { "epoch": 0.5874159462055716, "grad_norm": 0.6630891954815319, "learning_rate": 9.723325954087294e-06, "loss": 0.064, "step": 1223 }, { "epoch": 0.5878962536023055, "grad_norm": 0.34729186105181487, "learning_rate": 9.722408514962619e-06, "loss": 0.0353, "step": 1224 }, { "epoch": 0.5883765609990393, "grad_norm": 0.44671842700761955, "learning_rate": 9.721489600688168e-06, "loss": 0.0442, "step": 1225 }, { "epoch": 0.5888568683957733, "grad_norm": 0.44380993259781293, "learning_rate": 9.720569211550988e-06, "loss": 0.0401, "step": 1226 }, { "epoch": 0.5893371757925072, "grad_norm": 0.7426688381698872, "learning_rate": 9.719647347838584e-06, "loss": 0.0536, "step": 1227 }, { "epoch": 0.5898174831892411, "grad_norm": 0.6909672290678518, "learning_rate": 9.718724009838917e-06, "loss": 0.0488, "step": 1228 }, { "epoch": 0.590297790585975, "grad_norm": 0.5801124250465916, "learning_rate": 9.717799197840416e-06, "loss": 0.0559, "step": 1229 }, { "epoch": 0.590778097982709, "grad_norm": 0.38585748356811145, "learning_rate": 9.716872912131964e-06, "loss": 0.0387, "step": 1230 }, { "epoch": 0.5912584053794429, "grad_norm": 0.5037471388123007, "learning_rate": 9.715945153002908e-06, "loss": 0.0478, "step": 1231 }, { "epoch": 0.5917387127761767, "grad_norm": 0.42301309786103813, "learning_rate": 9.715015920743056e-06, "loss": 0.0441, "step": 1232 }, { "epoch": 0.5922190201729106, "grad_norm": 0.5895965832076508, "learning_rate": 9.714085215642672e-06, "loss": 0.0442, "step": 1233 }, { "epoch": 0.5926993275696446, "grad_norm": 0.6111729772870694, "learning_rate": 9.713153037992484e-06, "loss": 0.0494, "step": 1234 }, { "epoch": 0.5931796349663785, "grad_norm": 0.6116675844026525, "learning_rate": 9.712219388083676e-06, "loss": 0.0578, "step": 1235 }, { "epoch": 0.5936599423631124, "grad_norm": 0.46491825617527227, "learning_rate": 9.711284266207899e-06, "loss": 0.0564, "step": 1236 }, { "epoch": 0.5941402497598463, "grad_norm": 0.952450984803784, "learning_rate": 9.710347672657254e-06, "loss": 0.046, "step": 1237 }, { "epoch": 0.5946205571565802, "grad_norm": 0.6313323004545888, "learning_rate": 9.70940960772431e-06, "loss": 0.0452, "step": 1238 }, { "epoch": 0.5951008645533141, "grad_norm": 0.5509216833295507, "learning_rate": 9.708470071702094e-06, "loss": 0.0517, "step": 1239 }, { "epoch": 0.595581171950048, "grad_norm": 1.043696226103975, "learning_rate": 9.707529064884087e-06, "loss": 0.0586, "step": 1240 }, { "epoch": 0.5960614793467819, "grad_norm": 0.5486854474154073, "learning_rate": 9.706586587564236e-06, "loss": 0.057, "step": 1241 }, { "epoch": 0.5965417867435159, "grad_norm": 0.633733102374295, "learning_rate": 9.705642640036945e-06, "loss": 0.0548, "step": 1242 }, { "epoch": 0.5970220941402498, "grad_norm": 0.4966097609578802, "learning_rate": 9.704697222597074e-06, "loss": 0.0428, "step": 1243 }, { "epoch": 0.5975024015369836, "grad_norm": 0.5824655077330811, "learning_rate": 9.703750335539952e-06, "loss": 0.0565, "step": 1244 }, { "epoch": 0.5979827089337176, "grad_norm": 0.563546323378018, "learning_rate": 9.702801979161353e-06, "loss": 0.0543, "step": 1245 }, { "epoch": 0.5984630163304515, "grad_norm": 0.5563049946399633, "learning_rate": 9.70185215375752e-06, "loss": 0.049, "step": 1246 }, { "epoch": 0.5989433237271854, "grad_norm": 0.3744869072214263, "learning_rate": 9.700900859625155e-06, "loss": 0.032, "step": 1247 }, { "epoch": 0.5994236311239193, "grad_norm": 0.5977503365912346, "learning_rate": 9.699948097061412e-06, "loss": 0.0598, "step": 1248 }, { "epoch": 0.5999039385206533, "grad_norm": 0.7030336814456123, "learning_rate": 9.69899386636391e-06, "loss": 0.0446, "step": 1249 }, { "epoch": 0.6003842459173871, "grad_norm": 0.4669080970568829, "learning_rate": 9.698038167830722e-06, "loss": 0.0515, "step": 1250 }, { "epoch": 0.600864553314121, "grad_norm": 0.49830925851066343, "learning_rate": 9.697081001760384e-06, "loss": 0.0576, "step": 1251 }, { "epoch": 0.6013448607108549, "grad_norm": 0.6686926230527453, "learning_rate": 9.696122368451887e-06, "loss": 0.0399, "step": 1252 }, { "epoch": 0.6018251681075889, "grad_norm": 0.6340142308992163, "learning_rate": 9.695162268204681e-06, "loss": 0.0792, "step": 1253 }, { "epoch": 0.6023054755043228, "grad_norm": 0.532128653279977, "learning_rate": 9.694200701318679e-06, "loss": 0.041, "step": 1254 }, { "epoch": 0.6027857829010567, "grad_norm": 0.5065554349402721, "learning_rate": 9.693237668094242e-06, "loss": 0.0466, "step": 1255 }, { "epoch": 0.6032660902977905, "grad_norm": 0.5153614448612733, "learning_rate": 9.692273168832198e-06, "loss": 0.054, "step": 1256 }, { "epoch": 0.6037463976945245, "grad_norm": 0.5353390841759837, "learning_rate": 9.69130720383383e-06, "loss": 0.0374, "step": 1257 }, { "epoch": 0.6042267050912584, "grad_norm": 0.639129347628516, "learning_rate": 9.690339773400876e-06, "loss": 0.0432, "step": 1258 }, { "epoch": 0.6047070124879923, "grad_norm": 0.58928331069562, "learning_rate": 9.689370877835538e-06, "loss": 0.0357, "step": 1259 }, { "epoch": 0.6051873198847262, "grad_norm": 0.5336886529718406, "learning_rate": 9.688400517440471e-06, "loss": 0.056, "step": 1260 }, { "epoch": 0.6056676272814602, "grad_norm": 0.5656089550860497, "learning_rate": 9.687428692518789e-06, "loss": 0.0539, "step": 1261 }, { "epoch": 0.6061479346781941, "grad_norm": 0.4162011114169869, "learning_rate": 9.686455403374062e-06, "loss": 0.0354, "step": 1262 }, { "epoch": 0.6066282420749279, "grad_norm": 0.6906145746227967, "learning_rate": 9.685480650310319e-06, "loss": 0.0526, "step": 1263 }, { "epoch": 0.6071085494716618, "grad_norm": 1.028325898856233, "learning_rate": 9.684504433632049e-06, "loss": 0.0588, "step": 1264 }, { "epoch": 0.6075888568683958, "grad_norm": 0.5167645947728937, "learning_rate": 9.68352675364419e-06, "loss": 0.0354, "step": 1265 }, { "epoch": 0.6080691642651297, "grad_norm": 0.6196435075019047, "learning_rate": 9.682547610652145e-06, "loss": 0.05, "step": 1266 }, { "epoch": 0.6085494716618636, "grad_norm": 0.47414855740871636, "learning_rate": 9.681567004961769e-06, "loss": 0.0495, "step": 1267 }, { "epoch": 0.6090297790585975, "grad_norm": 0.4592587792291843, "learning_rate": 9.68058493687938e-06, "loss": 0.0386, "step": 1268 }, { "epoch": 0.6095100864553314, "grad_norm": 0.46426133154926164, "learning_rate": 9.679601406711746e-06, "loss": 0.0631, "step": 1269 }, { "epoch": 0.6099903938520653, "grad_norm": 0.4772199148883563, "learning_rate": 9.678616414766096e-06, "loss": 0.056, "step": 1270 }, { "epoch": 0.6104707012487992, "grad_norm": 0.7108506602194041, "learning_rate": 9.677629961350113e-06, "loss": 0.0658, "step": 1271 }, { "epoch": 0.6109510086455331, "grad_norm": 0.6231876402263944, "learning_rate": 9.676642046771938e-06, "loss": 0.0433, "step": 1272 }, { "epoch": 0.6114313160422671, "grad_norm": 0.47557579581945064, "learning_rate": 9.675652671340169e-06, "loss": 0.04, "step": 1273 }, { "epoch": 0.611911623439001, "grad_norm": 0.7107171113991735, "learning_rate": 9.67466183536386e-06, "loss": 0.0628, "step": 1274 }, { "epoch": 0.6123919308357348, "grad_norm": 0.9476315212332672, "learning_rate": 9.673669539152518e-06, "loss": 0.0443, "step": 1275 }, { "epoch": 0.6128722382324687, "grad_norm": 0.7168398810301742, "learning_rate": 9.67267578301611e-06, "loss": 0.0744, "step": 1276 }, { "epoch": 0.6133525456292027, "grad_norm": 0.5729632454572932, "learning_rate": 9.67168056726506e-06, "loss": 0.0514, "step": 1277 }, { "epoch": 0.6138328530259366, "grad_norm": 0.9643183924220915, "learning_rate": 9.670683892210245e-06, "loss": 0.0622, "step": 1278 }, { "epoch": 0.6143131604226705, "grad_norm": 0.6683536482508268, "learning_rate": 9.669685758162996e-06, "loss": 0.0764, "step": 1279 }, { "epoch": 0.6147934678194045, "grad_norm": 0.4503574435017507, "learning_rate": 9.668686165435106e-06, "loss": 0.0494, "step": 1280 }, { "epoch": 0.6152737752161384, "grad_norm": 0.4920339006116487, "learning_rate": 9.667685114338819e-06, "loss": 0.0478, "step": 1281 }, { "epoch": 0.6157540826128722, "grad_norm": 0.5067811483748804, "learning_rate": 9.666682605186834e-06, "loss": 0.0502, "step": 1282 }, { "epoch": 0.6162343900096061, "grad_norm": 0.4640157427837885, "learning_rate": 9.66567863829231e-06, "loss": 0.0317, "step": 1283 }, { "epoch": 0.6167146974063401, "grad_norm": 0.7574081208925442, "learning_rate": 9.664673213968856e-06, "loss": 0.0478, "step": 1284 }, { "epoch": 0.617195004803074, "grad_norm": 0.6090301906896739, "learning_rate": 9.663666332530541e-06, "loss": 0.0532, "step": 1285 }, { "epoch": 0.6176753121998079, "grad_norm": 0.5041461690169365, "learning_rate": 9.662657994291884e-06, "loss": 0.059, "step": 1286 }, { "epoch": 0.6181556195965417, "grad_norm": 0.3750500141302966, "learning_rate": 9.661648199567866e-06, "loss": 0.0367, "step": 1287 }, { "epoch": 0.6186359269932757, "grad_norm": 0.5062593535367501, "learning_rate": 9.660636948673913e-06, "loss": 0.0353, "step": 1288 }, { "epoch": 0.6191162343900096, "grad_norm": 0.639235731952125, "learning_rate": 9.659624241925917e-06, "loss": 0.0596, "step": 1289 }, { "epoch": 0.6195965417867435, "grad_norm": 0.5049024822490057, "learning_rate": 9.65861007964022e-06, "loss": 0.0447, "step": 1290 }, { "epoch": 0.6200768491834774, "grad_norm": 0.4296613053659539, "learning_rate": 9.657594462133614e-06, "loss": 0.044, "step": 1291 }, { "epoch": 0.6205571565802114, "grad_norm": 0.5138878968910088, "learning_rate": 9.656577389723353e-06, "loss": 0.0493, "step": 1292 }, { "epoch": 0.6210374639769453, "grad_norm": 0.46036782082872985, "learning_rate": 9.655558862727141e-06, "loss": 0.037, "step": 1293 }, { "epoch": 0.6215177713736791, "grad_norm": 0.4704621930317916, "learning_rate": 9.654538881463139e-06, "loss": 0.048, "step": 1294 }, { "epoch": 0.621998078770413, "grad_norm": 0.48737100335382305, "learning_rate": 9.653517446249955e-06, "loss": 0.0396, "step": 1295 }, { "epoch": 0.622478386167147, "grad_norm": 0.49092045898074815, "learning_rate": 9.652494557406666e-06, "loss": 0.0456, "step": 1296 }, { "epoch": 0.6229586935638809, "grad_norm": 0.5835467675400211, "learning_rate": 9.65147021525279e-06, "loss": 0.0378, "step": 1297 }, { "epoch": 0.6234390009606148, "grad_norm": 0.42343488555568504, "learning_rate": 9.650444420108303e-06, "loss": 0.0378, "step": 1298 }, { "epoch": 0.6239193083573487, "grad_norm": 0.49168464423068825, "learning_rate": 9.649417172293636e-06, "loss": 0.0522, "step": 1299 }, { "epoch": 0.6243996157540826, "grad_norm": 0.5154918146605992, "learning_rate": 9.648388472129671e-06, "loss": 0.044, "step": 1300 }, { "epoch": 0.6248799231508165, "grad_norm": 0.3503174379219431, "learning_rate": 9.647358319937746e-06, "loss": 0.0347, "step": 1301 }, { "epoch": 0.6253602305475504, "grad_norm": 0.4137072426795727, "learning_rate": 9.646326716039653e-06, "loss": 0.0296, "step": 1302 }, { "epoch": 0.6258405379442843, "grad_norm": 0.5333253731829095, "learning_rate": 9.645293660757637e-06, "loss": 0.0504, "step": 1303 }, { "epoch": 0.6263208453410183, "grad_norm": 0.45225710559548166, "learning_rate": 9.644259154414396e-06, "loss": 0.0433, "step": 1304 }, { "epoch": 0.6268011527377522, "grad_norm": 0.5129776997920263, "learning_rate": 9.643223197333078e-06, "loss": 0.0555, "step": 1305 }, { "epoch": 0.627281460134486, "grad_norm": 0.6635014880318021, "learning_rate": 9.64218578983729e-06, "loss": 0.0663, "step": 1306 }, { "epoch": 0.6277617675312199, "grad_norm": 0.40151280112857785, "learning_rate": 9.641146932251088e-06, "loss": 0.0449, "step": 1307 }, { "epoch": 0.6282420749279539, "grad_norm": 0.4750922210262119, "learning_rate": 9.640106624898985e-06, "loss": 0.0458, "step": 1308 }, { "epoch": 0.6287223823246878, "grad_norm": 0.534233777229496, "learning_rate": 9.639064868105943e-06, "loss": 0.0522, "step": 1309 }, { "epoch": 0.6292026897214217, "grad_norm": 0.48289449635982123, "learning_rate": 9.638021662197376e-06, "loss": 0.0373, "step": 1310 }, { "epoch": 0.6296829971181557, "grad_norm": 0.5462852993159404, "learning_rate": 9.636977007499153e-06, "loss": 0.0742, "step": 1311 }, { "epoch": 0.6301633045148896, "grad_norm": 0.5378449863960542, "learning_rate": 9.6359309043376e-06, "loss": 0.0493, "step": 1312 }, { "epoch": 0.6306436119116234, "grad_norm": 0.7788750925559146, "learning_rate": 9.634883353039484e-06, "loss": 0.0855, "step": 1313 }, { "epoch": 0.6311239193083573, "grad_norm": 0.4117391703483272, "learning_rate": 9.633834353932035e-06, "loss": 0.0424, "step": 1314 }, { "epoch": 0.6316042267050913, "grad_norm": 0.7547438553021464, "learning_rate": 9.63278390734293e-06, "loss": 0.0534, "step": 1315 }, { "epoch": 0.6320845341018252, "grad_norm": 0.6007646020799848, "learning_rate": 9.631732013600302e-06, "loss": 0.0403, "step": 1316 }, { "epoch": 0.6325648414985591, "grad_norm": 0.589813047694365, "learning_rate": 9.63067867303273e-06, "loss": 0.0536, "step": 1317 }, { "epoch": 0.633045148895293, "grad_norm": 0.5824782124675916, "learning_rate": 9.62962388596925e-06, "loss": 0.0588, "step": 1318 }, { "epoch": 0.633525456292027, "grad_norm": 0.5558162647237488, "learning_rate": 9.628567652739348e-06, "loss": 0.0459, "step": 1319 }, { "epoch": 0.6340057636887608, "grad_norm": 0.43466194972289696, "learning_rate": 9.627509973672962e-06, "loss": 0.0504, "step": 1320 }, { "epoch": 0.6344860710854947, "grad_norm": 0.5311930203126808, "learning_rate": 9.626450849100483e-06, "loss": 0.0493, "step": 1321 }, { "epoch": 0.6349663784822286, "grad_norm": 0.4875635622342151, "learning_rate": 9.62539027935275e-06, "loss": 0.0502, "step": 1322 }, { "epoch": 0.6354466858789626, "grad_norm": 0.7400810886912117, "learning_rate": 9.624328264761056e-06, "loss": 0.0834, "step": 1323 }, { "epoch": 0.6359269932756965, "grad_norm": 0.48011574918186833, "learning_rate": 9.623264805657146e-06, "loss": 0.0461, "step": 1324 }, { "epoch": 0.6364073006724303, "grad_norm": 0.36881464066476766, "learning_rate": 9.622199902373218e-06, "loss": 0.0344, "step": 1325 }, { "epoch": 0.6368876080691642, "grad_norm": 0.4130880239031028, "learning_rate": 9.621133555241912e-06, "loss": 0.0326, "step": 1326 }, { "epoch": 0.6373679154658982, "grad_norm": 0.4479060787169812, "learning_rate": 9.620065764596328e-06, "loss": 0.0393, "step": 1327 }, { "epoch": 0.6378482228626321, "grad_norm": 0.5942961946492873, "learning_rate": 9.618996530770018e-06, "loss": 0.0432, "step": 1328 }, { "epoch": 0.638328530259366, "grad_norm": 0.5638567374914406, "learning_rate": 9.617925854096975e-06, "loss": 0.0501, "step": 1329 }, { "epoch": 0.6388088376560999, "grad_norm": 0.42880488417099755, "learning_rate": 9.616853734911653e-06, "loss": 0.0374, "step": 1330 }, { "epoch": 0.6392891450528339, "grad_norm": 0.5463743835118613, "learning_rate": 9.615780173548952e-06, "loss": 0.0516, "step": 1331 }, { "epoch": 0.6397694524495677, "grad_norm": 0.691161000342225, "learning_rate": 9.614705170344221e-06, "loss": 0.0848, "step": 1332 }, { "epoch": 0.6402497598463016, "grad_norm": 0.6156410304491652, "learning_rate": 9.613628725633262e-06, "loss": 0.056, "step": 1333 }, { "epoch": 0.6407300672430355, "grad_norm": 0.45322396316185093, "learning_rate": 9.612550839752326e-06, "loss": 0.044, "step": 1334 }, { "epoch": 0.6412103746397695, "grad_norm": 0.6507298412844991, "learning_rate": 9.611471513038115e-06, "loss": 0.0687, "step": 1335 }, { "epoch": 0.6416906820365034, "grad_norm": 0.6325291411421659, "learning_rate": 9.610390745827783e-06, "loss": 0.0608, "step": 1336 }, { "epoch": 0.6421709894332372, "grad_norm": 0.60388801983135, "learning_rate": 9.60930853845893e-06, "loss": 0.0402, "step": 1337 }, { "epoch": 0.6426512968299711, "grad_norm": 0.4995528775460853, "learning_rate": 9.608224891269607e-06, "loss": 0.0381, "step": 1338 }, { "epoch": 0.6431316042267051, "grad_norm": 0.3700705272643591, "learning_rate": 9.607139804598316e-06, "loss": 0.0349, "step": 1339 }, { "epoch": 0.643611911623439, "grad_norm": 0.4362560597879333, "learning_rate": 9.606053278784009e-06, "loss": 0.0403, "step": 1340 }, { "epoch": 0.6440922190201729, "grad_norm": 0.54879971648486, "learning_rate": 9.604965314166085e-06, "loss": 0.0541, "step": 1341 }, { "epoch": 0.6445725264169068, "grad_norm": 0.4929293793701462, "learning_rate": 9.603875911084394e-06, "loss": 0.0365, "step": 1342 }, { "epoch": 0.6450528338136408, "grad_norm": 0.5628523668339881, "learning_rate": 9.602785069879239e-06, "loss": 0.0598, "step": 1343 }, { "epoch": 0.6455331412103746, "grad_norm": 0.467411924570006, "learning_rate": 9.601692790891363e-06, "loss": 0.0521, "step": 1344 }, { "epoch": 0.6460134486071085, "grad_norm": 0.5210675000514947, "learning_rate": 9.600599074461967e-06, "loss": 0.0637, "step": 1345 }, { "epoch": 0.6464937560038425, "grad_norm": 0.6153647716384278, "learning_rate": 9.599503920932698e-06, "loss": 0.0479, "step": 1346 }, { "epoch": 0.6469740634005764, "grad_norm": 0.5616593537777448, "learning_rate": 9.59840733064565e-06, "loss": 0.0364, "step": 1347 }, { "epoch": 0.6474543707973103, "grad_norm": 0.3344274743801641, "learning_rate": 9.59730930394337e-06, "loss": 0.0314, "step": 1348 }, { "epoch": 0.6479346781940442, "grad_norm": 0.6754093193995799, "learning_rate": 9.59620984116885e-06, "loss": 0.0438, "step": 1349 }, { "epoch": 0.6484149855907781, "grad_norm": 0.8533549419575339, "learning_rate": 9.595108942665528e-06, "loss": 0.0513, "step": 1350 }, { "epoch": 0.648895292987512, "grad_norm": 0.3620854442522442, "learning_rate": 9.5940066087773e-06, "loss": 0.0423, "step": 1351 }, { "epoch": 0.6493756003842459, "grad_norm": 0.5893844408390839, "learning_rate": 9.592902839848502e-06, "loss": 0.0377, "step": 1352 }, { "epoch": 0.6498559077809798, "grad_norm": 0.4092201290840598, "learning_rate": 9.591797636223921e-06, "loss": 0.0411, "step": 1353 }, { "epoch": 0.6503362151777138, "grad_norm": 0.47322297966868715, "learning_rate": 9.590690998248791e-06, "loss": 0.043, "step": 1354 }, { "epoch": 0.6508165225744477, "grad_norm": 0.4636748975157291, "learning_rate": 9.589582926268798e-06, "loss": 0.0429, "step": 1355 }, { "epoch": 0.6512968299711815, "grad_norm": 0.4349692458727795, "learning_rate": 9.588473420630071e-06, "loss": 0.0444, "step": 1356 }, { "epoch": 0.6517771373679154, "grad_norm": 0.5677476836765938, "learning_rate": 9.587362481679187e-06, "loss": 0.0562, "step": 1357 }, { "epoch": 0.6522574447646494, "grad_norm": 0.4371170679582839, "learning_rate": 9.586250109763176e-06, "loss": 0.0371, "step": 1358 }, { "epoch": 0.6527377521613833, "grad_norm": 0.6019100000609227, "learning_rate": 9.585136305229513e-06, "loss": 0.0488, "step": 1359 }, { "epoch": 0.6532180595581172, "grad_norm": 0.38363888520290607, "learning_rate": 9.584021068426114e-06, "loss": 0.0338, "step": 1360 }, { "epoch": 0.6536983669548511, "grad_norm": 0.4993078534835986, "learning_rate": 9.582904399701353e-06, "loss": 0.0431, "step": 1361 }, { "epoch": 0.654178674351585, "grad_norm": 0.5671412832634853, "learning_rate": 9.581786299404046e-06, "loss": 0.0561, "step": 1362 }, { "epoch": 0.6546589817483189, "grad_norm": 0.7306005927500124, "learning_rate": 9.580666767883456e-06, "loss": 0.042, "step": 1363 }, { "epoch": 0.6551392891450528, "grad_norm": 0.5032575865518208, "learning_rate": 9.579545805489292e-06, "loss": 0.0518, "step": 1364 }, { "epoch": 0.6556195965417867, "grad_norm": 0.6343768547939731, "learning_rate": 9.578423412571713e-06, "loss": 0.055, "step": 1365 }, { "epoch": 0.6560999039385207, "grad_norm": 0.6313252829889849, "learning_rate": 9.577299589481325e-06, "loss": 0.0499, "step": 1366 }, { "epoch": 0.6565802113352546, "grad_norm": 0.5874538450840524, "learning_rate": 9.576174336569177e-06, "loss": 0.0546, "step": 1367 }, { "epoch": 0.6570605187319885, "grad_norm": 0.4208422595696595, "learning_rate": 9.575047654186768e-06, "loss": 0.0427, "step": 1368 }, { "epoch": 0.6575408261287223, "grad_norm": 0.6079831399082672, "learning_rate": 9.57391954268604e-06, "loss": 0.051, "step": 1369 }, { "epoch": 0.6580211335254563, "grad_norm": 0.5767257558258905, "learning_rate": 9.57279000241939e-06, "loss": 0.0402, "step": 1370 }, { "epoch": 0.6585014409221902, "grad_norm": 0.4733503070820277, "learning_rate": 9.571659033739648e-06, "loss": 0.048, "step": 1371 }, { "epoch": 0.6589817483189241, "grad_norm": 0.4524785331372919, "learning_rate": 9.570526637000102e-06, "loss": 0.04, "step": 1372 }, { "epoch": 0.659462055715658, "grad_norm": 0.6668276001778005, "learning_rate": 9.56939281255448e-06, "loss": 0.0498, "step": 1373 }, { "epoch": 0.659942363112392, "grad_norm": 0.49259152852847804, "learning_rate": 9.568257560756955e-06, "loss": 0.0386, "step": 1374 }, { "epoch": 0.6604226705091258, "grad_norm": 0.8583845386777227, "learning_rate": 9.567120881962152e-06, "loss": 0.0498, "step": 1375 }, { "epoch": 0.6609029779058597, "grad_norm": 0.533492536983965, "learning_rate": 9.565982776525136e-06, "loss": 0.0472, "step": 1376 }, { "epoch": 0.6613832853025937, "grad_norm": 0.6027948215201636, "learning_rate": 9.56484324480142e-06, "loss": 0.047, "step": 1377 }, { "epoch": 0.6618635926993276, "grad_norm": 0.3382733587601467, "learning_rate": 9.563702287146963e-06, "loss": 0.0423, "step": 1378 }, { "epoch": 0.6623439000960615, "grad_norm": 0.40078317285361365, "learning_rate": 9.562559903918167e-06, "loss": 0.0437, "step": 1379 }, { "epoch": 0.6628242074927954, "grad_norm": 0.4152293792382375, "learning_rate": 9.561416095471882e-06, "loss": 0.0487, "step": 1380 }, { "epoch": 0.6633045148895294, "grad_norm": 0.37525899009119723, "learning_rate": 9.560270862165401e-06, "loss": 0.0445, "step": 1381 }, { "epoch": 0.6637848222862632, "grad_norm": 0.4239007996552649, "learning_rate": 9.559124204356465e-06, "loss": 0.0365, "step": 1382 }, { "epoch": 0.6642651296829971, "grad_norm": 0.44856667071995515, "learning_rate": 9.557976122403259e-06, "loss": 0.0432, "step": 1383 }, { "epoch": 0.664745437079731, "grad_norm": 0.40105877291250286, "learning_rate": 9.556826616664408e-06, "loss": 0.0375, "step": 1384 }, { "epoch": 0.665225744476465, "grad_norm": 0.6413226916161751, "learning_rate": 9.555675687498988e-06, "loss": 0.0491, "step": 1385 }, { "epoch": 0.6657060518731989, "grad_norm": 0.4994127702541754, "learning_rate": 9.554523335266519e-06, "loss": 0.0362, "step": 1386 }, { "epoch": 0.6661863592699327, "grad_norm": 0.43217572367178236, "learning_rate": 9.553369560326961e-06, "loss": 0.041, "step": 1387 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5576149940821006, "learning_rate": 9.552214363040725e-06, "loss": 0.0609, "step": 1388 }, { "epoch": 0.6671469740634006, "grad_norm": 0.4612462479554566, "learning_rate": 9.551057743768658e-06, "loss": 0.0326, "step": 1389 }, { "epoch": 0.6676272814601345, "grad_norm": 0.7707155411339915, "learning_rate": 9.54989970287206e-06, "loss": 0.0515, "step": 1390 }, { "epoch": 0.6681075888568684, "grad_norm": 0.6183305732020524, "learning_rate": 9.54874024071267e-06, "loss": 0.0573, "step": 1391 }, { "epoch": 0.6685878962536023, "grad_norm": 0.7355014675302118, "learning_rate": 9.54757935765267e-06, "loss": 0.0509, "step": 1392 }, { "epoch": 0.6690682036503363, "grad_norm": 0.41942626169957864, "learning_rate": 9.54641705405469e-06, "loss": 0.0346, "step": 1393 }, { "epoch": 0.6695485110470701, "grad_norm": 0.46998738583164973, "learning_rate": 9.5452533302818e-06, "loss": 0.0441, "step": 1394 }, { "epoch": 0.670028818443804, "grad_norm": 0.8255715566434525, "learning_rate": 9.544088186697515e-06, "loss": 0.0762, "step": 1395 }, { "epoch": 0.6705091258405379, "grad_norm": 0.742976741485616, "learning_rate": 9.542921623665796e-06, "loss": 0.056, "step": 1396 }, { "epoch": 0.6709894332372719, "grad_norm": 0.7975592021603152, "learning_rate": 9.541753641551042e-06, "loss": 0.0618, "step": 1397 }, { "epoch": 0.6714697406340058, "grad_norm": 0.7133686245791877, "learning_rate": 9.540584240718098e-06, "loss": 0.0359, "step": 1398 }, { "epoch": 0.6719500480307397, "grad_norm": 0.6417332791114704, "learning_rate": 9.539413421532256e-06, "loss": 0.067, "step": 1399 }, { "epoch": 0.6724303554274735, "grad_norm": 0.6512453356740282, "learning_rate": 9.538241184359245e-06, "loss": 0.057, "step": 1400 }, { "epoch": 0.6729106628242075, "grad_norm": 0.6011163829953513, "learning_rate": 9.537067529565241e-06, "loss": 0.0433, "step": 1401 }, { "epoch": 0.6733909702209414, "grad_norm": 0.5605867258554449, "learning_rate": 9.535892457516858e-06, "loss": 0.07, "step": 1402 }, { "epoch": 0.6738712776176753, "grad_norm": 0.44843028713428207, "learning_rate": 9.53471596858116e-06, "loss": 0.0346, "step": 1403 }, { "epoch": 0.6743515850144092, "grad_norm": 0.45306372964448993, "learning_rate": 9.533538063125649e-06, "loss": 0.0395, "step": 1404 }, { "epoch": 0.6748318924111432, "grad_norm": 0.665610180113429, "learning_rate": 9.532358741518265e-06, "loss": 0.0727, "step": 1405 }, { "epoch": 0.675312199807877, "grad_norm": 0.40211030378348633, "learning_rate": 9.531178004127404e-06, "loss": 0.0415, "step": 1406 }, { "epoch": 0.6757925072046109, "grad_norm": 0.4867082233781477, "learning_rate": 9.52999585132189e-06, "loss": 0.0413, "step": 1407 }, { "epoch": 0.6762728146013448, "grad_norm": 0.7741469232795118, "learning_rate": 9.528812283470994e-06, "loss": 0.0717, "step": 1408 }, { "epoch": 0.6767531219980788, "grad_norm": 0.42760299424580006, "learning_rate": 9.527627300944434e-06, "loss": 0.0378, "step": 1409 }, { "epoch": 0.6772334293948127, "grad_norm": 0.6198122845932187, "learning_rate": 9.52644090411236e-06, "loss": 0.0644, "step": 1410 }, { "epoch": 0.6777137367915466, "grad_norm": 0.7010269392103107, "learning_rate": 9.525253093345377e-06, "loss": 0.0742, "step": 1411 }, { "epoch": 0.6781940441882806, "grad_norm": 0.46098176739925134, "learning_rate": 9.524063869014517e-06, "loss": 0.0539, "step": 1412 }, { "epoch": 0.6786743515850144, "grad_norm": 0.45626576421316317, "learning_rate": 9.522873231491268e-06, "loss": 0.0459, "step": 1413 }, { "epoch": 0.6791546589817483, "grad_norm": 0.5235232885723231, "learning_rate": 9.521681181147544e-06, "loss": 0.0354, "step": 1414 }, { "epoch": 0.6796349663784822, "grad_norm": 0.4083701335457131, "learning_rate": 9.520487718355716e-06, "loss": 0.0376, "step": 1415 }, { "epoch": 0.6801152737752162, "grad_norm": 0.4519589464495528, "learning_rate": 9.519292843488584e-06, "loss": 0.0435, "step": 1416 }, { "epoch": 0.6805955811719501, "grad_norm": 0.5270239622507114, "learning_rate": 9.518096556919396e-06, "loss": 0.0385, "step": 1417 }, { "epoch": 0.681075888568684, "grad_norm": 0.49655054870938087, "learning_rate": 9.51689885902184e-06, "loss": 0.0418, "step": 1418 }, { "epoch": 0.6815561959654178, "grad_norm": 0.48390295607618405, "learning_rate": 9.51569975017004e-06, "loss": 0.0473, "step": 1419 }, { "epoch": 0.6820365033621518, "grad_norm": 0.4134572167278166, "learning_rate": 9.514499230738567e-06, "loss": 0.0356, "step": 1420 }, { "epoch": 0.6825168107588857, "grad_norm": 0.42968236699716256, "learning_rate": 9.51329730110243e-06, "loss": 0.0396, "step": 1421 }, { "epoch": 0.6829971181556196, "grad_norm": 0.507732504462261, "learning_rate": 9.512093961637077e-06, "loss": 0.044, "step": 1422 }, { "epoch": 0.6834774255523535, "grad_norm": 0.513408290011866, "learning_rate": 9.510889212718398e-06, "loss": 0.0539, "step": 1423 }, { "epoch": 0.6839577329490875, "grad_norm": 0.5815081646240737, "learning_rate": 9.509683054722726e-06, "loss": 0.0398, "step": 1424 }, { "epoch": 0.6844380403458213, "grad_norm": 0.4078417115979057, "learning_rate": 9.508475488026829e-06, "loss": 0.0332, "step": 1425 }, { "epoch": 0.6849183477425552, "grad_norm": 1.0586820403034098, "learning_rate": 9.507266513007918e-06, "loss": 0.0763, "step": 1426 }, { "epoch": 0.6853986551392891, "grad_norm": 0.564383510484416, "learning_rate": 9.506056130043644e-06, "loss": 0.0434, "step": 1427 }, { "epoch": 0.6858789625360231, "grad_norm": 0.3768335964210522, "learning_rate": 9.504844339512096e-06, "loss": 0.0326, "step": 1428 }, { "epoch": 0.686359269932757, "grad_norm": 0.5437219091200941, "learning_rate": 9.503631141791804e-06, "loss": 0.0551, "step": 1429 }, { "epoch": 0.6868395773294909, "grad_norm": 0.39882576370499795, "learning_rate": 9.502416537261739e-06, "loss": 0.0394, "step": 1430 }, { "epoch": 0.6873198847262247, "grad_norm": 0.5298522285045902, "learning_rate": 9.50120052630131e-06, "loss": 0.0424, "step": 1431 }, { "epoch": 0.6878001921229587, "grad_norm": 0.7335489985769658, "learning_rate": 9.499983109290361e-06, "loss": 0.0695, "step": 1432 }, { "epoch": 0.6882804995196926, "grad_norm": 0.7767384589812387, "learning_rate": 9.498764286609183e-06, "loss": 0.0684, "step": 1433 }, { "epoch": 0.6887608069164265, "grad_norm": 0.4485179955778425, "learning_rate": 9.4975440586385e-06, "loss": 0.0545, "step": 1434 }, { "epoch": 0.6892411143131604, "grad_norm": 0.7644156832335937, "learning_rate": 9.496322425759484e-06, "loss": 0.056, "step": 1435 }, { "epoch": 0.6897214217098944, "grad_norm": 0.6682437861904863, "learning_rate": 9.495099388353731e-06, "loss": 0.0413, "step": 1436 }, { "epoch": 0.6902017291066282, "grad_norm": 0.2782387488882061, "learning_rate": 9.493874946803287e-06, "loss": 0.0307, "step": 1437 }, { "epoch": 0.6906820365033621, "grad_norm": 0.7196277741729624, "learning_rate": 9.492649101490636e-06, "loss": 0.0605, "step": 1438 }, { "epoch": 0.691162343900096, "grad_norm": 0.43324754881869854, "learning_rate": 9.491421852798695e-06, "loss": 0.0475, "step": 1439 }, { "epoch": 0.69164265129683, "grad_norm": 0.43738740093767214, "learning_rate": 9.490193201110825e-06, "loss": 0.0375, "step": 1440 }, { "epoch": 0.6921229586935639, "grad_norm": 0.5488539442142931, "learning_rate": 9.48896314681082e-06, "loss": 0.0728, "step": 1441 }, { "epoch": 0.6926032660902978, "grad_norm": 1.1540239102800764, "learning_rate": 9.48773169028292e-06, "loss": 0.0586, "step": 1442 }, { "epoch": 0.6930835734870316, "grad_norm": 0.6283641043852788, "learning_rate": 9.486498831911792e-06, "loss": 0.0736, "step": 1443 }, { "epoch": 0.6935638808837656, "grad_norm": 1.0108486453877055, "learning_rate": 9.485264572082551e-06, "loss": 0.0918, "step": 1444 }, { "epoch": 0.6940441882804995, "grad_norm": 0.40880665155636625, "learning_rate": 9.484028911180742e-06, "loss": 0.0389, "step": 1445 }, { "epoch": 0.6945244956772334, "grad_norm": 0.47431051486539744, "learning_rate": 9.482791849592354e-06, "loss": 0.0597, "step": 1446 }, { "epoch": 0.6950048030739674, "grad_norm": 0.8329629013113664, "learning_rate": 9.48155338770381e-06, "loss": 0.0727, "step": 1447 }, { "epoch": 0.6954851104707013, "grad_norm": 0.5434685409654171, "learning_rate": 9.480313525901973e-06, "loss": 0.0505, "step": 1448 }, { "epoch": 0.6959654178674352, "grad_norm": 0.6327014782848, "learning_rate": 9.479072264574138e-06, "loss": 0.0524, "step": 1449 }, { "epoch": 0.696445725264169, "grad_norm": 0.4768500817588462, "learning_rate": 9.477829604108044e-06, "loss": 0.0479, "step": 1450 }, { "epoch": 0.696926032660903, "grad_norm": 0.33637687220332657, "learning_rate": 9.476585544891862e-06, "loss": 0.0335, "step": 1451 }, { "epoch": 0.6974063400576369, "grad_norm": 0.6410159305875692, "learning_rate": 9.475340087314203e-06, "loss": 0.0457, "step": 1452 }, { "epoch": 0.6978866474543708, "grad_norm": 0.7561772611383961, "learning_rate": 9.47409323176411e-06, "loss": 0.0625, "step": 1453 }, { "epoch": 0.6983669548511047, "grad_norm": 0.3920818511938983, "learning_rate": 9.472844978631071e-06, "loss": 0.0282, "step": 1454 }, { "epoch": 0.6988472622478387, "grad_norm": 0.7746401992023322, "learning_rate": 9.471595328305002e-06, "loss": 0.0577, "step": 1455 }, { "epoch": 0.6993275696445725, "grad_norm": 1.1432462417277984, "learning_rate": 9.470344281176261e-06, "loss": 0.0503, "step": 1456 }, { "epoch": 0.6998078770413064, "grad_norm": 0.5579207608396545, "learning_rate": 9.469091837635641e-06, "loss": 0.0436, "step": 1457 }, { "epoch": 0.7002881844380403, "grad_norm": 0.4293775018518884, "learning_rate": 9.467837998074369e-06, "loss": 0.0426, "step": 1458 }, { "epoch": 0.7007684918347743, "grad_norm": 0.5654128796038869, "learning_rate": 9.466582762884111e-06, "loss": 0.0515, "step": 1459 }, { "epoch": 0.7012487992315082, "grad_norm": 0.6275264278723893, "learning_rate": 9.465326132456966e-06, "loss": 0.0314, "step": 1460 }, { "epoch": 0.7017291066282421, "grad_norm": 0.7608816446615059, "learning_rate": 9.464068107185476e-06, "loss": 0.0793, "step": 1461 }, { "epoch": 0.7022094140249759, "grad_norm": 0.588693970208091, "learning_rate": 9.462808687462606e-06, "loss": 0.0451, "step": 1462 }, { "epoch": 0.7026897214217099, "grad_norm": 0.6771566307850208, "learning_rate": 9.461547873681767e-06, "loss": 0.0515, "step": 1463 }, { "epoch": 0.7031700288184438, "grad_norm": 0.526932323467287, "learning_rate": 9.460285666236804e-06, "loss": 0.057, "step": 1464 }, { "epoch": 0.7036503362151777, "grad_norm": 0.6547665214657098, "learning_rate": 9.459022065521994e-06, "loss": 0.0469, "step": 1465 }, { "epoch": 0.7041306436119116, "grad_norm": 1.1306126178130151, "learning_rate": 9.45775707193205e-06, "loss": 0.0437, "step": 1466 }, { "epoch": 0.7046109510086456, "grad_norm": 0.6656984791797591, "learning_rate": 9.456490685862123e-06, "loss": 0.0657, "step": 1467 }, { "epoch": 0.7050912584053795, "grad_norm": 0.5262955218143904, "learning_rate": 9.455222907707795e-06, "loss": 0.0405, "step": 1468 }, { "epoch": 0.7055715658021133, "grad_norm": 0.4113212330571846, "learning_rate": 9.453953737865087e-06, "loss": 0.0331, "step": 1469 }, { "epoch": 0.7060518731988472, "grad_norm": 0.4687499190476505, "learning_rate": 9.452683176730452e-06, "loss": 0.0424, "step": 1470 }, { "epoch": 0.7065321805955812, "grad_norm": 0.5572677332289074, "learning_rate": 9.451411224700776e-06, "loss": 0.0612, "step": 1471 }, { "epoch": 0.7070124879923151, "grad_norm": 0.39073970748339487, "learning_rate": 9.450137882173385e-06, "loss": 0.0279, "step": 1472 }, { "epoch": 0.707492795389049, "grad_norm": 0.8716017403152045, "learning_rate": 9.448863149546032e-06, "loss": 0.0612, "step": 1473 }, { "epoch": 0.7079731027857828, "grad_norm": 0.8087490747385843, "learning_rate": 9.447587027216912e-06, "loss": 0.0754, "step": 1474 }, { "epoch": 0.7084534101825168, "grad_norm": 0.40033610599261016, "learning_rate": 9.446309515584648e-06, "loss": 0.0516, "step": 1475 }, { "epoch": 0.7089337175792507, "grad_norm": 0.3828791531186985, "learning_rate": 9.445030615048301e-06, "loss": 0.0308, "step": 1476 }, { "epoch": 0.7094140249759846, "grad_norm": 0.558762243566099, "learning_rate": 9.443750326007361e-06, "loss": 0.046, "step": 1477 }, { "epoch": 0.7098943323727186, "grad_norm": 0.46666384505267333, "learning_rate": 9.44246864886176e-06, "loss": 0.0463, "step": 1478 }, { "epoch": 0.7103746397694525, "grad_norm": 0.4943805321936899, "learning_rate": 9.441185584011854e-06, "loss": 0.0412, "step": 1479 }, { "epoch": 0.7108549471661864, "grad_norm": 0.4595133347133602, "learning_rate": 9.439901131858437e-06, "loss": 0.0457, "step": 1480 }, { "epoch": 0.7113352545629202, "grad_norm": 0.43379618763244315, "learning_rate": 9.43861529280274e-06, "loss": 0.0357, "step": 1481 }, { "epoch": 0.7118155619596542, "grad_norm": 0.58990377026948, "learning_rate": 9.43732806724642e-06, "loss": 0.0533, "step": 1482 }, { "epoch": 0.7122958693563881, "grad_norm": 0.31473504698760496, "learning_rate": 9.436039455591574e-06, "loss": 0.0309, "step": 1483 }, { "epoch": 0.712776176753122, "grad_norm": 0.45683837690648205, "learning_rate": 9.434749458240723e-06, "loss": 0.047, "step": 1484 }, { "epoch": 0.7132564841498559, "grad_norm": 0.3777015877291396, "learning_rate": 9.433458075596834e-06, "loss": 0.0434, "step": 1485 }, { "epoch": 0.7137367915465899, "grad_norm": 0.5110934466400615, "learning_rate": 9.432165308063293e-06, "loss": 0.0603, "step": 1486 }, { "epoch": 0.7142170989433237, "grad_norm": 0.4887572837652129, "learning_rate": 9.430871156043929e-06, "loss": 0.0508, "step": 1487 }, { "epoch": 0.7146974063400576, "grad_norm": 0.5164375394535271, "learning_rate": 9.429575619942996e-06, "loss": 0.0357, "step": 1488 }, { "epoch": 0.7151777137367915, "grad_norm": 0.4860839779256668, "learning_rate": 9.428278700165185e-06, "loss": 0.0496, "step": 1489 }, { "epoch": 0.7156580211335255, "grad_norm": 0.44939726645063366, "learning_rate": 9.426980397115619e-06, "loss": 0.0392, "step": 1490 }, { "epoch": 0.7161383285302594, "grad_norm": 0.4491948208101559, "learning_rate": 9.42568071119985e-06, "loss": 0.0418, "step": 1491 }, { "epoch": 0.7166186359269933, "grad_norm": 0.4413729317811157, "learning_rate": 9.424379642823864e-06, "loss": 0.0418, "step": 1492 }, { "epoch": 0.7170989433237271, "grad_norm": 0.6050908044633452, "learning_rate": 9.423077192394081e-06, "loss": 0.054, "step": 1493 }, { "epoch": 0.7175792507204611, "grad_norm": 0.34829153690830816, "learning_rate": 9.421773360317348e-06, "loss": 0.0455, "step": 1494 }, { "epoch": 0.718059558117195, "grad_norm": 0.6479436968237636, "learning_rate": 9.420468147000947e-06, "loss": 0.0591, "step": 1495 }, { "epoch": 0.7185398655139289, "grad_norm": 0.5935139475806216, "learning_rate": 9.419161552852592e-06, "loss": 0.0592, "step": 1496 }, { "epoch": 0.7190201729106628, "grad_norm": 0.35365850589490483, "learning_rate": 9.417853578280425e-06, "loss": 0.035, "step": 1497 }, { "epoch": 0.7195004803073968, "grad_norm": 0.5335048893582442, "learning_rate": 9.41654422369302e-06, "loss": 0.0343, "step": 1498 }, { "epoch": 0.7199807877041307, "grad_norm": 0.47006252573172974, "learning_rate": 9.415233489499388e-06, "loss": 0.0466, "step": 1499 }, { "epoch": 0.7204610951008645, "grad_norm": 0.4138903520224807, "learning_rate": 9.413921376108958e-06, "loss": 0.0337, "step": 1500 }, { "epoch": 0.7209414024975984, "grad_norm": 0.5302837596305445, "learning_rate": 9.412607883931608e-06, "loss": 0.0355, "step": 1501 }, { "epoch": 0.7214217098943324, "grad_norm": 0.46188113895356303, "learning_rate": 9.411293013377628e-06, "loss": 0.0464, "step": 1502 }, { "epoch": 0.7219020172910663, "grad_norm": 0.5048308160692421, "learning_rate": 9.409976764857752e-06, "loss": 0.0659, "step": 1503 }, { "epoch": 0.7223823246878002, "grad_norm": 0.42557944797976405, "learning_rate": 9.408659138783138e-06, "loss": 0.0476, "step": 1504 }, { "epoch": 0.722862632084534, "grad_norm": 0.43337734311918524, "learning_rate": 9.407340135565375e-06, "loss": 0.0473, "step": 1505 }, { "epoch": 0.723342939481268, "grad_norm": 0.43466454870999904, "learning_rate": 9.406019755616484e-06, "loss": 0.0456, "step": 1506 }, { "epoch": 0.7238232468780019, "grad_norm": 0.44882261533196544, "learning_rate": 9.404697999348917e-06, "loss": 0.0545, "step": 1507 }, { "epoch": 0.7243035542747358, "grad_norm": 0.4627444515845815, "learning_rate": 9.403374867175553e-06, "loss": 0.039, "step": 1508 }, { "epoch": 0.7247838616714697, "grad_norm": 0.43987793983099877, "learning_rate": 9.4020503595097e-06, "loss": 0.0315, "step": 1509 }, { "epoch": 0.7252641690682037, "grad_norm": 0.4866538331222698, "learning_rate": 9.400724476765099e-06, "loss": 0.0451, "step": 1510 }, { "epoch": 0.7257444764649376, "grad_norm": 0.35933139183080276, "learning_rate": 9.39939721935592e-06, "loss": 0.0341, "step": 1511 }, { "epoch": 0.7262247838616714, "grad_norm": 0.36877632658996023, "learning_rate": 9.398068587696758e-06, "loss": 0.0382, "step": 1512 }, { "epoch": 0.7267050912584054, "grad_norm": 0.9738122717404578, "learning_rate": 9.396738582202645e-06, "loss": 0.0742, "step": 1513 }, { "epoch": 0.7271853986551393, "grad_norm": 0.3887278676876787, "learning_rate": 9.395407203289036e-06, "loss": 0.0376, "step": 1514 }, { "epoch": 0.7276657060518732, "grad_norm": 0.6994372693236012, "learning_rate": 9.394074451371817e-06, "loss": 0.0394, "step": 1515 }, { "epoch": 0.7281460134486071, "grad_norm": 0.45247145506590497, "learning_rate": 9.392740326867304e-06, "loss": 0.0364, "step": 1516 }, { "epoch": 0.7286263208453411, "grad_norm": 0.5967559115342186, "learning_rate": 9.391404830192239e-06, "loss": 0.0333, "step": 1517 }, { "epoch": 0.729106628242075, "grad_norm": 0.6079539868350357, "learning_rate": 9.390067961763795e-06, "loss": 0.0472, "step": 1518 }, { "epoch": 0.7295869356388088, "grad_norm": 0.4845564720247174, "learning_rate": 9.388729721999573e-06, "loss": 0.0399, "step": 1519 }, { "epoch": 0.7300672430355427, "grad_norm": 0.41693125786890695, "learning_rate": 9.387390111317599e-06, "loss": 0.0413, "step": 1520 }, { "epoch": 0.7305475504322767, "grad_norm": 0.550874709305461, "learning_rate": 9.386049130136335e-06, "loss": 0.0527, "step": 1521 }, { "epoch": 0.7310278578290106, "grad_norm": 0.7548778248070213, "learning_rate": 9.384706778874664e-06, "loss": 0.0521, "step": 1522 }, { "epoch": 0.7315081652257445, "grad_norm": 0.5908031992669268, "learning_rate": 9.3833630579519e-06, "loss": 0.0407, "step": 1523 }, { "epoch": 0.7319884726224783, "grad_norm": 0.6721817113672193, "learning_rate": 9.382017967787783e-06, "loss": 0.0456, "step": 1524 }, { "epoch": 0.7324687800192123, "grad_norm": 0.412185975988658, "learning_rate": 9.380671508802483e-06, "loss": 0.0392, "step": 1525 }, { "epoch": 0.7329490874159462, "grad_norm": 0.46229352306795535, "learning_rate": 9.379323681416596e-06, "loss": 0.0357, "step": 1526 }, { "epoch": 0.7334293948126801, "grad_norm": 0.5289989908411948, "learning_rate": 9.377974486051149e-06, "loss": 0.0548, "step": 1527 }, { "epoch": 0.733909702209414, "grad_norm": 0.5146883009854686, "learning_rate": 9.376623923127588e-06, "loss": 0.0505, "step": 1528 }, { "epoch": 0.734390009606148, "grad_norm": 0.5889961825894928, "learning_rate": 9.375271993067794e-06, "loss": 0.0442, "step": 1529 }, { "epoch": 0.7348703170028819, "grad_norm": 0.492353320981019, "learning_rate": 9.373918696294074e-06, "loss": 0.045, "step": 1530 }, { "epoch": 0.7353506243996157, "grad_norm": 0.39252458109011074, "learning_rate": 9.372564033229159e-06, "loss": 0.0357, "step": 1531 }, { "epoch": 0.7358309317963496, "grad_norm": 0.6428195670677652, "learning_rate": 9.371208004296208e-06, "loss": 0.0644, "step": 1532 }, { "epoch": 0.7363112391930836, "grad_norm": 0.6817577459803879, "learning_rate": 9.36985060991881e-06, "loss": 0.0804, "step": 1533 }, { "epoch": 0.7367915465898175, "grad_norm": 0.7456675871856415, "learning_rate": 9.368491850520972e-06, "loss": 0.0476, "step": 1534 }, { "epoch": 0.7372718539865514, "grad_norm": 0.45828290446709025, "learning_rate": 9.367131726527137e-06, "loss": 0.0387, "step": 1535 }, { "epoch": 0.7377521613832853, "grad_norm": 0.5469514522999743, "learning_rate": 9.365770238362168e-06, "loss": 0.0401, "step": 1536 }, { "epoch": 0.7382324687800192, "grad_norm": 0.6196800460825531, "learning_rate": 9.364407386451358e-06, "loss": 0.0382, "step": 1537 }, { "epoch": 0.7387127761767531, "grad_norm": 0.8185923332401619, "learning_rate": 9.363043171220423e-06, "loss": 0.0439, "step": 1538 }, { "epoch": 0.739193083573487, "grad_norm": 0.5914692890542824, "learning_rate": 9.361677593095506e-06, "loss": 0.0616, "step": 1539 }, { "epoch": 0.7396733909702209, "grad_norm": 0.42464742265876826, "learning_rate": 9.360310652503176e-06, "loss": 0.0454, "step": 1540 }, { "epoch": 0.7401536983669549, "grad_norm": 0.3717412618511177, "learning_rate": 9.358942349870428e-06, "loss": 0.0335, "step": 1541 }, { "epoch": 0.7406340057636888, "grad_norm": 0.47502141185790275, "learning_rate": 9.35757268562468e-06, "loss": 0.0488, "step": 1542 }, { "epoch": 0.7411143131604226, "grad_norm": 0.3846823017408531, "learning_rate": 9.356201660193782e-06, "loss": 0.0345, "step": 1543 }, { "epoch": 0.7415946205571565, "grad_norm": 0.5650933228577771, "learning_rate": 9.354829274005999e-06, "loss": 0.0383, "step": 1544 }, { "epoch": 0.7420749279538905, "grad_norm": 0.5690246647765329, "learning_rate": 9.35345552749003e-06, "loss": 0.0569, "step": 1545 }, { "epoch": 0.7425552353506244, "grad_norm": 0.38731794835615024, "learning_rate": 9.352080421074993e-06, "loss": 0.0419, "step": 1546 }, { "epoch": 0.7430355427473583, "grad_norm": 0.40974758691143354, "learning_rate": 9.350703955190435e-06, "loss": 0.0376, "step": 1547 }, { "epoch": 0.7435158501440923, "grad_norm": 0.46283701258349547, "learning_rate": 9.349326130266323e-06, "loss": 0.0328, "step": 1548 }, { "epoch": 0.7439961575408262, "grad_norm": 0.40124018251646276, "learning_rate": 9.347946946733055e-06, "loss": 0.0387, "step": 1549 }, { "epoch": 0.74447646493756, "grad_norm": 0.4121164368713994, "learning_rate": 9.346566405021448e-06, "loss": 0.0387, "step": 1550 }, { "epoch": 0.7449567723342939, "grad_norm": 0.48335845270026145, "learning_rate": 9.345184505562747e-06, "loss": 0.0368, "step": 1551 }, { "epoch": 0.7454370797310279, "grad_norm": 0.45269768346922323, "learning_rate": 9.343801248788616e-06, "loss": 0.052, "step": 1552 }, { "epoch": 0.7459173871277618, "grad_norm": 0.4610035020071121, "learning_rate": 9.342416635131148e-06, "loss": 0.0614, "step": 1553 }, { "epoch": 0.7463976945244957, "grad_norm": 0.4443227466310018, "learning_rate": 9.341030665022856e-06, "loss": 0.0447, "step": 1554 }, { "epoch": 0.7468780019212296, "grad_norm": 0.7642793611261786, "learning_rate": 9.339643338896682e-06, "loss": 0.0635, "step": 1555 }, { "epoch": 0.7473583093179635, "grad_norm": 0.4923735731312003, "learning_rate": 9.338254657185988e-06, "loss": 0.0507, "step": 1556 }, { "epoch": 0.7478386167146974, "grad_norm": 0.630683703114263, "learning_rate": 9.336864620324555e-06, "loss": 0.0621, "step": 1557 }, { "epoch": 0.7483189241114313, "grad_norm": 0.5090396267143328, "learning_rate": 9.335473228746599e-06, "loss": 0.0406, "step": 1558 }, { "epoch": 0.7487992315081652, "grad_norm": 0.367559997986288, "learning_rate": 9.334080482886746e-06, "loss": 0.0384, "step": 1559 }, { "epoch": 0.7492795389048992, "grad_norm": 0.4446659468934462, "learning_rate": 9.332686383180055e-06, "loss": 0.0361, "step": 1560 }, { "epoch": 0.7497598463016331, "grad_norm": 0.38856445414113083, "learning_rate": 9.331290930062003e-06, "loss": 0.0259, "step": 1561 }, { "epoch": 0.7502401536983669, "grad_norm": 0.5997908771571422, "learning_rate": 9.32989412396849e-06, "loss": 0.0471, "step": 1562 }, { "epoch": 0.7507204610951008, "grad_norm": 0.3393607659537801, "learning_rate": 9.328495965335844e-06, "loss": 0.0394, "step": 1563 }, { "epoch": 0.7512007684918348, "grad_norm": 0.37942787002008155, "learning_rate": 9.327096454600804e-06, "loss": 0.0273, "step": 1564 }, { "epoch": 0.7516810758885687, "grad_norm": 0.5089526163454388, "learning_rate": 9.325695592200545e-06, "loss": 0.0632, "step": 1565 }, { "epoch": 0.7521613832853026, "grad_norm": 0.5121396909775255, "learning_rate": 9.324293378572654e-06, "loss": 0.0551, "step": 1566 }, { "epoch": 0.7526416906820365, "grad_norm": 0.38135878200286283, "learning_rate": 9.322889814155143e-06, "loss": 0.0345, "step": 1567 }, { "epoch": 0.7531219980787704, "grad_norm": 0.6071199641753309, "learning_rate": 9.32148489938645e-06, "loss": 0.0424, "step": 1568 }, { "epoch": 0.7536023054755043, "grad_norm": 0.6157559078057468, "learning_rate": 9.32007863470543e-06, "loss": 0.045, "step": 1569 }, { "epoch": 0.7540826128722382, "grad_norm": 0.48273199109262704, "learning_rate": 9.31867102055136e-06, "loss": 0.0406, "step": 1570 }, { "epoch": 0.7545629202689721, "grad_norm": 0.4593296987544262, "learning_rate": 9.31726205736394e-06, "loss": 0.0436, "step": 1571 }, { "epoch": 0.7550432276657061, "grad_norm": 0.4500045712808899, "learning_rate": 9.315851745583294e-06, "loss": 0.0269, "step": 1572 }, { "epoch": 0.75552353506244, "grad_norm": 0.4454538603150306, "learning_rate": 9.31444008564996e-06, "loss": 0.0366, "step": 1573 }, { "epoch": 0.7560038424591738, "grad_norm": 0.8394007155144771, "learning_rate": 9.313027078004903e-06, "loss": 0.0503, "step": 1574 }, { "epoch": 0.7564841498559077, "grad_norm": 0.5712174753843666, "learning_rate": 9.311612723089511e-06, "loss": 0.0547, "step": 1575 }, { "epoch": 0.7569644572526417, "grad_norm": 0.46645965469348327, "learning_rate": 9.310197021345586e-06, "loss": 0.0374, "step": 1576 }, { "epoch": 0.7574447646493756, "grad_norm": 0.4740884325639232, "learning_rate": 9.308779973215355e-06, "loss": 0.0548, "step": 1577 }, { "epoch": 0.7579250720461095, "grad_norm": 0.5888223818640053, "learning_rate": 9.307361579141461e-06, "loss": 0.0581, "step": 1578 }, { "epoch": 0.7584053794428435, "grad_norm": 0.5714740123256156, "learning_rate": 9.305941839566978e-06, "loss": 0.0479, "step": 1579 }, { "epoch": 0.7588856868395774, "grad_norm": 0.6224308689788665, "learning_rate": 9.304520754935387e-06, "loss": 0.0418, "step": 1580 }, { "epoch": 0.7593659942363112, "grad_norm": 0.5273832414104769, "learning_rate": 9.303098325690601e-06, "loss": 0.0534, "step": 1581 }, { "epoch": 0.7598463016330451, "grad_norm": 0.4100845821662855, "learning_rate": 9.301674552276942e-06, "loss": 0.0393, "step": 1582 }, { "epoch": 0.7603266090297791, "grad_norm": 0.5233760618757851, "learning_rate": 9.300249435139162e-06, "loss": 0.0567, "step": 1583 }, { "epoch": 0.760806916426513, "grad_norm": 0.4470067504003617, "learning_rate": 9.298822974722425e-06, "loss": 0.039, "step": 1584 }, { "epoch": 0.7612872238232469, "grad_norm": 0.6788279010789742, "learning_rate": 9.297395171472321e-06, "loss": 0.045, "step": 1585 }, { "epoch": 0.7617675312199808, "grad_norm": 0.5433301733942123, "learning_rate": 9.295966025834853e-06, "loss": 0.0564, "step": 1586 }, { "epoch": 0.7622478386167147, "grad_norm": 0.49450333266253477, "learning_rate": 9.294535538256447e-06, "loss": 0.0531, "step": 1587 }, { "epoch": 0.7627281460134486, "grad_norm": 0.5471927312457198, "learning_rate": 9.29310370918395e-06, "loss": 0.0669, "step": 1588 }, { "epoch": 0.7632084534101825, "grad_norm": 0.4715635212373278, "learning_rate": 9.291670539064623e-06, "loss": 0.0405, "step": 1589 }, { "epoch": 0.7636887608069164, "grad_norm": 0.4111372690921772, "learning_rate": 9.290236028346152e-06, "loss": 0.0379, "step": 1590 }, { "epoch": 0.7641690682036504, "grad_norm": 0.7088243704037053, "learning_rate": 9.288800177476636e-06, "loss": 0.049, "step": 1591 }, { "epoch": 0.7646493756003843, "grad_norm": 0.38910744032185673, "learning_rate": 9.287362986904595e-06, "loss": 0.0377, "step": 1592 }, { "epoch": 0.7651296829971181, "grad_norm": 0.47900015103136745, "learning_rate": 9.28592445707897e-06, "loss": 0.0406, "step": 1593 }, { "epoch": 0.765609990393852, "grad_norm": 1.0851740572976578, "learning_rate": 9.284484588449115e-06, "loss": 0.0444, "step": 1594 }, { "epoch": 0.766090297790586, "grad_norm": 0.5147230458014541, "learning_rate": 9.283043381464806e-06, "loss": 0.0387, "step": 1595 }, { "epoch": 0.7665706051873199, "grad_norm": 0.46123024101584026, "learning_rate": 9.281600836576237e-06, "loss": 0.0451, "step": 1596 }, { "epoch": 0.7670509125840538, "grad_norm": 0.4441738388860228, "learning_rate": 9.280156954234017e-06, "loss": 0.0388, "step": 1597 }, { "epoch": 0.7675312199807877, "grad_norm": 0.5563354966449449, "learning_rate": 9.278711734889178e-06, "loss": 0.0407, "step": 1598 }, { "epoch": 0.7680115273775217, "grad_norm": 0.6141961922794104, "learning_rate": 9.277265178993164e-06, "loss": 0.0398, "step": 1599 }, { "epoch": 0.7684918347742555, "grad_norm": 0.5225559365038454, "learning_rate": 9.275817286997843e-06, "loss": 0.0371, "step": 1600 }, { "epoch": 0.7689721421709894, "grad_norm": 0.5371833677807129, "learning_rate": 9.274368059355491e-06, "loss": 0.0369, "step": 1601 }, { "epoch": 0.7694524495677233, "grad_norm": 0.4509358847362159, "learning_rate": 9.27291749651881e-06, "loss": 0.0348, "step": 1602 }, { "epoch": 0.7699327569644573, "grad_norm": 0.3790301116583118, "learning_rate": 9.271465598940914e-06, "loss": 0.0341, "step": 1603 }, { "epoch": 0.7704130643611912, "grad_norm": 0.5646089029750233, "learning_rate": 9.270012367075337e-06, "loss": 0.0458, "step": 1604 }, { "epoch": 0.770893371757925, "grad_norm": 0.6621291150481707, "learning_rate": 9.268557801376027e-06, "loss": 0.0673, "step": 1605 }, { "epoch": 0.7713736791546589, "grad_norm": 0.4824676654847145, "learning_rate": 9.267101902297354e-06, "loss": 0.0605, "step": 1606 }, { "epoch": 0.7718539865513929, "grad_norm": 0.47867737571924573, "learning_rate": 9.265644670294094e-06, "loss": 0.0402, "step": 1607 }, { "epoch": 0.7723342939481268, "grad_norm": 0.7521404239285508, "learning_rate": 9.26418610582145e-06, "loss": 0.0437, "step": 1608 }, { "epoch": 0.7728146013448607, "grad_norm": 0.4949872678032156, "learning_rate": 9.262726209335038e-06, "loss": 0.0471, "step": 1609 }, { "epoch": 0.7732949087415946, "grad_norm": 0.8146436171510608, "learning_rate": 9.261264981290887e-06, "loss": 0.0499, "step": 1610 }, { "epoch": 0.7737752161383286, "grad_norm": 0.5261485997463431, "learning_rate": 9.259802422145445e-06, "loss": 0.0499, "step": 1611 }, { "epoch": 0.7742555235350624, "grad_norm": 0.37722938922014426, "learning_rate": 9.258338532355575e-06, "loss": 0.0406, "step": 1612 }, { "epoch": 0.7747358309317963, "grad_norm": 0.5137949267423024, "learning_rate": 9.256873312378559e-06, "loss": 0.0371, "step": 1613 }, { "epoch": 0.7752161383285303, "grad_norm": 0.46384787647828507, "learning_rate": 9.255406762672085e-06, "loss": 0.0436, "step": 1614 }, { "epoch": 0.7756964457252642, "grad_norm": 0.42501919449374165, "learning_rate": 9.253938883694266e-06, "loss": 0.0319, "step": 1615 }, { "epoch": 0.7761767531219981, "grad_norm": 0.48825545396901937, "learning_rate": 9.252469675903627e-06, "loss": 0.0438, "step": 1616 }, { "epoch": 0.776657060518732, "grad_norm": 0.45045012320002853, "learning_rate": 9.250999139759107e-06, "loss": 0.0507, "step": 1617 }, { "epoch": 0.777137367915466, "grad_norm": 0.47986993527995575, "learning_rate": 9.249527275720062e-06, "loss": 0.0438, "step": 1618 }, { "epoch": 0.7776176753121998, "grad_norm": 0.6897438756676713, "learning_rate": 9.248054084246259e-06, "loss": 0.0711, "step": 1619 }, { "epoch": 0.7780979827089337, "grad_norm": 0.33151049726562337, "learning_rate": 9.246579565797886e-06, "loss": 0.0357, "step": 1620 }, { "epoch": 0.7785782901056676, "grad_norm": 0.43010301611246965, "learning_rate": 9.245103720835538e-06, "loss": 0.0451, "step": 1621 }, { "epoch": 0.7790585975024016, "grad_norm": 0.40143671990795116, "learning_rate": 9.24362654982023e-06, "loss": 0.0389, "step": 1622 }, { "epoch": 0.7795389048991355, "grad_norm": 0.4379641653718776, "learning_rate": 9.24214805321339e-06, "loss": 0.0422, "step": 1623 }, { "epoch": 0.7800192122958693, "grad_norm": 0.7421035915776883, "learning_rate": 9.24066823147686e-06, "loss": 0.0477, "step": 1624 }, { "epoch": 0.7804995196926032, "grad_norm": 0.6044451650392826, "learning_rate": 9.239187085072891e-06, "loss": 0.0482, "step": 1625 }, { "epoch": 0.7809798270893372, "grad_norm": 0.5596451678457982, "learning_rate": 9.237704614464157e-06, "loss": 0.0418, "step": 1626 }, { "epoch": 0.7814601344860711, "grad_norm": 0.6592034808792037, "learning_rate": 9.236220820113738e-06, "loss": 0.0546, "step": 1627 }, { "epoch": 0.781940441882805, "grad_norm": 0.5465739249129324, "learning_rate": 9.234735702485132e-06, "loss": 0.0442, "step": 1628 }, { "epoch": 0.7824207492795389, "grad_norm": 0.4911854758209489, "learning_rate": 9.233249262042247e-06, "loss": 0.0398, "step": 1629 }, { "epoch": 0.7829010566762729, "grad_norm": 0.5748292545758572, "learning_rate": 9.231761499249407e-06, "loss": 0.0508, "step": 1630 }, { "epoch": 0.7833813640730067, "grad_norm": 0.6346041474004851, "learning_rate": 9.230272414571349e-06, "loss": 0.0439, "step": 1631 }, { "epoch": 0.7838616714697406, "grad_norm": 0.5634165757189631, "learning_rate": 9.22878200847322e-06, "loss": 0.0539, "step": 1632 }, { "epoch": 0.7843419788664745, "grad_norm": 0.5027607179661531, "learning_rate": 9.227290281420583e-06, "loss": 0.045, "step": 1633 }, { "epoch": 0.7848222862632085, "grad_norm": 0.3037912293882595, "learning_rate": 9.22579723387941e-06, "loss": 0.0369, "step": 1634 }, { "epoch": 0.7853025936599424, "grad_norm": 0.8410699245876824, "learning_rate": 9.22430286631609e-06, "loss": 0.0356, "step": 1635 }, { "epoch": 0.7857829010566763, "grad_norm": 0.5368787438929216, "learning_rate": 9.222807179197421e-06, "loss": 0.043, "step": 1636 }, { "epoch": 0.7862632084534101, "grad_norm": 0.4032303101187688, "learning_rate": 9.221310172990616e-06, "loss": 0.0368, "step": 1637 }, { "epoch": 0.7867435158501441, "grad_norm": 0.5409648779344621, "learning_rate": 9.219811848163295e-06, "loss": 0.0629, "step": 1638 }, { "epoch": 0.787223823246878, "grad_norm": 0.6171811747268313, "learning_rate": 9.218312205183497e-06, "loss": 0.0531, "step": 1639 }, { "epoch": 0.7877041306436119, "grad_norm": 0.4216070952232712, "learning_rate": 9.216811244519667e-06, "loss": 0.0449, "step": 1640 }, { "epoch": 0.7881844380403458, "grad_norm": 0.5763421855553458, "learning_rate": 9.215308966640662e-06, "loss": 0.0368, "step": 1641 }, { "epoch": 0.7886647454370798, "grad_norm": 0.4870149354831771, "learning_rate": 9.213805372015756e-06, "loss": 0.0451, "step": 1642 }, { "epoch": 0.7891450528338136, "grad_norm": 0.393109877651519, "learning_rate": 9.212300461114626e-06, "loss": 0.0335, "step": 1643 }, { "epoch": 0.7896253602305475, "grad_norm": 0.5879082191240319, "learning_rate": 9.210794234407368e-06, "loss": 0.0451, "step": 1644 }, { "epoch": 0.7901056676272814, "grad_norm": 0.5295234190274382, "learning_rate": 9.209286692364484e-06, "loss": 0.061, "step": 1645 }, { "epoch": 0.7905859750240154, "grad_norm": 0.9676479560008302, "learning_rate": 9.207777835456887e-06, "loss": 0.0593, "step": 1646 }, { "epoch": 0.7910662824207493, "grad_norm": 0.41367077682405545, "learning_rate": 9.206267664155906e-06, "loss": 0.0286, "step": 1647 }, { "epoch": 0.7915465898174832, "grad_norm": 0.3879540531683163, "learning_rate": 9.204756178933274e-06, "loss": 0.0377, "step": 1648 }, { "epoch": 0.7920268972142172, "grad_norm": 0.4431616617976934, "learning_rate": 9.203243380261138e-06, "loss": 0.0558, "step": 1649 }, { "epoch": 0.792507204610951, "grad_norm": 0.3784767168841202, "learning_rate": 9.201729268612054e-06, "loss": 0.0539, "step": 1650 }, { "epoch": 0.7929875120076849, "grad_norm": 0.45445078560396285, "learning_rate": 9.20021384445899e-06, "loss": 0.0406, "step": 1651 }, { "epoch": 0.7934678194044188, "grad_norm": 0.40977728429479643, "learning_rate": 9.198697108275318e-06, "loss": 0.0337, "step": 1652 }, { "epoch": 0.7939481268011528, "grad_norm": 0.36463474145779456, "learning_rate": 9.19717906053483e-06, "loss": 0.0377, "step": 1653 }, { "epoch": 0.7944284341978867, "grad_norm": 0.36067865022176565, "learning_rate": 9.19565970171172e-06, "loss": 0.032, "step": 1654 }, { "epoch": 0.7949087415946205, "grad_norm": 0.5087941975761743, "learning_rate": 9.194139032280594e-06, "loss": 0.0363, "step": 1655 }, { "epoch": 0.7953890489913544, "grad_norm": 0.48198473655680424, "learning_rate": 9.192617052716463e-06, "loss": 0.0458, "step": 1656 }, { "epoch": 0.7958693563880884, "grad_norm": 0.5456323999363246, "learning_rate": 9.19109376349476e-06, "loss": 0.0546, "step": 1657 }, { "epoch": 0.7963496637848223, "grad_norm": 0.4698008133059572, "learning_rate": 9.18956916509131e-06, "loss": 0.041, "step": 1658 }, { "epoch": 0.7968299711815562, "grad_norm": 0.48858912139881877, "learning_rate": 9.18804325798236e-06, "loss": 0.0482, "step": 1659 }, { "epoch": 0.7973102785782901, "grad_norm": 0.45224694401197335, "learning_rate": 9.18651604264456e-06, "loss": 0.0432, "step": 1660 }, { "epoch": 0.7977905859750241, "grad_norm": 0.5291111116543427, "learning_rate": 9.184987519554969e-06, "loss": 0.0392, "step": 1661 }, { "epoch": 0.7982708933717579, "grad_norm": 0.39953149277001193, "learning_rate": 9.183457689191055e-06, "loss": 0.0392, "step": 1662 }, { "epoch": 0.7987512007684918, "grad_norm": 0.49701247579632074, "learning_rate": 9.181926552030698e-06, "loss": 0.0374, "step": 1663 }, { "epoch": 0.7992315081652257, "grad_norm": 0.38365739480522254, "learning_rate": 9.18039410855218e-06, "loss": 0.043, "step": 1664 }, { "epoch": 0.7997118155619597, "grad_norm": 0.37429467352808293, "learning_rate": 9.178860359234193e-06, "loss": 0.0407, "step": 1665 }, { "epoch": 0.8001921229586936, "grad_norm": 0.4091686599510648, "learning_rate": 9.17732530455584e-06, "loss": 0.038, "step": 1666 }, { "epoch": 0.8006724303554275, "grad_norm": 0.7195325093951213, "learning_rate": 9.175788944996629e-06, "loss": 0.0741, "step": 1667 }, { "epoch": 0.8011527377521613, "grad_norm": 0.3820712024818586, "learning_rate": 9.174251281036478e-06, "loss": 0.0516, "step": 1668 }, { "epoch": 0.8016330451488953, "grad_norm": 0.6407457249834841, "learning_rate": 9.172712313155708e-06, "loss": 0.0434, "step": 1669 }, { "epoch": 0.8021133525456292, "grad_norm": 0.4969915345518172, "learning_rate": 9.171172041835048e-06, "loss": 0.0385, "step": 1670 }, { "epoch": 0.8025936599423631, "grad_norm": 0.8727857658628531, "learning_rate": 9.169630467555643e-06, "loss": 0.0548, "step": 1671 }, { "epoch": 0.803073967339097, "grad_norm": 0.5478163807092746, "learning_rate": 9.168087590799034e-06, "loss": 0.0417, "step": 1672 }, { "epoch": 0.803554274735831, "grad_norm": 0.5827507091175965, "learning_rate": 9.16654341204717e-06, "loss": 0.0568, "step": 1673 }, { "epoch": 0.8040345821325648, "grad_norm": 0.5177032542659947, "learning_rate": 9.164997931782415e-06, "loss": 0.0492, "step": 1674 }, { "epoch": 0.8045148895292987, "grad_norm": 0.4309298356028421, "learning_rate": 9.163451150487531e-06, "loss": 0.0454, "step": 1675 }, { "epoch": 0.8049951969260326, "grad_norm": 0.4057663676388221, "learning_rate": 9.161903068645692e-06, "loss": 0.0422, "step": 1676 }, { "epoch": 0.8054755043227666, "grad_norm": 0.47646062877439854, "learning_rate": 9.160353686740476e-06, "loss": 0.0351, "step": 1677 }, { "epoch": 0.8059558117195005, "grad_norm": 0.45558821865778054, "learning_rate": 9.158803005255862e-06, "loss": 0.0416, "step": 1678 }, { "epoch": 0.8064361191162344, "grad_norm": 0.536210255513318, "learning_rate": 9.157251024676249e-06, "loss": 0.0499, "step": 1679 }, { "epoch": 0.8069164265129684, "grad_norm": 0.6111498216460901, "learning_rate": 9.155697745486424e-06, "loss": 0.0342, "step": 1680 }, { "epoch": 0.8073967339097022, "grad_norm": 0.4833245316704844, "learning_rate": 9.154143168171594e-06, "loss": 0.0447, "step": 1681 }, { "epoch": 0.8078770413064361, "grad_norm": 0.7125662823152267, "learning_rate": 9.152587293217362e-06, "loss": 0.0475, "step": 1682 }, { "epoch": 0.80835734870317, "grad_norm": 0.42403591379119365, "learning_rate": 9.151030121109745e-06, "loss": 0.0391, "step": 1683 }, { "epoch": 0.808837656099904, "grad_norm": 0.5205159350136663, "learning_rate": 9.149471652335155e-06, "loss": 0.0512, "step": 1684 }, { "epoch": 0.8093179634966379, "grad_norm": 0.4957469692515393, "learning_rate": 9.14791188738042e-06, "loss": 0.0406, "step": 1685 }, { "epoch": 0.8097982708933718, "grad_norm": 0.7665309194587624, "learning_rate": 9.146350826732762e-06, "loss": 0.0549, "step": 1686 }, { "epoch": 0.8102785782901056, "grad_norm": 0.43844120238867723, "learning_rate": 9.144788470879818e-06, "loss": 0.0429, "step": 1687 }, { "epoch": 0.8107588856868396, "grad_norm": 0.5211050949999975, "learning_rate": 9.143224820309622e-06, "loss": 0.0479, "step": 1688 }, { "epoch": 0.8112391930835735, "grad_norm": 0.45269831596656984, "learning_rate": 9.141659875510615e-06, "loss": 0.0527, "step": 1689 }, { "epoch": 0.8117195004803074, "grad_norm": 0.4724243903083948, "learning_rate": 9.140093636971646e-06, "loss": 0.0353, "step": 1690 }, { "epoch": 0.8121998078770413, "grad_norm": 0.6109510239550704, "learning_rate": 9.13852610518196e-06, "loss": 0.0466, "step": 1691 }, { "epoch": 0.8126801152737753, "grad_norm": 0.41144558793142044, "learning_rate": 9.136957280631212e-06, "loss": 0.0363, "step": 1692 }, { "epoch": 0.8131604226705091, "grad_norm": 0.4886106331868101, "learning_rate": 9.135387163809462e-06, "loss": 0.0442, "step": 1693 }, { "epoch": 0.813640730067243, "grad_norm": 0.486941943939095, "learning_rate": 9.133815755207168e-06, "loss": 0.0533, "step": 1694 }, { "epoch": 0.8141210374639769, "grad_norm": 0.34872099800574413, "learning_rate": 9.132243055315193e-06, "loss": 0.0314, "step": 1695 }, { "epoch": 0.8146013448607109, "grad_norm": 0.44324684859322067, "learning_rate": 9.130669064624811e-06, "loss": 0.0532, "step": 1696 }, { "epoch": 0.8150816522574448, "grad_norm": 0.44545944868455956, "learning_rate": 9.129093783627687e-06, "loss": 0.037, "step": 1697 }, { "epoch": 0.8155619596541787, "grad_norm": 0.45798246599733317, "learning_rate": 9.1275172128159e-06, "loss": 0.0481, "step": 1698 }, { "epoch": 0.8160422670509125, "grad_norm": 0.44349109742245846, "learning_rate": 9.125939352681922e-06, "loss": 0.0517, "step": 1699 }, { "epoch": 0.8165225744476465, "grad_norm": 0.6001166233671226, "learning_rate": 9.124360203718638e-06, "loss": 0.0461, "step": 1700 }, { "epoch": 0.8170028818443804, "grad_norm": 0.6831035265294242, "learning_rate": 9.122779766419329e-06, "loss": 0.0591, "step": 1701 }, { "epoch": 0.8174831892411143, "grad_norm": 0.38077445809771976, "learning_rate": 9.121198041277677e-06, "loss": 0.0329, "step": 1702 }, { "epoch": 0.8179634966378482, "grad_norm": 0.41140076728447145, "learning_rate": 9.119615028787771e-06, "loss": 0.0485, "step": 1703 }, { "epoch": 0.8184438040345822, "grad_norm": 0.5189348552493814, "learning_rate": 9.118030729444103e-06, "loss": 0.0448, "step": 1704 }, { "epoch": 0.818924111431316, "grad_norm": 0.465577166008072, "learning_rate": 9.11644514374156e-06, "loss": 0.0514, "step": 1705 }, { "epoch": 0.8194044188280499, "grad_norm": 0.5621851173772174, "learning_rate": 9.114858272175438e-06, "loss": 0.0392, "step": 1706 }, { "epoch": 0.8198847262247838, "grad_norm": 0.6859692616790941, "learning_rate": 9.113270115241429e-06, "loss": 0.0705, "step": 1707 }, { "epoch": 0.8203650336215178, "grad_norm": 0.47502833358038377, "learning_rate": 9.111680673435632e-06, "loss": 0.0441, "step": 1708 }, { "epoch": 0.8208453410182517, "grad_norm": 0.46706802163027805, "learning_rate": 9.110089947254544e-06, "loss": 0.0391, "step": 1709 }, { "epoch": 0.8213256484149856, "grad_norm": 0.31719462369803303, "learning_rate": 9.108497937195064e-06, "loss": 0.0289, "step": 1710 }, { "epoch": 0.8218059558117194, "grad_norm": 0.40385158643378627, "learning_rate": 9.106904643754491e-06, "loss": 0.0381, "step": 1711 }, { "epoch": 0.8222862632084534, "grad_norm": 0.5545734057584185, "learning_rate": 9.105310067430526e-06, "loss": 0.0471, "step": 1712 }, { "epoch": 0.8227665706051873, "grad_norm": 0.4461054981605916, "learning_rate": 9.10371420872127e-06, "loss": 0.0508, "step": 1713 }, { "epoch": 0.8232468780019212, "grad_norm": 0.47654544400356025, "learning_rate": 9.102117068125227e-06, "loss": 0.0475, "step": 1714 }, { "epoch": 0.8237271853986552, "grad_norm": 0.39911477246208293, "learning_rate": 9.100518646141299e-06, "loss": 0.0404, "step": 1715 }, { "epoch": 0.8242074927953891, "grad_norm": 0.7172841710639825, "learning_rate": 9.098918943268786e-06, "loss": 0.046, "step": 1716 }, { "epoch": 0.824687800192123, "grad_norm": 0.43088188833750957, "learning_rate": 9.097317960007395e-06, "loss": 0.0403, "step": 1717 }, { "epoch": 0.8251681075888568, "grad_norm": 0.5012218482080577, "learning_rate": 9.095715696857225e-06, "loss": 0.045, "step": 1718 }, { "epoch": 0.8256484149855908, "grad_norm": 0.46478416389541866, "learning_rate": 9.094112154318784e-06, "loss": 0.033, "step": 1719 }, { "epoch": 0.8261287223823247, "grad_norm": 0.4130084158371063, "learning_rate": 9.092507332892968e-06, "loss": 0.0348, "step": 1720 }, { "epoch": 0.8266090297790586, "grad_norm": 0.6684505123844774, "learning_rate": 9.090901233081082e-06, "loss": 0.0629, "step": 1721 }, { "epoch": 0.8270893371757925, "grad_norm": 0.4323599984449867, "learning_rate": 9.089293855384828e-06, "loss": 0.0437, "step": 1722 }, { "epoch": 0.8275696445725265, "grad_norm": 0.32774506981480916, "learning_rate": 9.087685200306306e-06, "loss": 0.0334, "step": 1723 }, { "epoch": 0.8280499519692603, "grad_norm": 0.5191345063331478, "learning_rate": 9.086075268348014e-06, "loss": 0.0433, "step": 1724 }, { "epoch": 0.8285302593659942, "grad_norm": 0.33509417066826364, "learning_rate": 9.084464060012849e-06, "loss": 0.0402, "step": 1725 }, { "epoch": 0.8290105667627281, "grad_norm": 0.3142718339340029, "learning_rate": 9.082851575804112e-06, "loss": 0.0331, "step": 1726 }, { "epoch": 0.8294908741594621, "grad_norm": 0.4748408271731178, "learning_rate": 9.081237816225497e-06, "loss": 0.0516, "step": 1727 }, { "epoch": 0.829971181556196, "grad_norm": 0.37444079243147355, "learning_rate": 9.079622781781094e-06, "loss": 0.037, "step": 1728 }, { "epoch": 0.8304514889529299, "grad_norm": 0.5592237440039568, "learning_rate": 9.0780064729754e-06, "loss": 0.0452, "step": 1729 }, { "epoch": 0.8309317963496637, "grad_norm": 0.5481405856733301, "learning_rate": 9.076388890313304e-06, "loss": 0.0446, "step": 1730 }, { "epoch": 0.8314121037463977, "grad_norm": 0.3932969307248321, "learning_rate": 9.07477003430009e-06, "loss": 0.0488, "step": 1731 }, { "epoch": 0.8318924111431316, "grad_norm": 0.3702785209776025, "learning_rate": 9.073149905441451e-06, "loss": 0.0334, "step": 1732 }, { "epoch": 0.8323727185398655, "grad_norm": 1.4773631914590435, "learning_rate": 9.071528504243465e-06, "loss": 0.0347, "step": 1733 }, { "epoch": 0.8328530259365994, "grad_norm": 0.4936274544739719, "learning_rate": 9.069905831212616e-06, "loss": 0.081, "step": 1734 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5882359938166744, "learning_rate": 9.068281886855778e-06, "loss": 0.0437, "step": 1735 }, { "epoch": 0.8338136407300673, "grad_norm": 0.46989224834772747, "learning_rate": 9.066656671680231e-06, "loss": 0.0439, "step": 1736 }, { "epoch": 0.8342939481268011, "grad_norm": 0.4534004273658601, "learning_rate": 9.065030186193643e-06, "loss": 0.033, "step": 1737 }, { "epoch": 0.834774255523535, "grad_norm": 1.2265014670233068, "learning_rate": 9.063402430904087e-06, "loss": 0.0525, "step": 1738 }, { "epoch": 0.835254562920269, "grad_norm": 0.5422498828003771, "learning_rate": 9.061773406320027e-06, "loss": 0.0521, "step": 1739 }, { "epoch": 0.8357348703170029, "grad_norm": 0.4195808712209068, "learning_rate": 9.060143112950326e-06, "loss": 0.0316, "step": 1740 }, { "epoch": 0.8362151777137368, "grad_norm": 0.4855308450574598, "learning_rate": 9.058511551304241e-06, "loss": 0.0503, "step": 1741 }, { "epoch": 0.8366954851104706, "grad_norm": 0.5075843680608557, "learning_rate": 9.056878721891427e-06, "loss": 0.0425, "step": 1742 }, { "epoch": 0.8371757925072046, "grad_norm": 0.5482382504823542, "learning_rate": 9.055244625221935e-06, "loss": 0.0428, "step": 1743 }, { "epoch": 0.8376560999039385, "grad_norm": 0.6695595892233471, "learning_rate": 9.053609261806214e-06, "loss": 0.0587, "step": 1744 }, { "epoch": 0.8381364073006724, "grad_norm": 0.4581918394935273, "learning_rate": 9.0519726321551e-06, "loss": 0.0361, "step": 1745 }, { "epoch": 0.8386167146974063, "grad_norm": 0.3952408558855935, "learning_rate": 9.050334736779837e-06, "loss": 0.0381, "step": 1746 }, { "epoch": 0.8390970220941403, "grad_norm": 0.4109873080252859, "learning_rate": 9.048695576192058e-06, "loss": 0.0442, "step": 1747 }, { "epoch": 0.8395773294908742, "grad_norm": 0.5616982246438386, "learning_rate": 9.047055150903787e-06, "loss": 0.0584, "step": 1748 }, { "epoch": 0.840057636887608, "grad_norm": 0.37826080721339767, "learning_rate": 9.045413461427453e-06, "loss": 0.0409, "step": 1749 }, { "epoch": 0.840537944284342, "grad_norm": 0.40088324296361066, "learning_rate": 9.043770508275868e-06, "loss": 0.0487, "step": 1750 }, { "epoch": 0.8410182516810759, "grad_norm": 0.44718093929332914, "learning_rate": 9.04212629196225e-06, "loss": 0.0352, "step": 1751 }, { "epoch": 0.8414985590778098, "grad_norm": 0.4706059052244275, "learning_rate": 9.040480813000205e-06, "loss": 0.0358, "step": 1752 }, { "epoch": 0.8419788664745437, "grad_norm": 0.3670084341616506, "learning_rate": 9.038834071903736e-06, "loss": 0.0286, "step": 1753 }, { "epoch": 0.8424591738712777, "grad_norm": 0.5139890566947024, "learning_rate": 9.037186069187239e-06, "loss": 0.0471, "step": 1754 }, { "epoch": 0.8429394812680115, "grad_norm": 0.6580821574923913, "learning_rate": 9.035536805365503e-06, "loss": 0.0404, "step": 1755 }, { "epoch": 0.8434197886647454, "grad_norm": 0.3488569658059193, "learning_rate": 9.033886280953714e-06, "loss": 0.0332, "step": 1756 }, { "epoch": 0.8439000960614793, "grad_norm": 0.459104475456851, "learning_rate": 9.032234496467448e-06, "loss": 0.0457, "step": 1757 }, { "epoch": 0.8443804034582133, "grad_norm": 0.5555684583259805, "learning_rate": 9.03058145242268e-06, "loss": 0.0471, "step": 1758 }, { "epoch": 0.8448607108549472, "grad_norm": 0.4615657333941676, "learning_rate": 9.028927149335773e-06, "loss": 0.0412, "step": 1759 }, { "epoch": 0.8453410182516811, "grad_norm": 0.45856078963996716, "learning_rate": 9.027271587723487e-06, "loss": 0.0374, "step": 1760 }, { "epoch": 0.845821325648415, "grad_norm": 0.6912129790142436, "learning_rate": 9.025614768102972e-06, "loss": 0.0693, "step": 1761 }, { "epoch": 0.8463016330451489, "grad_norm": 0.5122989554497435, "learning_rate": 9.023956690991775e-06, "loss": 0.0465, "step": 1762 }, { "epoch": 0.8467819404418828, "grad_norm": 0.36126845802883945, "learning_rate": 9.02229735690783e-06, "loss": 0.0296, "step": 1763 }, { "epoch": 0.8472622478386167, "grad_norm": 0.6087551399045577, "learning_rate": 9.020636766369471e-06, "loss": 0.0506, "step": 1764 }, { "epoch": 0.8477425552353506, "grad_norm": 0.38525060421145424, "learning_rate": 9.018974919895418e-06, "loss": 0.0355, "step": 1765 }, { "epoch": 0.8482228626320846, "grad_norm": 0.346018375745237, "learning_rate": 9.017311818004785e-06, "loss": 0.0341, "step": 1766 }, { "epoch": 0.8487031700288185, "grad_norm": 0.6260615640367205, "learning_rate": 9.01564746121708e-06, "loss": 0.0443, "step": 1767 }, { "epoch": 0.8491834774255523, "grad_norm": 0.5187278606147561, "learning_rate": 9.013981850052205e-06, "loss": 0.0433, "step": 1768 }, { "epoch": 0.8496637848222862, "grad_norm": 0.5503187612040098, "learning_rate": 9.012314985030445e-06, "loss": 0.0358, "step": 1769 }, { "epoch": 0.8501440922190202, "grad_norm": 0.6316117180141888, "learning_rate": 9.010646866672488e-06, "loss": 0.0384, "step": 1770 }, { "epoch": 0.8506243996157541, "grad_norm": 0.5212663878288761, "learning_rate": 9.008977495499405e-06, "loss": 0.0361, "step": 1771 }, { "epoch": 0.851104707012488, "grad_norm": 0.5547948672955962, "learning_rate": 9.007306872032663e-06, "loss": 0.0446, "step": 1772 }, { "epoch": 0.8515850144092219, "grad_norm": 0.42396510808899085, "learning_rate": 9.005634996794117e-06, "loss": 0.032, "step": 1773 }, { "epoch": 0.8520653218059558, "grad_norm": 0.7725093468772292, "learning_rate": 9.003961870306015e-06, "loss": 0.0549, "step": 1774 }, { "epoch": 0.8525456292026897, "grad_norm": 0.4308369316438503, "learning_rate": 9.002287493090996e-06, "loss": 0.0499, "step": 1775 }, { "epoch": 0.8530259365994236, "grad_norm": 0.2780784001577288, "learning_rate": 9.000611865672088e-06, "loss": 0.0287, "step": 1776 }, { "epoch": 0.8535062439961575, "grad_norm": 0.5180718543954558, "learning_rate": 8.998934988572713e-06, "loss": 0.0329, "step": 1777 }, { "epoch": 0.8539865513928915, "grad_norm": 0.5057214443895521, "learning_rate": 8.997256862316678e-06, "loss": 0.0481, "step": 1778 }, { "epoch": 0.8544668587896254, "grad_norm": 0.3751078985501239, "learning_rate": 8.995577487428187e-06, "loss": 0.0321, "step": 1779 }, { "epoch": 0.8549471661863592, "grad_norm": 0.8942585028670543, "learning_rate": 8.993896864431825e-06, "loss": 0.0442, "step": 1780 }, { "epoch": 0.8554274735830932, "grad_norm": 0.537990863386714, "learning_rate": 8.992214993852576e-06, "loss": 0.0385, "step": 1781 }, { "epoch": 0.8559077809798271, "grad_norm": 0.3368917295945931, "learning_rate": 8.99053187621581e-06, "loss": 0.0358, "step": 1782 }, { "epoch": 0.856388088376561, "grad_norm": 0.3625643306114823, "learning_rate": 8.988847512047285e-06, "loss": 0.0359, "step": 1783 }, { "epoch": 0.8568683957732949, "grad_norm": 0.5392108809967152, "learning_rate": 8.98716190187315e-06, "loss": 0.0445, "step": 1784 }, { "epoch": 0.8573487031700289, "grad_norm": 0.5998276931275567, "learning_rate": 8.985475046219942e-06, "loss": 0.0443, "step": 1785 }, { "epoch": 0.8578290105667628, "grad_norm": 0.5131152759715736, "learning_rate": 8.983786945614589e-06, "loss": 0.0387, "step": 1786 }, { "epoch": 0.8583093179634966, "grad_norm": 0.6910401242120892, "learning_rate": 8.98209760058441e-06, "loss": 0.0469, "step": 1787 }, { "epoch": 0.8587896253602305, "grad_norm": 0.4194901862760443, "learning_rate": 8.980407011657103e-06, "loss": 0.0403, "step": 1788 }, { "epoch": 0.8592699327569645, "grad_norm": 0.46623807550964225, "learning_rate": 8.978715179360767e-06, "loss": 0.0392, "step": 1789 }, { "epoch": 0.8597502401536984, "grad_norm": 0.44859136364628743, "learning_rate": 8.97702210422388e-06, "loss": 0.0458, "step": 1790 }, { "epoch": 0.8602305475504323, "grad_norm": 0.6257891336546498, "learning_rate": 8.975327786775316e-06, "loss": 0.0366, "step": 1791 }, { "epoch": 0.8607108549471661, "grad_norm": 0.4736431571435201, "learning_rate": 8.973632227544326e-06, "loss": 0.0433, "step": 1792 }, { "epoch": 0.8611911623439001, "grad_norm": 0.4152452493682378, "learning_rate": 8.971935427060563e-06, "loss": 0.0405, "step": 1793 }, { "epoch": 0.861671469740634, "grad_norm": 0.9662287838754176, "learning_rate": 8.970237385854059e-06, "loss": 0.0583, "step": 1794 }, { "epoch": 0.8621517771373679, "grad_norm": 0.5056226800498823, "learning_rate": 8.96853810445523e-06, "loss": 0.0476, "step": 1795 }, { "epoch": 0.8626320845341018, "grad_norm": 0.5166750113936333, "learning_rate": 8.966837583394891e-06, "loss": 0.0434, "step": 1796 }, { "epoch": 0.8631123919308358, "grad_norm": 0.5477224778262247, "learning_rate": 8.965135823204232e-06, "loss": 0.0416, "step": 1797 }, { "epoch": 0.8635926993275697, "grad_norm": 0.3837910069912173, "learning_rate": 8.963432824414842e-06, "loss": 0.0268, "step": 1798 }, { "epoch": 0.8640730067243035, "grad_norm": 0.750187104754067, "learning_rate": 8.961728587558684e-06, "loss": 0.0514, "step": 1799 }, { "epoch": 0.8645533141210374, "grad_norm": 0.5116240081446706, "learning_rate": 8.96002311316812e-06, "loss": 0.0362, "step": 1800 }, { "epoch": 0.8650336215177714, "grad_norm": 0.5452866982844962, "learning_rate": 8.95831640177589e-06, "loss": 0.0412, "step": 1801 }, { "epoch": 0.8655139289145053, "grad_norm": 0.3893421475161622, "learning_rate": 8.956608453915126e-06, "loss": 0.0422, "step": 1802 }, { "epoch": 0.8659942363112392, "grad_norm": 0.400744042269121, "learning_rate": 8.954899270119339e-06, "loss": 0.0442, "step": 1803 }, { "epoch": 0.866474543707973, "grad_norm": 0.8459931991973275, "learning_rate": 8.953188850922436e-06, "loss": 0.0319, "step": 1804 }, { "epoch": 0.866954851104707, "grad_norm": 0.4769183729256473, "learning_rate": 8.951477196858703e-06, "loss": 0.0392, "step": 1805 }, { "epoch": 0.8674351585014409, "grad_norm": 0.5961258504118155, "learning_rate": 8.94976430846281e-06, "loss": 0.044, "step": 1806 }, { "epoch": 0.8679154658981748, "grad_norm": 0.7889199748787529, "learning_rate": 8.94805018626982e-06, "loss": 0.0554, "step": 1807 }, { "epoch": 0.8683957732949087, "grad_norm": 0.6776357232960094, "learning_rate": 8.946334830815176e-06, "loss": 0.0474, "step": 1808 }, { "epoch": 0.8688760806916427, "grad_norm": 0.5294621498199231, "learning_rate": 8.944618242634707e-06, "loss": 0.0441, "step": 1809 }, { "epoch": 0.8693563880883766, "grad_norm": 0.40665428772649925, "learning_rate": 8.942900422264627e-06, "loss": 0.0293, "step": 1810 }, { "epoch": 0.8698366954851104, "grad_norm": 0.5854731121778896, "learning_rate": 8.941181370241538e-06, "loss": 0.0472, "step": 1811 }, { "epoch": 0.8703170028818443, "grad_norm": 0.514195838759812, "learning_rate": 8.939461087102424e-06, "loss": 0.0396, "step": 1812 }, { "epoch": 0.8707973102785783, "grad_norm": 0.6456083251358632, "learning_rate": 8.937739573384653e-06, "loss": 0.048, "step": 1813 }, { "epoch": 0.8712776176753122, "grad_norm": 0.40027049247918695, "learning_rate": 8.936016829625977e-06, "loss": 0.0373, "step": 1814 }, { "epoch": 0.8717579250720461, "grad_norm": 1.4781590167762066, "learning_rate": 8.934292856364535e-06, "loss": 0.0706, "step": 1815 }, { "epoch": 0.8722382324687801, "grad_norm": 0.5618715192739092, "learning_rate": 8.932567654138849e-06, "loss": 0.0523, "step": 1816 }, { "epoch": 0.872718539865514, "grad_norm": 0.43422019108974075, "learning_rate": 8.930841223487823e-06, "loss": 0.0464, "step": 1817 }, { "epoch": 0.8731988472622478, "grad_norm": 0.3876967201171116, "learning_rate": 8.929113564950746e-06, "loss": 0.0278, "step": 1818 }, { "epoch": 0.8736791546589817, "grad_norm": 0.46720060258300933, "learning_rate": 8.927384679067293e-06, "loss": 0.0402, "step": 1819 }, { "epoch": 0.8741594620557157, "grad_norm": 0.519410545548104, "learning_rate": 8.925654566377519e-06, "loss": 0.0541, "step": 1820 }, { "epoch": 0.8746397694524496, "grad_norm": 1.022650569670746, "learning_rate": 8.923923227421862e-06, "loss": 0.0564, "step": 1821 }, { "epoch": 0.8751200768491835, "grad_norm": 0.358444548188737, "learning_rate": 8.922190662741146e-06, "loss": 0.0352, "step": 1822 }, { "epoch": 0.8756003842459174, "grad_norm": 0.6535408000392346, "learning_rate": 8.920456872876575e-06, "loss": 0.0679, "step": 1823 }, { "epoch": 0.8760806916426513, "grad_norm": 0.3811923244253041, "learning_rate": 8.918721858369738e-06, "loss": 0.0495, "step": 1824 }, { "epoch": 0.8765609990393852, "grad_norm": 0.502122776772645, "learning_rate": 8.916985619762605e-06, "loss": 0.0371, "step": 1825 }, { "epoch": 0.8770413064361191, "grad_norm": 1.0483402982818049, "learning_rate": 8.915248157597529e-06, "loss": 0.0655, "step": 1826 }, { "epoch": 0.877521613832853, "grad_norm": 0.45223694971335204, "learning_rate": 8.913509472417246e-06, "loss": 0.0311, "step": 1827 }, { "epoch": 0.878001921229587, "grad_norm": 0.4508039587720194, "learning_rate": 8.91176956476487e-06, "loss": 0.0436, "step": 1828 }, { "epoch": 0.8784822286263209, "grad_norm": 0.4562648581626756, "learning_rate": 8.910028435183906e-06, "loss": 0.0358, "step": 1829 }, { "epoch": 0.8789625360230547, "grad_norm": 0.3635207577181714, "learning_rate": 8.90828608421823e-06, "loss": 0.0357, "step": 1830 }, { "epoch": 0.8794428434197886, "grad_norm": 0.5209884592558325, "learning_rate": 8.906542512412105e-06, "loss": 0.0478, "step": 1831 }, { "epoch": 0.8799231508165226, "grad_norm": 0.37857114644619005, "learning_rate": 8.904797720310176e-06, "loss": 0.045, "step": 1832 }, { "epoch": 0.8804034582132565, "grad_norm": 0.8142054557917969, "learning_rate": 8.903051708457465e-06, "loss": 0.0376, "step": 1833 }, { "epoch": 0.8808837656099904, "grad_norm": 0.7396298749741087, "learning_rate": 8.90130447739938e-06, "loss": 0.0589, "step": 1834 }, { "epoch": 0.8813640730067243, "grad_norm": 0.33453688614785887, "learning_rate": 8.899556027681708e-06, "loss": 0.0231, "step": 1835 }, { "epoch": 0.8818443804034583, "grad_norm": 0.4457358953894517, "learning_rate": 8.897806359850614e-06, "loss": 0.0317, "step": 1836 }, { "epoch": 0.8823246878001921, "grad_norm": 1.3515020556940829, "learning_rate": 8.896055474452649e-06, "loss": 0.0427, "step": 1837 }, { "epoch": 0.882804995196926, "grad_norm": 0.4580165993160693, "learning_rate": 8.894303372034738e-06, "loss": 0.0433, "step": 1838 }, { "epoch": 0.8832853025936599, "grad_norm": 0.4503178993927309, "learning_rate": 8.89255005314419e-06, "loss": 0.0373, "step": 1839 }, { "epoch": 0.8837656099903939, "grad_norm": 0.4368170227289143, "learning_rate": 8.890795518328698e-06, "loss": 0.0461, "step": 1840 }, { "epoch": 0.8842459173871278, "grad_norm": 0.5359261995070854, "learning_rate": 8.889039768136323e-06, "loss": 0.0633, "step": 1841 }, { "epoch": 0.8847262247838616, "grad_norm": 0.32753604552230753, "learning_rate": 8.887282803115518e-06, "loss": 0.031, "step": 1842 }, { "epoch": 0.8852065321805955, "grad_norm": 0.561984541895264, "learning_rate": 8.885524623815107e-06, "loss": 0.061, "step": 1843 }, { "epoch": 0.8856868395773295, "grad_norm": 0.4954923412229821, "learning_rate": 8.883765230784298e-06, "loss": 0.0376, "step": 1844 }, { "epoch": 0.8861671469740634, "grad_norm": 0.40554156267027097, "learning_rate": 8.882004624572676e-06, "loss": 0.0286, "step": 1845 }, { "epoch": 0.8866474543707973, "grad_norm": 0.5113849488787681, "learning_rate": 8.880242805730208e-06, "loss": 0.0426, "step": 1846 }, { "epoch": 0.8871277617675313, "grad_norm": 0.37027022381367386, "learning_rate": 8.878479774807235e-06, "loss": 0.0295, "step": 1847 }, { "epoch": 0.8876080691642652, "grad_norm": 0.6769968021961212, "learning_rate": 8.876715532354478e-06, "loss": 0.0452, "step": 1848 }, { "epoch": 0.888088376560999, "grad_norm": 0.48737302282911404, "learning_rate": 8.87495007892304e-06, "loss": 0.0445, "step": 1849 }, { "epoch": 0.8885686839577329, "grad_norm": 0.3386523878498041, "learning_rate": 8.873183415064401e-06, "loss": 0.0333, "step": 1850 }, { "epoch": 0.8890489913544669, "grad_norm": 0.6658697028186209, "learning_rate": 8.871415541330417e-06, "loss": 0.0343, "step": 1851 }, { "epoch": 0.8895292987512008, "grad_norm": 0.4919802717601218, "learning_rate": 8.86964645827332e-06, "loss": 0.0455, "step": 1852 }, { "epoch": 0.8900096061479347, "grad_norm": 0.4263226706660005, "learning_rate": 8.867876166445724e-06, "loss": 0.0418, "step": 1853 }, { "epoch": 0.8904899135446686, "grad_norm": 0.3214256914507718, "learning_rate": 8.86610466640062e-06, "loss": 0.0327, "step": 1854 }, { "epoch": 0.8909702209414025, "grad_norm": 0.6264841255471314, "learning_rate": 8.864331958691377e-06, "loss": 0.0305, "step": 1855 }, { "epoch": 0.8914505283381364, "grad_norm": 1.2829456922872837, "learning_rate": 8.862558043871737e-06, "loss": 0.0585, "step": 1856 }, { "epoch": 0.8919308357348703, "grad_norm": 0.5015853830955843, "learning_rate": 8.860782922495821e-06, "loss": 0.0479, "step": 1857 }, { "epoch": 0.8924111431316042, "grad_norm": 0.6658604832983488, "learning_rate": 8.859006595118133e-06, "loss": 0.0485, "step": 1858 }, { "epoch": 0.8928914505283382, "grad_norm": 0.47639098590814116, "learning_rate": 8.857229062293544e-06, "loss": 0.0388, "step": 1859 }, { "epoch": 0.8933717579250721, "grad_norm": 0.4302098869706598, "learning_rate": 8.855450324577308e-06, "loss": 0.0482, "step": 1860 }, { "epoch": 0.8938520653218059, "grad_norm": 0.6482456313500351, "learning_rate": 8.853670382525052e-06, "loss": 0.0436, "step": 1861 }, { "epoch": 0.8943323727185398, "grad_norm": 0.47660614333387596, "learning_rate": 8.851889236692783e-06, "loss": 0.046, "step": 1862 }, { "epoch": 0.8948126801152738, "grad_norm": 0.6952319224454124, "learning_rate": 8.85010688763688e-06, "loss": 0.0495, "step": 1863 }, { "epoch": 0.8952929875120077, "grad_norm": 0.4901487858015449, "learning_rate": 8.8483233359141e-06, "loss": 0.0358, "step": 1864 }, { "epoch": 0.8957732949087416, "grad_norm": 0.5728646775595035, "learning_rate": 8.846538582081575e-06, "loss": 0.0435, "step": 1865 }, { "epoch": 0.8962536023054755, "grad_norm": 0.44725468456391415, "learning_rate": 8.84475262669681e-06, "loss": 0.0443, "step": 1866 }, { "epoch": 0.8967339097022095, "grad_norm": 0.5773271593291803, "learning_rate": 8.842965470317694e-06, "loss": 0.0424, "step": 1867 }, { "epoch": 0.8972142170989433, "grad_norm": 0.7198849042715714, "learning_rate": 8.84117711350248e-06, "loss": 0.0468, "step": 1868 }, { "epoch": 0.8976945244956772, "grad_norm": 0.5957419746315502, "learning_rate": 8.839387556809805e-06, "loss": 0.0468, "step": 1869 }, { "epoch": 0.8981748318924111, "grad_norm": 0.588750525644505, "learning_rate": 8.837596800798674e-06, "loss": 0.0464, "step": 1870 }, { "epoch": 0.8986551392891451, "grad_norm": 0.4505825678549945, "learning_rate": 8.835804846028473e-06, "loss": 0.0428, "step": 1871 }, { "epoch": 0.899135446685879, "grad_norm": 0.6871356390036418, "learning_rate": 8.834011693058955e-06, "loss": 0.0693, "step": 1872 }, { "epoch": 0.8996157540826129, "grad_norm": 0.4898107165484671, "learning_rate": 8.832217342450253e-06, "loss": 0.0425, "step": 1873 }, { "epoch": 0.9000960614793467, "grad_norm": 0.5624738413790371, "learning_rate": 8.830421794762873e-06, "loss": 0.044, "step": 1874 }, { "epoch": 0.9005763688760807, "grad_norm": 0.5195088744781344, "learning_rate": 8.828625050557695e-06, "loss": 0.0606, "step": 1875 }, { "epoch": 0.9010566762728146, "grad_norm": 0.38938511927268654, "learning_rate": 8.826827110395973e-06, "loss": 0.0336, "step": 1876 }, { "epoch": 0.9015369836695485, "grad_norm": 0.38684793123655187, "learning_rate": 8.82502797483933e-06, "loss": 0.0375, "step": 1877 }, { "epoch": 0.9020172910662824, "grad_norm": 0.7363107702433455, "learning_rate": 8.823227644449767e-06, "loss": 0.0465, "step": 1878 }, { "epoch": 0.9024975984630164, "grad_norm": 0.48895156850885646, "learning_rate": 8.821426119789662e-06, "loss": 0.0483, "step": 1879 }, { "epoch": 0.9029779058597502, "grad_norm": 0.660241990808151, "learning_rate": 8.819623401421757e-06, "loss": 0.0476, "step": 1880 }, { "epoch": 0.9034582132564841, "grad_norm": 0.46543854311143706, "learning_rate": 8.817819489909172e-06, "loss": 0.0403, "step": 1881 }, { "epoch": 0.9039385206532181, "grad_norm": 0.6017118184027497, "learning_rate": 8.8160143858154e-06, "loss": 0.0478, "step": 1882 }, { "epoch": 0.904418828049952, "grad_norm": 0.39226105718412113, "learning_rate": 8.814208089704306e-06, "loss": 0.0418, "step": 1883 }, { "epoch": 0.9048991354466859, "grad_norm": 0.4599436630675356, "learning_rate": 8.812400602140125e-06, "loss": 0.0468, "step": 1884 }, { "epoch": 0.9053794428434198, "grad_norm": 0.8681683906823245, "learning_rate": 8.810591923687468e-06, "loss": 0.0473, "step": 1885 }, { "epoch": 0.9058597502401537, "grad_norm": 0.7322132140172354, "learning_rate": 8.808782054911315e-06, "loss": 0.0662, "step": 1886 }, { "epoch": 0.9063400576368876, "grad_norm": 0.6055607958778498, "learning_rate": 8.806970996377018e-06, "loss": 0.0461, "step": 1887 }, { "epoch": 0.9068203650336215, "grad_norm": 0.4352135953438638, "learning_rate": 8.805158748650304e-06, "loss": 0.0443, "step": 1888 }, { "epoch": 0.9073006724303554, "grad_norm": 0.8207849567629364, "learning_rate": 8.803345312297269e-06, "loss": 0.0686, "step": 1889 }, { "epoch": 0.9077809798270894, "grad_norm": 0.4467690737669225, "learning_rate": 8.801530687884378e-06, "loss": 0.0532, "step": 1890 }, { "epoch": 0.9082612872238233, "grad_norm": 0.5198478158932659, "learning_rate": 8.799714875978472e-06, "loss": 0.0419, "step": 1891 }, { "epoch": 0.9087415946205571, "grad_norm": 1.0896879533925696, "learning_rate": 8.797897877146757e-06, "loss": 0.0734, "step": 1892 }, { "epoch": 0.909221902017291, "grad_norm": 0.3617708509543011, "learning_rate": 8.796079691956818e-06, "loss": 0.0327, "step": 1893 }, { "epoch": 0.909702209414025, "grad_norm": 0.7301142094909887, "learning_rate": 8.794260320976602e-06, "loss": 0.0392, "step": 1894 }, { "epoch": 0.9101825168107589, "grad_norm": 0.4068531645359285, "learning_rate": 8.79243976477443e-06, "loss": 0.0469, "step": 1895 }, { "epoch": 0.9106628242074928, "grad_norm": 0.48618429718856376, "learning_rate": 8.790618023918995e-06, "loss": 0.0558, "step": 1896 }, { "epoch": 0.9111431316042267, "grad_norm": 0.5431200818159274, "learning_rate": 8.788795098979358e-06, "loss": 0.0515, "step": 1897 }, { "epoch": 0.9116234390009607, "grad_norm": 0.6076681250674426, "learning_rate": 8.786970990524952e-06, "loss": 0.043, "step": 1898 }, { "epoch": 0.9121037463976945, "grad_norm": 0.4574042135859862, "learning_rate": 8.785145699125577e-06, "loss": 0.0468, "step": 1899 }, { "epoch": 0.9125840537944284, "grad_norm": 0.7075435238401119, "learning_rate": 8.783319225351401e-06, "loss": 0.0464, "step": 1900 }, { "epoch": 0.9130643611911623, "grad_norm": 0.534633281625552, "learning_rate": 8.781491569772966e-06, "loss": 0.0426, "step": 1901 }, { "epoch": 0.9135446685878963, "grad_norm": 0.3921046587973985, "learning_rate": 8.77966273296118e-06, "loss": 0.0358, "step": 1902 }, { "epoch": 0.9140249759846302, "grad_norm": 0.41990313919226124, "learning_rate": 8.777832715487325e-06, "loss": 0.0285, "step": 1903 }, { "epoch": 0.914505283381364, "grad_norm": 0.41717379390801274, "learning_rate": 8.776001517923042e-06, "loss": 0.0363, "step": 1904 }, { "epoch": 0.9149855907780979, "grad_norm": 0.3366938255497245, "learning_rate": 8.774169140840349e-06, "loss": 0.0374, "step": 1905 }, { "epoch": 0.9154658981748319, "grad_norm": 0.36457948244141064, "learning_rate": 8.772335584811631e-06, "loss": 0.0299, "step": 1906 }, { "epoch": 0.9159462055715658, "grad_norm": 0.4828707338456646, "learning_rate": 8.770500850409641e-06, "loss": 0.0331, "step": 1907 }, { "epoch": 0.9164265129682997, "grad_norm": 0.5966763216669441, "learning_rate": 8.768664938207494e-06, "loss": 0.0498, "step": 1908 }, { "epoch": 0.9169068203650336, "grad_norm": 0.3927338383436385, "learning_rate": 8.766827848778683e-06, "loss": 0.0338, "step": 1909 }, { "epoch": 0.9173871277617676, "grad_norm": 0.5893155428418285, "learning_rate": 8.764989582697065e-06, "loss": 0.0411, "step": 1910 }, { "epoch": 0.9178674351585014, "grad_norm": 0.5356691788194019, "learning_rate": 8.763150140536858e-06, "loss": 0.0398, "step": 1911 }, { "epoch": 0.9183477425552353, "grad_norm": 0.4306294724075641, "learning_rate": 8.761309522872657e-06, "loss": 0.0403, "step": 1912 }, { "epoch": 0.9188280499519692, "grad_norm": 0.4226012125799819, "learning_rate": 8.75946773027942e-06, "loss": 0.0353, "step": 1913 }, { "epoch": 0.9193083573487032, "grad_norm": 0.4136981494543946, "learning_rate": 8.75762476333247e-06, "loss": 0.0362, "step": 1914 }, { "epoch": 0.9197886647454371, "grad_norm": 0.6346827624212021, "learning_rate": 8.755780622607499e-06, "loss": 0.0445, "step": 1915 }, { "epoch": 0.920268972142171, "grad_norm": 0.558450568203092, "learning_rate": 8.753935308680568e-06, "loss": 0.043, "step": 1916 }, { "epoch": 0.920749279538905, "grad_norm": 0.43965358503774565, "learning_rate": 8.7520888221281e-06, "loss": 0.0495, "step": 1917 }, { "epoch": 0.9212295869356388, "grad_norm": 0.5476700265664696, "learning_rate": 8.750241163526887e-06, "loss": 0.0347, "step": 1918 }, { "epoch": 0.9217098943323727, "grad_norm": 0.39304076802918375, "learning_rate": 8.748392333454085e-06, "loss": 0.0349, "step": 1919 }, { "epoch": 0.9221902017291066, "grad_norm": 0.4971949825044665, "learning_rate": 8.74654233248722e-06, "loss": 0.0395, "step": 1920 }, { "epoch": 0.9226705091258406, "grad_norm": 0.5186534483519105, "learning_rate": 8.74469116120418e-06, "loss": 0.0307, "step": 1921 }, { "epoch": 0.9231508165225745, "grad_norm": 0.3224287987538711, "learning_rate": 8.742838820183218e-06, "loss": 0.024, "step": 1922 }, { "epoch": 0.9236311239193083, "grad_norm": 0.4280132539326167, "learning_rate": 8.740985310002956e-06, "loss": 0.0417, "step": 1923 }, { "epoch": 0.9241114313160422, "grad_norm": 0.41045791987815555, "learning_rate": 8.739130631242379e-06, "loss": 0.0359, "step": 1924 }, { "epoch": 0.9245917387127762, "grad_norm": 0.5184741851396398, "learning_rate": 8.737274784480839e-06, "loss": 0.0563, "step": 1925 }, { "epoch": 0.9250720461095101, "grad_norm": 0.31914533082950014, "learning_rate": 8.735417770298046e-06, "loss": 0.0299, "step": 1926 }, { "epoch": 0.925552353506244, "grad_norm": 0.5115763990615543, "learning_rate": 8.733559589274086e-06, "loss": 0.0495, "step": 1927 }, { "epoch": 0.9260326609029779, "grad_norm": 0.4761196913567422, "learning_rate": 8.731700241989398e-06, "loss": 0.0445, "step": 1928 }, { "epoch": 0.9265129682997119, "grad_norm": 0.551401916157282, "learning_rate": 8.729839729024794e-06, "loss": 0.036, "step": 1929 }, { "epoch": 0.9269932756964457, "grad_norm": 0.3446484842092706, "learning_rate": 8.727978050961446e-06, "loss": 0.036, "step": 1930 }, { "epoch": 0.9274735830931796, "grad_norm": 0.3994863651754855, "learning_rate": 8.726115208380892e-06, "loss": 0.0445, "step": 1931 }, { "epoch": 0.9279538904899135, "grad_norm": 0.39412495064642333, "learning_rate": 8.724251201865029e-06, "loss": 0.0495, "step": 1932 }, { "epoch": 0.9284341978866475, "grad_norm": 0.6031610926197576, "learning_rate": 8.722386031996124e-06, "loss": 0.0532, "step": 1933 }, { "epoch": 0.9289145052833814, "grad_norm": 0.4130746229906913, "learning_rate": 8.720519699356804e-06, "loss": 0.0438, "step": 1934 }, { "epoch": 0.9293948126801153, "grad_norm": 0.4945763714239433, "learning_rate": 8.71865220453006e-06, "loss": 0.0537, "step": 1935 }, { "epoch": 0.9298751200768491, "grad_norm": 0.6642151852115892, "learning_rate": 8.716783548099243e-06, "loss": 0.0305, "step": 1936 }, { "epoch": 0.9303554274735831, "grad_norm": 0.4808430941047988, "learning_rate": 8.714913730648073e-06, "loss": 0.0383, "step": 1937 }, { "epoch": 0.930835734870317, "grad_norm": 0.38784696307406447, "learning_rate": 8.713042752760629e-06, "loss": 0.0427, "step": 1938 }, { "epoch": 0.9313160422670509, "grad_norm": 0.46691293249094495, "learning_rate": 8.71117061502135e-06, "loss": 0.037, "step": 1939 }, { "epoch": 0.9317963496637848, "grad_norm": 0.528772802563039, "learning_rate": 8.709297318015042e-06, "loss": 0.0413, "step": 1940 }, { "epoch": 0.9322766570605188, "grad_norm": 0.49227721232250315, "learning_rate": 8.707422862326872e-06, "loss": 0.0359, "step": 1941 }, { "epoch": 0.9327569644572526, "grad_norm": 0.47166989196463194, "learning_rate": 8.705547248542366e-06, "loss": 0.0362, "step": 1942 }, { "epoch": 0.9332372718539865, "grad_norm": 0.5256984649221889, "learning_rate": 8.703670477247415e-06, "loss": 0.0483, "step": 1943 }, { "epoch": 0.9337175792507204, "grad_norm": 0.4307455296849523, "learning_rate": 8.701792549028269e-06, "loss": 0.0487, "step": 1944 }, { "epoch": 0.9341978866474544, "grad_norm": 0.46947997670245323, "learning_rate": 8.699913464471543e-06, "loss": 0.0441, "step": 1945 }, { "epoch": 0.9346781940441883, "grad_norm": 0.5890864790257363, "learning_rate": 8.69803322416421e-06, "loss": 0.0629, "step": 1946 }, { "epoch": 0.9351585014409222, "grad_norm": 0.380486982939282, "learning_rate": 8.696151828693606e-06, "loss": 0.0367, "step": 1947 }, { "epoch": 0.9356388088376562, "grad_norm": 0.42021576660117066, "learning_rate": 8.694269278647425e-06, "loss": 0.0379, "step": 1948 }, { "epoch": 0.93611911623439, "grad_norm": 0.41391650206618286, "learning_rate": 8.692385574613725e-06, "loss": 0.0365, "step": 1949 }, { "epoch": 0.9365994236311239, "grad_norm": 0.6058118454984688, "learning_rate": 8.690500717180924e-06, "loss": 0.065, "step": 1950 }, { "epoch": 0.9370797310278578, "grad_norm": 0.4144008472750772, "learning_rate": 8.688614706937794e-06, "loss": 0.0337, "step": 1951 }, { "epoch": 0.9375600384245918, "grad_norm": 0.46837940872225015, "learning_rate": 8.68672754447348e-06, "loss": 0.0539, "step": 1952 }, { "epoch": 0.9380403458213257, "grad_norm": 0.4192145916135169, "learning_rate": 8.684839230377475e-06, "loss": 0.0324, "step": 1953 }, { "epoch": 0.9385206532180596, "grad_norm": 0.5489777561757881, "learning_rate": 8.682949765239636e-06, "loss": 0.0357, "step": 1954 }, { "epoch": 0.9390009606147934, "grad_norm": 0.34093817269965004, "learning_rate": 8.681059149650181e-06, "loss": 0.0268, "step": 1955 }, { "epoch": 0.9394812680115274, "grad_norm": 0.395654696038833, "learning_rate": 8.679167384199686e-06, "loss": 0.0286, "step": 1956 }, { "epoch": 0.9399615754082613, "grad_norm": 0.2944331212675771, "learning_rate": 8.677274469479083e-06, "loss": 0.0263, "step": 1957 }, { "epoch": 0.9404418828049952, "grad_norm": 0.6932183759828536, "learning_rate": 8.67538040607967e-06, "loss": 0.0459, "step": 1958 }, { "epoch": 0.9409221902017291, "grad_norm": 0.9103586040099304, "learning_rate": 8.6734851945931e-06, "loss": 0.046, "step": 1959 }, { "epoch": 0.9414024975984631, "grad_norm": 0.4687709996696501, "learning_rate": 8.671588835611381e-06, "loss": 0.0448, "step": 1960 }, { "epoch": 0.9418828049951969, "grad_norm": 0.9039414492489983, "learning_rate": 8.669691329726888e-06, "loss": 0.0412, "step": 1961 }, { "epoch": 0.9423631123919308, "grad_norm": 0.5555653981562542, "learning_rate": 8.667792677532346e-06, "loss": 0.0339, "step": 1962 }, { "epoch": 0.9428434197886647, "grad_norm": 0.444243469483319, "learning_rate": 8.665892879620843e-06, "loss": 0.0427, "step": 1963 }, { "epoch": 0.9433237271853987, "grad_norm": 0.342635961775897, "learning_rate": 8.663991936585821e-06, "loss": 0.0337, "step": 1964 }, { "epoch": 0.9438040345821326, "grad_norm": 0.6397758038332509, "learning_rate": 8.662089849021086e-06, "loss": 0.0505, "step": 1965 }, { "epoch": 0.9442843419788665, "grad_norm": 0.5964279866861497, "learning_rate": 8.660186617520792e-06, "loss": 0.0358, "step": 1966 }, { "epoch": 0.9447646493756003, "grad_norm": 0.5659109590321414, "learning_rate": 8.658282242679461e-06, "loss": 0.0418, "step": 1967 }, { "epoch": 0.9452449567723343, "grad_norm": 0.506925916793818, "learning_rate": 8.656376725091965e-06, "loss": 0.0417, "step": 1968 }, { "epoch": 0.9457252641690682, "grad_norm": 0.48404249293897045, "learning_rate": 8.654470065353535e-06, "loss": 0.0429, "step": 1969 }, { "epoch": 0.9462055715658021, "grad_norm": 1.6891950338293646, "learning_rate": 8.652562264059758e-06, "loss": 0.0376, "step": 1970 }, { "epoch": 0.946685878962536, "grad_norm": 1.4094881449650138, "learning_rate": 8.65065332180658e-06, "loss": 0.0351, "step": 1971 }, { "epoch": 0.94716618635927, "grad_norm": 0.5177625328838332, "learning_rate": 8.6487432391903e-06, "loss": 0.0343, "step": 1972 }, { "epoch": 0.9476464937560038, "grad_norm": 0.5743763929907509, "learning_rate": 8.646832016807576e-06, "loss": 0.0366, "step": 1973 }, { "epoch": 0.9481268011527377, "grad_norm": 1.2258062310893443, "learning_rate": 8.644919655255421e-06, "loss": 0.0508, "step": 1974 }, { "epoch": 0.9486071085494716, "grad_norm": 0.9221165067101089, "learning_rate": 8.643006155131204e-06, "loss": 0.0673, "step": 1975 }, { "epoch": 0.9490874159462056, "grad_norm": 0.6966238615893148, "learning_rate": 8.641091517032648e-06, "loss": 0.0738, "step": 1976 }, { "epoch": 0.9495677233429395, "grad_norm": 0.4344930032517037, "learning_rate": 8.639175741557835e-06, "loss": 0.0326, "step": 1977 }, { "epoch": 0.9500480307396734, "grad_norm": 0.7423186859550626, "learning_rate": 8.6372588293052e-06, "loss": 0.0446, "step": 1978 }, { "epoch": 0.9505283381364072, "grad_norm": 0.4731476879098025, "learning_rate": 8.635340780873531e-06, "loss": 0.0353, "step": 1979 }, { "epoch": 0.9510086455331412, "grad_norm": 0.6600870526226297, "learning_rate": 8.633421596861977e-06, "loss": 0.0569, "step": 1980 }, { "epoch": 0.9514889529298751, "grad_norm": 0.5036533651424044, "learning_rate": 8.631501277870034e-06, "loss": 0.0406, "step": 1981 }, { "epoch": 0.951969260326609, "grad_norm": 1.3169404498677184, "learning_rate": 8.62957982449756e-06, "loss": 0.0786, "step": 1982 }, { "epoch": 0.952449567723343, "grad_norm": 0.7672144324242122, "learning_rate": 8.627657237344762e-06, "loss": 0.0437, "step": 1983 }, { "epoch": 0.9529298751200769, "grad_norm": 0.41972977550479074, "learning_rate": 8.625733517012202e-06, "loss": 0.0477, "step": 1984 }, { "epoch": 0.9534101825168108, "grad_norm": 0.507878350787633, "learning_rate": 8.6238086641008e-06, "loss": 0.0367, "step": 1985 }, { "epoch": 0.9538904899135446, "grad_norm": 0.516358838359933, "learning_rate": 8.621882679211826e-06, "loss": 0.0478, "step": 1986 }, { "epoch": 0.9543707973102786, "grad_norm": 0.3726975644915236, "learning_rate": 8.619955562946902e-06, "loss": 0.0341, "step": 1987 }, { "epoch": 0.9548511047070125, "grad_norm": 0.5774338343855023, "learning_rate": 8.618027315908009e-06, "loss": 0.0454, "step": 1988 }, { "epoch": 0.9553314121037464, "grad_norm": 0.710615004605156, "learning_rate": 8.616097938697476e-06, "loss": 0.0448, "step": 1989 }, { "epoch": 0.9558117195004803, "grad_norm": 0.9510665106194851, "learning_rate": 8.614167431917986e-06, "loss": 0.0543, "step": 1990 }, { "epoch": 0.9562920268972143, "grad_norm": 0.4147522768624343, "learning_rate": 8.612235796172579e-06, "loss": 0.0328, "step": 1991 }, { "epoch": 0.9567723342939481, "grad_norm": 0.5983476792554373, "learning_rate": 8.610303032064642e-06, "loss": 0.0606, "step": 1992 }, { "epoch": 0.957252641690682, "grad_norm": 0.3768025246044265, "learning_rate": 8.60836914019792e-06, "loss": 0.0323, "step": 1993 }, { "epoch": 0.9577329490874159, "grad_norm": 0.5553521298460753, "learning_rate": 8.606434121176504e-06, "loss": 0.0502, "step": 1994 }, { "epoch": 0.9582132564841499, "grad_norm": 0.6171063653286382, "learning_rate": 8.60449797560484e-06, "loss": 0.0505, "step": 1995 }, { "epoch": 0.9586935638808838, "grad_norm": 0.5729885075449906, "learning_rate": 8.60256070408773e-06, "loss": 0.0534, "step": 1996 }, { "epoch": 0.9591738712776177, "grad_norm": 0.4954254466131748, "learning_rate": 8.600622307230323e-06, "loss": 0.0339, "step": 1997 }, { "epoch": 0.9596541786743515, "grad_norm": 0.36497637328284854, "learning_rate": 8.598682785638119e-06, "loss": 0.0293, "step": 1998 }, { "epoch": 0.9601344860710855, "grad_norm": 0.5588614544226611, "learning_rate": 8.59674213991697e-06, "loss": 0.0497, "step": 1999 }, { "epoch": 0.9606147934678194, "grad_norm": 0.34528031432849365, "learning_rate": 8.594800370673083e-06, "loss": 0.032, "step": 2000 }, { "epoch": 0.9610951008645533, "grad_norm": 0.6906177073677481, "learning_rate": 8.592857478513011e-06, "loss": 0.0485, "step": 2001 }, { "epoch": 0.9615754082612872, "grad_norm": 0.3546432685965284, "learning_rate": 8.590913464043661e-06, "loss": 0.0297, "step": 2002 }, { "epoch": 0.9620557156580212, "grad_norm": 0.7379863818566214, "learning_rate": 8.58896832787229e-06, "loss": 0.0551, "step": 2003 }, { "epoch": 0.962536023054755, "grad_norm": 0.43001383506184815, "learning_rate": 8.5870220706065e-06, "loss": 0.0345, "step": 2004 }, { "epoch": 0.9630163304514889, "grad_norm": 0.40952514169054416, "learning_rate": 8.585074692854254e-06, "loss": 0.0387, "step": 2005 }, { "epoch": 0.9634966378482228, "grad_norm": 0.5621926001572276, "learning_rate": 8.583126195223854e-06, "loss": 0.0453, "step": 2006 }, { "epoch": 0.9639769452449568, "grad_norm": 0.6202940152268431, "learning_rate": 8.581176578323962e-06, "loss": 0.0455, "step": 2007 }, { "epoch": 0.9644572526416907, "grad_norm": 0.4253832733284014, "learning_rate": 8.579225842763578e-06, "loss": 0.038, "step": 2008 }, { "epoch": 0.9649375600384246, "grad_norm": 0.5301193530668448, "learning_rate": 8.577273989152063e-06, "loss": 0.045, "step": 2009 }, { "epoch": 0.9654178674351584, "grad_norm": 0.4447740781906591, "learning_rate": 8.575321018099122e-06, "loss": 0.0463, "step": 2010 }, { "epoch": 0.9658981748318924, "grad_norm": 0.5647765842461724, "learning_rate": 8.573366930214807e-06, "loss": 0.0382, "step": 2011 }, { "epoch": 0.9663784822286263, "grad_norm": 0.3880514422764617, "learning_rate": 8.571411726109518e-06, "loss": 0.0406, "step": 2012 }, { "epoch": 0.9668587896253602, "grad_norm": 0.49001392319362364, "learning_rate": 8.569455406394013e-06, "loss": 0.0379, "step": 2013 }, { "epoch": 0.9673390970220941, "grad_norm": 0.5021370712990999, "learning_rate": 8.567497971679387e-06, "loss": 0.0422, "step": 2014 }, { "epoch": 0.9678194044188281, "grad_norm": 0.46323226363095604, "learning_rate": 8.565539422577093e-06, "loss": 0.0404, "step": 2015 }, { "epoch": 0.968299711815562, "grad_norm": 0.41870207201506, "learning_rate": 8.563579759698925e-06, "loss": 0.0374, "step": 2016 }, { "epoch": 0.9687800192122958, "grad_norm": 0.45414111344706365, "learning_rate": 8.561618983657028e-06, "loss": 0.0418, "step": 2017 }, { "epoch": 0.9692603266090298, "grad_norm": 0.5305044501921113, "learning_rate": 8.559657095063893e-06, "loss": 0.0416, "step": 2018 }, { "epoch": 0.9697406340057637, "grad_norm": 0.5045574789523771, "learning_rate": 8.557694094532361e-06, "loss": 0.0324, "step": 2019 }, { "epoch": 0.9702209414024976, "grad_norm": 0.4492848331956523, "learning_rate": 8.555729982675619e-06, "loss": 0.0434, "step": 2020 }, { "epoch": 0.9707012487992315, "grad_norm": 0.4461437770479501, "learning_rate": 8.5537647601072e-06, "loss": 0.0363, "step": 2021 }, { "epoch": 0.9711815561959655, "grad_norm": 0.3915155679851884, "learning_rate": 8.551798427440985e-06, "loss": 0.0325, "step": 2022 }, { "epoch": 0.9716618635926993, "grad_norm": 0.4611826440449828, "learning_rate": 8.549830985291206e-06, "loss": 0.0485, "step": 2023 }, { "epoch": 0.9721421709894332, "grad_norm": 0.37428792596353483, "learning_rate": 8.547862434272431e-06, "loss": 0.0351, "step": 2024 }, { "epoch": 0.9726224783861671, "grad_norm": 0.323502628978833, "learning_rate": 8.545892774999589e-06, "loss": 0.0274, "step": 2025 }, { "epoch": 0.9731027857829011, "grad_norm": 0.6817640871809518, "learning_rate": 8.543922008087938e-06, "loss": 0.0429, "step": 2026 }, { "epoch": 0.973583093179635, "grad_norm": 0.503635291249707, "learning_rate": 8.541950134153099e-06, "loss": 0.0365, "step": 2027 }, { "epoch": 0.9740634005763689, "grad_norm": 0.5092392383162542, "learning_rate": 8.539977153811024e-06, "loss": 0.0412, "step": 2028 }, { "epoch": 0.9745437079731027, "grad_norm": 0.5178003304544473, "learning_rate": 8.538003067678022e-06, "loss": 0.0554, "step": 2029 }, { "epoch": 0.9750240153698367, "grad_norm": 0.35350741688529513, "learning_rate": 8.536027876370743e-06, "loss": 0.0346, "step": 2030 }, { "epoch": 0.9755043227665706, "grad_norm": 0.36618272757164555, "learning_rate": 8.53405158050618e-06, "loss": 0.0351, "step": 2031 }, { "epoch": 0.9759846301633045, "grad_norm": 0.4057826094482221, "learning_rate": 8.532074180701674e-06, "loss": 0.0394, "step": 2032 }, { "epoch": 0.9764649375600384, "grad_norm": 0.5037133903672764, "learning_rate": 8.53009567757491e-06, "loss": 0.0353, "step": 2033 }, { "epoch": 0.9769452449567724, "grad_norm": 0.45708751529472025, "learning_rate": 8.528116071743917e-06, "loss": 0.039, "step": 2034 }, { "epoch": 0.9774255523535063, "grad_norm": 0.4474137964593103, "learning_rate": 8.52613536382707e-06, "loss": 0.0482, "step": 2035 }, { "epoch": 0.9779058597502401, "grad_norm": 0.8437380174457212, "learning_rate": 8.524153554443088e-06, "loss": 0.0457, "step": 2036 }, { "epoch": 0.978386167146974, "grad_norm": 0.4317132378185624, "learning_rate": 8.522170644211032e-06, "loss": 0.0433, "step": 2037 }, { "epoch": 0.978866474543708, "grad_norm": 0.5306878906801935, "learning_rate": 8.520186633750309e-06, "loss": 0.0442, "step": 2038 }, { "epoch": 0.9793467819404419, "grad_norm": 0.4430347167684453, "learning_rate": 8.518201523680668e-06, "loss": 0.0472, "step": 2039 }, { "epoch": 0.9798270893371758, "grad_norm": 0.40367863591910896, "learning_rate": 8.516215314622203e-06, "loss": 0.0547, "step": 2040 }, { "epoch": 0.9803073967339097, "grad_norm": 1.0144384311645978, "learning_rate": 8.514228007195354e-06, "loss": 0.0632, "step": 2041 }, { "epoch": 0.9807877041306436, "grad_norm": 0.6255894136401294, "learning_rate": 8.512239602020894e-06, "loss": 0.0374, "step": 2042 }, { "epoch": 0.9812680115273775, "grad_norm": 0.29213416072835724, "learning_rate": 8.510250099719953e-06, "loss": 0.0302, "step": 2043 }, { "epoch": 0.9817483189241114, "grad_norm": 0.48617017713788496, "learning_rate": 8.50825950091399e-06, "loss": 0.049, "step": 2044 }, { "epoch": 0.9822286263208453, "grad_norm": 0.5170970045891808, "learning_rate": 8.506267806224817e-06, "loss": 0.0418, "step": 2045 }, { "epoch": 0.9827089337175793, "grad_norm": 0.5619222767720121, "learning_rate": 8.504275016274584e-06, "loss": 0.0477, "step": 2046 }, { "epoch": 0.9831892411143132, "grad_norm": 0.34210045056579325, "learning_rate": 8.502281131685783e-06, "loss": 0.0352, "step": 2047 }, { "epoch": 0.983669548511047, "grad_norm": 0.4991446492244405, "learning_rate": 8.500286153081248e-06, "loss": 0.0301, "step": 2048 }, { "epoch": 0.984149855907781, "grad_norm": 0.44922768509123856, "learning_rate": 8.498290081084156e-06, "loss": 0.0341, "step": 2049 }, { "epoch": 0.9846301633045149, "grad_norm": 0.307944880192032, "learning_rate": 8.496292916318023e-06, "loss": 0.025, "step": 2050 }, { "epoch": 0.9851104707012488, "grad_norm": 0.5664099644492291, "learning_rate": 8.494294659406709e-06, "loss": 0.0487, "step": 2051 }, { "epoch": 0.9855907780979827, "grad_norm": 0.7989706431633349, "learning_rate": 8.492295310974416e-06, "loss": 0.0515, "step": 2052 }, { "epoch": 0.9860710854947167, "grad_norm": 0.4172068934478134, "learning_rate": 8.490294871645681e-06, "loss": 0.037, "step": 2053 }, { "epoch": 0.9865513928914506, "grad_norm": 0.44711071451716344, "learning_rate": 8.488293342045391e-06, "loss": 0.0358, "step": 2054 }, { "epoch": 0.9870317002881844, "grad_norm": 0.394037791565088, "learning_rate": 8.486290722798765e-06, "loss": 0.0352, "step": 2055 }, { "epoch": 0.9875120076849183, "grad_norm": 0.44258059109076087, "learning_rate": 8.484287014531366e-06, "loss": 0.05, "step": 2056 }, { "epoch": 0.9879923150816523, "grad_norm": 0.4913903621371682, "learning_rate": 8.482282217869096e-06, "loss": 0.0418, "step": 2057 }, { "epoch": 0.9884726224783862, "grad_norm": 0.5964000018471872, "learning_rate": 8.480276333438203e-06, "loss": 0.0394, "step": 2058 }, { "epoch": 0.9889529298751201, "grad_norm": 0.42025505935398855, "learning_rate": 8.478269361865264e-06, "loss": 0.0315, "step": 2059 }, { "epoch": 0.989433237271854, "grad_norm": 0.3825968875041374, "learning_rate": 8.476261303777205e-06, "loss": 0.0267, "step": 2060 }, { "epoch": 0.9899135446685879, "grad_norm": 0.5266897373376677, "learning_rate": 8.474252159801287e-06, "loss": 0.0542, "step": 2061 }, { "epoch": 0.9903938520653218, "grad_norm": 0.9138199723571926, "learning_rate": 8.472241930565108e-06, "loss": 0.043, "step": 2062 }, { "epoch": 0.9908741594620557, "grad_norm": 0.4491153125193229, "learning_rate": 8.470230616696613e-06, "loss": 0.0416, "step": 2063 }, { "epoch": 0.9913544668587896, "grad_norm": 0.39648825911272, "learning_rate": 8.468218218824078e-06, "loss": 0.0308, "step": 2064 }, { "epoch": 0.9918347742555236, "grad_norm": 0.4266101634870432, "learning_rate": 8.46620473757612e-06, "loss": 0.0398, "step": 2065 }, { "epoch": 0.9923150816522575, "grad_norm": 0.46728827118397764, "learning_rate": 8.464190173581698e-06, "loss": 0.0332, "step": 2066 }, { "epoch": 0.9927953890489913, "grad_norm": 0.402026280201352, "learning_rate": 8.462174527470102e-06, "loss": 0.0306, "step": 2067 }, { "epoch": 0.9932756964457252, "grad_norm": 1.2753342168137747, "learning_rate": 8.460157799870967e-06, "loss": 0.0422, "step": 2068 }, { "epoch": 0.9937560038424592, "grad_norm": 0.48784901997970864, "learning_rate": 8.45813999141426e-06, "loss": 0.045, "step": 2069 }, { "epoch": 0.9942363112391931, "grad_norm": 0.4598639642272694, "learning_rate": 8.456121102730293e-06, "loss": 0.0339, "step": 2070 }, { "epoch": 0.994716618635927, "grad_norm": 0.43585915181157464, "learning_rate": 8.454101134449706e-06, "loss": 0.0495, "step": 2071 }, { "epoch": 0.9951969260326609, "grad_norm": 0.47450866187493235, "learning_rate": 8.452080087203484e-06, "loss": 0.0454, "step": 2072 }, { "epoch": 0.9956772334293948, "grad_norm": 0.40100519344897884, "learning_rate": 8.450057961622945e-06, "loss": 0.032, "step": 2073 }, { "epoch": 0.9961575408261287, "grad_norm": 0.3834770013645444, "learning_rate": 8.448034758339747e-06, "loss": 0.0366, "step": 2074 }, { "epoch": 0.9966378482228626, "grad_norm": 0.4018989482709451, "learning_rate": 8.446010477985882e-06, "loss": 0.0434, "step": 2075 }, { "epoch": 0.9971181556195965, "grad_norm": 0.44809689260265556, "learning_rate": 8.443985121193679e-06, "loss": 0.0464, "step": 2076 }, { "epoch": 0.9975984630163305, "grad_norm": 0.2905562947958189, "learning_rate": 8.441958688595802e-06, "loss": 0.0282, "step": 2077 }, { "epoch": 0.9980787704130644, "grad_norm": 0.7881415875093454, "learning_rate": 8.439931180825253e-06, "loss": 0.037, "step": 2078 }, { "epoch": 0.9985590778097982, "grad_norm": 0.38631353813613173, "learning_rate": 8.437902598515371e-06, "loss": 0.0406, "step": 2079 }, { "epoch": 0.9990393852065321, "grad_norm": 0.38950921399372224, "learning_rate": 8.435872942299827e-06, "loss": 0.0348, "step": 2080 }, { "epoch": 0.9995196926032661, "grad_norm": 0.5056529616402811, "learning_rate": 8.433842212812632e-06, "loss": 0.0509, "step": 2081 }, { "epoch": 1.0, "grad_norm": 0.5020057590352653, "learning_rate": 8.431810410688126e-06, "loss": 0.0373, "step": 2082 }, { "epoch": 1.0, "eval_loss": 0.03837773576378822, "eval_runtime": 509.5894, "eval_samples_per_second": 32.828, "eval_steps_per_second": 1.026, "step": 2082 }, { "epoch": 1.0004803073967339, "grad_norm": 0.3381715893609759, "learning_rate": 8.429777536560992e-06, "loss": 0.037, "step": 2083 }, { "epoch": 1.0009606147934678, "grad_norm": 0.609888699091212, "learning_rate": 8.427743591066241e-06, "loss": 0.0289, "step": 2084 }, { "epoch": 1.0014409221902016, "grad_norm": 0.3290378090727995, "learning_rate": 8.425708574839221e-06, "loss": 0.0342, "step": 2085 }, { "epoch": 1.0019212295869357, "grad_norm": 0.393539809851449, "learning_rate": 8.423672488515617e-06, "loss": 0.0323, "step": 2086 }, { "epoch": 1.0024015369836696, "grad_norm": 0.5832125175163978, "learning_rate": 8.421635332731443e-06, "loss": 0.0365, "step": 2087 }, { "epoch": 1.0028818443804035, "grad_norm": 0.41205511216875335, "learning_rate": 8.419597108123054e-06, "loss": 0.0336, "step": 2088 }, { "epoch": 1.0033621517771374, "grad_norm": 0.41255941159570103, "learning_rate": 8.417557815327131e-06, "loss": 0.0283, "step": 2089 }, { "epoch": 1.0038424591738713, "grad_norm": 0.3773371589665951, "learning_rate": 8.415517454980696e-06, "loss": 0.034, "step": 2090 }, { "epoch": 1.0043227665706052, "grad_norm": 0.4902551581278281, "learning_rate": 8.413476027721097e-06, "loss": 0.0458, "step": 2091 }, { "epoch": 1.004803073967339, "grad_norm": 0.28428153432378256, "learning_rate": 8.411433534186025e-06, "loss": 0.0275, "step": 2092 }, { "epoch": 1.005283381364073, "grad_norm": 0.43593181428678834, "learning_rate": 8.409389975013492e-06, "loss": 0.0359, "step": 2093 }, { "epoch": 1.005763688760807, "grad_norm": 1.3785888826755297, "learning_rate": 8.407345350841855e-06, "loss": 0.0494, "step": 2094 }, { "epoch": 1.006243996157541, "grad_norm": 0.35608517267897777, "learning_rate": 8.405299662309794e-06, "loss": 0.0264, "step": 2095 }, { "epoch": 1.0067243035542748, "grad_norm": 0.37950665336313133, "learning_rate": 8.40325291005633e-06, "loss": 0.0415, "step": 2096 }, { "epoch": 1.0072046109510087, "grad_norm": 0.4910462837795447, "learning_rate": 8.401205094720808e-06, "loss": 0.0363, "step": 2097 }, { "epoch": 1.0076849183477425, "grad_norm": 0.3976512513162833, "learning_rate": 8.399156216942909e-06, "loss": 0.0325, "step": 2098 }, { "epoch": 1.0081652257444764, "grad_norm": 0.5997013968480361, "learning_rate": 8.397106277362647e-06, "loss": 0.0362, "step": 2099 }, { "epoch": 1.0086455331412103, "grad_norm": 0.43970230943947863, "learning_rate": 8.395055276620365e-06, "loss": 0.0368, "step": 2100 }, { "epoch": 1.0091258405379442, "grad_norm": 0.526000465710968, "learning_rate": 8.393003215356742e-06, "loss": 0.0326, "step": 2101 }, { "epoch": 1.0096061479346783, "grad_norm": 0.5569505661835298, "learning_rate": 8.390950094212783e-06, "loss": 0.0464, "step": 2102 }, { "epoch": 1.0100864553314122, "grad_norm": 0.7301303874831994, "learning_rate": 8.388895913829825e-06, "loss": 0.0468, "step": 2103 }, { "epoch": 1.010566762728146, "grad_norm": 0.30080519694671837, "learning_rate": 8.386840674849538e-06, "loss": 0.0231, "step": 2104 }, { "epoch": 1.01104707012488, "grad_norm": 0.36167429882161295, "learning_rate": 8.384784377913923e-06, "loss": 0.0367, "step": 2105 }, { "epoch": 1.0115273775216138, "grad_norm": 0.4471823347883337, "learning_rate": 8.382727023665312e-06, "loss": 0.0372, "step": 2106 }, { "epoch": 1.0120076849183477, "grad_norm": 0.57961227888213, "learning_rate": 8.380668612746362e-06, "loss": 0.0402, "step": 2107 }, { "epoch": 1.0124879923150816, "grad_norm": 0.5013681711140076, "learning_rate": 8.378609145800062e-06, "loss": 0.035, "step": 2108 }, { "epoch": 1.0129682997118155, "grad_norm": 0.5940627129039502, "learning_rate": 8.376548623469737e-06, "loss": 0.0563, "step": 2109 }, { "epoch": 1.0134486071085496, "grad_norm": 0.42357030722727523, "learning_rate": 8.374487046399035e-06, "loss": 0.0346, "step": 2110 }, { "epoch": 1.0139289145052834, "grad_norm": 0.700305582805866, "learning_rate": 8.372424415231936e-06, "loss": 0.0456, "step": 2111 }, { "epoch": 1.0144092219020173, "grad_norm": 1.0010032199337793, "learning_rate": 8.37036073061275e-06, "loss": 0.0358, "step": 2112 }, { "epoch": 1.0148895292987512, "grad_norm": 0.4541897759361774, "learning_rate": 8.368295993186113e-06, "loss": 0.0375, "step": 2113 }, { "epoch": 1.015369836695485, "grad_norm": 0.41823187892252156, "learning_rate": 8.366230203596995e-06, "loss": 0.0311, "step": 2114 }, { "epoch": 1.015850144092219, "grad_norm": 0.38901699023725467, "learning_rate": 8.364163362490686e-06, "loss": 0.029, "step": 2115 }, { "epoch": 1.0163304514889528, "grad_norm": 0.3403787685486139, "learning_rate": 8.362095470512815e-06, "loss": 0.0318, "step": 2116 }, { "epoch": 1.0168107588856867, "grad_norm": 0.5281839131665715, "learning_rate": 8.360026528309332e-06, "loss": 0.0361, "step": 2117 }, { "epoch": 1.0172910662824208, "grad_norm": 0.4449834683187966, "learning_rate": 8.357956536526518e-06, "loss": 0.0369, "step": 2118 }, { "epoch": 1.0177713736791547, "grad_norm": 0.5668193705432205, "learning_rate": 8.355885495810981e-06, "loss": 0.0378, "step": 2119 }, { "epoch": 1.0182516810758886, "grad_norm": 0.595340603994674, "learning_rate": 8.353813406809657e-06, "loss": 0.0403, "step": 2120 }, { "epoch": 1.0187319884726225, "grad_norm": 0.6346455641375652, "learning_rate": 8.35174027016981e-06, "loss": 0.0454, "step": 2121 }, { "epoch": 1.0192122958693564, "grad_norm": 0.35110315273106346, "learning_rate": 8.349666086539028e-06, "loss": 0.0378, "step": 2122 }, { "epoch": 1.0196926032660902, "grad_norm": 0.4958901982981373, "learning_rate": 8.347590856565231e-06, "loss": 0.0428, "step": 2123 }, { "epoch": 1.0201729106628241, "grad_norm": 0.2965518425111321, "learning_rate": 8.345514580896664e-06, "loss": 0.0208, "step": 2124 }, { "epoch": 1.0206532180595582, "grad_norm": 0.24024911900103266, "learning_rate": 8.343437260181894e-06, "loss": 0.0189, "step": 2125 }, { "epoch": 1.021133525456292, "grad_norm": 0.43585792116795424, "learning_rate": 8.341358895069822e-06, "loss": 0.03, "step": 2126 }, { "epoch": 1.021613832853026, "grad_norm": 0.3541063873149519, "learning_rate": 8.339279486209672e-06, "loss": 0.0298, "step": 2127 }, { "epoch": 1.0220941402497599, "grad_norm": 0.5646097619353703, "learning_rate": 8.337199034250993e-06, "loss": 0.0468, "step": 2128 }, { "epoch": 1.0225744476464937, "grad_norm": 0.30478399666815387, "learning_rate": 8.33511753984366e-06, "loss": 0.0309, "step": 2129 }, { "epoch": 1.0230547550432276, "grad_norm": 0.34047376790714134, "learning_rate": 8.333035003637873e-06, "loss": 0.0268, "step": 2130 }, { "epoch": 1.0235350624399615, "grad_norm": 0.6109880287709962, "learning_rate": 8.330951426284164e-06, "loss": 0.0359, "step": 2131 }, { "epoch": 1.0240153698366954, "grad_norm": 0.3918414584720811, "learning_rate": 8.328866808433378e-06, "loss": 0.039, "step": 2132 }, { "epoch": 1.0244956772334295, "grad_norm": 0.5477349300929547, "learning_rate": 8.326781150736697e-06, "loss": 0.0283, "step": 2133 }, { "epoch": 1.0249759846301634, "grad_norm": 0.602430614292558, "learning_rate": 8.32469445384562e-06, "loss": 0.043, "step": 2134 }, { "epoch": 1.0254562920268973, "grad_norm": 0.4159469438712356, "learning_rate": 8.322606718411977e-06, "loss": 0.0384, "step": 2135 }, { "epoch": 1.0259365994236311, "grad_norm": 0.4235007098405889, "learning_rate": 8.320517945087913e-06, "loss": 0.035, "step": 2136 }, { "epoch": 1.026416906820365, "grad_norm": 0.3901174838995036, "learning_rate": 8.318428134525906e-06, "loss": 0.0473, "step": 2137 }, { "epoch": 1.026897214217099, "grad_norm": 0.4637158431357815, "learning_rate": 8.316337287378758e-06, "loss": 0.0508, "step": 2138 }, { "epoch": 1.0273775216138328, "grad_norm": 0.3338739662723301, "learning_rate": 8.314245404299589e-06, "loss": 0.0281, "step": 2139 }, { "epoch": 1.0278578290105667, "grad_norm": 0.412598979478152, "learning_rate": 8.312152485941845e-06, "loss": 0.0434, "step": 2140 }, { "epoch": 1.0283381364073008, "grad_norm": 0.45821858639878926, "learning_rate": 8.310058532959294e-06, "loss": 0.0363, "step": 2141 }, { "epoch": 1.0288184438040346, "grad_norm": 0.49690152789074443, "learning_rate": 8.307963546006033e-06, "loss": 0.0415, "step": 2142 }, { "epoch": 1.0292987512007685, "grad_norm": 0.6056409820766949, "learning_rate": 8.305867525736475e-06, "loss": 0.0454, "step": 2143 }, { "epoch": 1.0297790585975024, "grad_norm": 0.38151335100051437, "learning_rate": 8.303770472805361e-06, "loss": 0.0335, "step": 2144 }, { "epoch": 1.0302593659942363, "grad_norm": 0.3369970406156277, "learning_rate": 8.301672387867753e-06, "loss": 0.0277, "step": 2145 }, { "epoch": 1.0307396733909702, "grad_norm": 0.4328777418555136, "learning_rate": 8.29957327157903e-06, "loss": 0.0432, "step": 2146 }, { "epoch": 1.031219980787704, "grad_norm": 0.5685608071333996, "learning_rate": 8.297473124594902e-06, "loss": 0.0439, "step": 2147 }, { "epoch": 1.031700288184438, "grad_norm": 0.6687673446042472, "learning_rate": 8.295371947571393e-06, "loss": 0.0514, "step": 2148 }, { "epoch": 1.032180595581172, "grad_norm": 0.42898125948760024, "learning_rate": 8.293269741164858e-06, "loss": 0.0367, "step": 2149 }, { "epoch": 1.032660902977906, "grad_norm": 0.46281669516109053, "learning_rate": 8.291166506031965e-06, "loss": 0.0428, "step": 2150 }, { "epoch": 1.0331412103746398, "grad_norm": 0.47057934138454016, "learning_rate": 8.289062242829707e-06, "loss": 0.0333, "step": 2151 }, { "epoch": 1.0336215177713737, "grad_norm": 1.0985484754690809, "learning_rate": 8.286956952215396e-06, "loss": 0.0792, "step": 2152 }, { "epoch": 1.0341018251681076, "grad_norm": 0.5234639884643008, "learning_rate": 8.28485063484667e-06, "loss": 0.0455, "step": 2153 }, { "epoch": 1.0345821325648414, "grad_norm": 0.27656491631847174, "learning_rate": 8.28274329138148e-06, "loss": 0.0207, "step": 2154 }, { "epoch": 1.0350624399615753, "grad_norm": 0.4251628743618885, "learning_rate": 8.280634922478105e-06, "loss": 0.0256, "step": 2155 }, { "epoch": 1.0355427473583094, "grad_norm": 0.638688999019299, "learning_rate": 8.278525528795141e-06, "loss": 0.0363, "step": 2156 }, { "epoch": 1.0360230547550433, "grad_norm": 0.4549983532220588, "learning_rate": 8.276415110991503e-06, "loss": 0.0461, "step": 2157 }, { "epoch": 1.0365033621517772, "grad_norm": 0.6473154326582508, "learning_rate": 8.274303669726427e-06, "loss": 0.0548, "step": 2158 }, { "epoch": 1.036983669548511, "grad_norm": 0.4149993757940453, "learning_rate": 8.272191205659468e-06, "loss": 0.0315, "step": 2159 }, { "epoch": 1.037463976945245, "grad_norm": 0.4226638333902966, "learning_rate": 8.270077719450505e-06, "loss": 0.0383, "step": 2160 }, { "epoch": 1.0379442843419788, "grad_norm": 0.5513679201220233, "learning_rate": 8.26796321175973e-06, "loss": 0.0408, "step": 2161 }, { "epoch": 1.0384245917387127, "grad_norm": 0.3314420623308529, "learning_rate": 8.265847683247655e-06, "loss": 0.0322, "step": 2162 }, { "epoch": 1.0389048991354466, "grad_norm": 0.3717481246577918, "learning_rate": 8.263731134575114e-06, "loss": 0.0229, "step": 2163 }, { "epoch": 1.0393852065321807, "grad_norm": 0.4127819803214742, "learning_rate": 8.261613566403262e-06, "loss": 0.0361, "step": 2164 }, { "epoch": 1.0398655139289146, "grad_norm": 0.4225272713614735, "learning_rate": 8.259494979393563e-06, "loss": 0.0301, "step": 2165 }, { "epoch": 1.0403458213256485, "grad_norm": 0.5520262879661866, "learning_rate": 8.257375374207807e-06, "loss": 0.0414, "step": 2166 }, { "epoch": 1.0408261287223823, "grad_norm": 0.3933923430253226, "learning_rate": 8.255254751508103e-06, "loss": 0.0306, "step": 2167 }, { "epoch": 1.0413064361191162, "grad_norm": 0.5158023316395097, "learning_rate": 8.253133111956872e-06, "loss": 0.0355, "step": 2168 }, { "epoch": 1.04178674351585, "grad_norm": 0.4170919178033847, "learning_rate": 8.251010456216857e-06, "loss": 0.0294, "step": 2169 }, { "epoch": 1.042267050912584, "grad_norm": 0.39783617943091165, "learning_rate": 8.248886784951112e-06, "loss": 0.0297, "step": 2170 }, { "epoch": 1.0427473583093179, "grad_norm": 0.427097413262639, "learning_rate": 8.246762098823019e-06, "loss": 0.0312, "step": 2171 }, { "epoch": 1.043227665706052, "grad_norm": 0.5619268404552017, "learning_rate": 8.244636398496268e-06, "loss": 0.0499, "step": 2172 }, { "epoch": 1.0437079731027858, "grad_norm": 0.2640287016884147, "learning_rate": 8.242509684634873e-06, "loss": 0.0204, "step": 2173 }, { "epoch": 1.0441882804995197, "grad_norm": 0.3619744616574935, "learning_rate": 8.240381957903154e-06, "loss": 0.0309, "step": 2174 }, { "epoch": 1.0446685878962536, "grad_norm": 0.35340089829597376, "learning_rate": 8.23825321896576e-06, "loss": 0.028, "step": 2175 }, { "epoch": 1.0451488952929875, "grad_norm": 0.3781337359865754, "learning_rate": 8.236123468487649e-06, "loss": 0.0306, "step": 2176 }, { "epoch": 1.0456292026897214, "grad_norm": 0.3477884233513952, "learning_rate": 8.233992707134091e-06, "loss": 0.0325, "step": 2177 }, { "epoch": 1.0461095100864553, "grad_norm": 0.32850028069857407, "learning_rate": 8.231860935570684e-06, "loss": 0.0445, "step": 2178 }, { "epoch": 1.0465898174831891, "grad_norm": 0.3919882986715127, "learning_rate": 8.229728154463331e-06, "loss": 0.0373, "step": 2179 }, { "epoch": 1.0470701248799232, "grad_norm": 0.37809179335392684, "learning_rate": 8.227594364478253e-06, "loss": 0.0428, "step": 2180 }, { "epoch": 1.0475504322766571, "grad_norm": 0.48640219553150793, "learning_rate": 8.225459566281989e-06, "loss": 0.0361, "step": 2181 }, { "epoch": 1.048030739673391, "grad_norm": 0.5046070555986016, "learning_rate": 8.223323760541387e-06, "loss": 0.0438, "step": 2182 }, { "epoch": 1.0485110470701249, "grad_norm": 0.58074850566737, "learning_rate": 8.22118694792362e-06, "loss": 0.0471, "step": 2183 }, { "epoch": 1.0489913544668588, "grad_norm": 0.4822200002768439, "learning_rate": 8.219049129096164e-06, "loss": 0.0392, "step": 2184 }, { "epoch": 1.0494716618635926, "grad_norm": 0.4078304004114354, "learning_rate": 8.216910304726816e-06, "loss": 0.0397, "step": 2185 }, { "epoch": 1.0499519692603265, "grad_norm": 0.41160297302430937, "learning_rate": 8.214770475483686e-06, "loss": 0.0321, "step": 2186 }, { "epoch": 1.0504322766570606, "grad_norm": 0.2947831735926439, "learning_rate": 8.2126296420352e-06, "loss": 0.0317, "step": 2187 }, { "epoch": 1.0509125840537945, "grad_norm": 0.29144146800532367, "learning_rate": 8.210487805050088e-06, "loss": 0.0322, "step": 2188 }, { "epoch": 1.0513928914505284, "grad_norm": 0.3883723056342047, "learning_rate": 8.208344965197407e-06, "loss": 0.0357, "step": 2189 }, { "epoch": 1.0518731988472623, "grad_norm": 0.47475618894652727, "learning_rate": 8.20620112314652e-06, "loss": 0.0334, "step": 2190 }, { "epoch": 1.0523535062439962, "grad_norm": 0.5043873973633535, "learning_rate": 8.204056279567102e-06, "loss": 0.0483, "step": 2191 }, { "epoch": 1.05283381364073, "grad_norm": 0.4665013809367916, "learning_rate": 8.20191043512914e-06, "loss": 0.0427, "step": 2192 }, { "epoch": 1.053314121037464, "grad_norm": 0.29745486705681384, "learning_rate": 8.199763590502945e-06, "loss": 0.021, "step": 2193 }, { "epoch": 1.0537944284341978, "grad_norm": 0.6143560259293539, "learning_rate": 8.197615746359124e-06, "loss": 0.0306, "step": 2194 }, { "epoch": 1.054274735830932, "grad_norm": 0.34102183083013227, "learning_rate": 8.195466903368609e-06, "loss": 0.0277, "step": 2195 }, { "epoch": 1.0547550432276658, "grad_norm": 0.5182485986979969, "learning_rate": 8.193317062202635e-06, "loss": 0.047, "step": 2196 }, { "epoch": 1.0552353506243997, "grad_norm": 0.5819512539438135, "learning_rate": 8.191166223532756e-06, "loss": 0.0378, "step": 2197 }, { "epoch": 1.0557156580211335, "grad_norm": 0.42481809406539833, "learning_rate": 8.189014388030834e-06, "loss": 0.0297, "step": 2198 }, { "epoch": 1.0561959654178674, "grad_norm": 0.2779317653540544, "learning_rate": 8.18686155636904e-06, "loss": 0.0294, "step": 2199 }, { "epoch": 1.0566762728146013, "grad_norm": 0.4222868602915256, "learning_rate": 8.184707729219865e-06, "loss": 0.0407, "step": 2200 }, { "epoch": 1.0571565802113352, "grad_norm": 0.31542892149260526, "learning_rate": 8.1825529072561e-06, "loss": 0.0315, "step": 2201 }, { "epoch": 1.057636887608069, "grad_norm": 0.32726087919680547, "learning_rate": 8.180397091150853e-06, "loss": 0.0351, "step": 2202 }, { "epoch": 1.0581171950048032, "grad_norm": 0.3211728938374578, "learning_rate": 8.178240281577542e-06, "loss": 0.0301, "step": 2203 }, { "epoch": 1.058597502401537, "grad_norm": 0.5808023052416281, "learning_rate": 8.176082479209893e-06, "loss": 0.0302, "step": 2204 }, { "epoch": 1.059077809798271, "grad_norm": 0.34868082893878943, "learning_rate": 8.173923684721945e-06, "loss": 0.0334, "step": 2205 }, { "epoch": 1.0595581171950048, "grad_norm": 0.3423105715083267, "learning_rate": 8.171763898788045e-06, "loss": 0.0242, "step": 2206 }, { "epoch": 1.0600384245917387, "grad_norm": 0.37156177964238457, "learning_rate": 8.169603122082852e-06, "loss": 0.0324, "step": 2207 }, { "epoch": 1.0605187319884726, "grad_norm": 0.3578686179526663, "learning_rate": 8.167441355281332e-06, "loss": 0.0237, "step": 2208 }, { "epoch": 1.0609990393852065, "grad_norm": 0.4520619797776136, "learning_rate": 8.16527859905876e-06, "loss": 0.0409, "step": 2209 }, { "epoch": 1.0614793467819403, "grad_norm": 0.30190193413578303, "learning_rate": 8.163114854090724e-06, "loss": 0.0217, "step": 2210 }, { "epoch": 1.0619596541786744, "grad_norm": 0.43131875274041354, "learning_rate": 8.160950121053114e-06, "loss": 0.0304, "step": 2211 }, { "epoch": 1.0624399615754083, "grad_norm": 0.38601446435697534, "learning_rate": 8.158784400622135e-06, "loss": 0.0321, "step": 2212 }, { "epoch": 1.0629202689721422, "grad_norm": 0.4596019935836858, "learning_rate": 8.156617693474301e-06, "loss": 0.0395, "step": 2213 }, { "epoch": 1.063400576368876, "grad_norm": 0.37655855003588423, "learning_rate": 8.154450000286425e-06, "loss": 0.0361, "step": 2214 }, { "epoch": 1.06388088376561, "grad_norm": 0.45845627490103313, "learning_rate": 8.15228132173564e-06, "loss": 0.0442, "step": 2215 }, { "epoch": 1.0643611911623438, "grad_norm": 0.4086550863179147, "learning_rate": 8.150111658499378e-06, "loss": 0.0422, "step": 2216 }, { "epoch": 1.0648414985590777, "grad_norm": 0.47493775886593015, "learning_rate": 8.147941011255385e-06, "loss": 0.035, "step": 2217 }, { "epoch": 1.0653218059558118, "grad_norm": 0.5449375651330198, "learning_rate": 8.145769380681707e-06, "loss": 0.0522, "step": 2218 }, { "epoch": 1.0658021133525457, "grad_norm": 0.3088899364401438, "learning_rate": 8.143596767456702e-06, "loss": 0.037, "step": 2219 }, { "epoch": 1.0662824207492796, "grad_norm": 0.32446796982737464, "learning_rate": 8.141423172259038e-06, "loss": 0.0298, "step": 2220 }, { "epoch": 1.0667627281460135, "grad_norm": 0.3836903958802041, "learning_rate": 8.139248595767682e-06, "loss": 0.0365, "step": 2221 }, { "epoch": 1.0672430355427474, "grad_norm": 0.520922372024731, "learning_rate": 8.137073038661914e-06, "loss": 0.04, "step": 2222 }, { "epoch": 1.0677233429394812, "grad_norm": 0.48766063222264555, "learning_rate": 8.134896501621317e-06, "loss": 0.0347, "step": 2223 }, { "epoch": 1.0682036503362151, "grad_norm": 0.4766525658868032, "learning_rate": 8.13271898532578e-06, "loss": 0.0441, "step": 2224 }, { "epoch": 1.068683957732949, "grad_norm": 0.3221128109694835, "learning_rate": 8.1305404904555e-06, "loss": 0.0211, "step": 2225 }, { "epoch": 1.069164265129683, "grad_norm": 0.3755154907722652, "learning_rate": 8.128361017690978e-06, "loss": 0.0393, "step": 2226 }, { "epoch": 1.069644572526417, "grad_norm": 0.5453412672935776, "learning_rate": 8.126180567713022e-06, "loss": 0.0423, "step": 2227 }, { "epoch": 1.0701248799231509, "grad_norm": 0.24228351531895745, "learning_rate": 8.123999141202743e-06, "loss": 0.0174, "step": 2228 }, { "epoch": 1.0706051873198847, "grad_norm": 0.3994781652728111, "learning_rate": 8.121816738841559e-06, "loss": 0.0345, "step": 2229 }, { "epoch": 1.0710854947166186, "grad_norm": 0.40024502159348485, "learning_rate": 8.119633361311192e-06, "loss": 0.0314, "step": 2230 }, { "epoch": 1.0715658021133525, "grad_norm": 0.32235536588042024, "learning_rate": 8.117449009293668e-06, "loss": 0.0244, "step": 2231 }, { "epoch": 1.0720461095100864, "grad_norm": 0.5220604273580561, "learning_rate": 8.11526368347132e-06, "loss": 0.0432, "step": 2232 }, { "epoch": 1.0725264169068203, "grad_norm": 0.5155635343628655, "learning_rate": 8.113077384526782e-06, "loss": 0.0308, "step": 2233 }, { "epoch": 1.0730067243035544, "grad_norm": 0.42755017336819257, "learning_rate": 8.110890113142993e-06, "loss": 0.0382, "step": 2234 }, { "epoch": 1.0734870317002883, "grad_norm": 0.4849667790301038, "learning_rate": 8.108701870003195e-06, "loss": 0.0324, "step": 2235 }, { "epoch": 1.0739673390970221, "grad_norm": 0.3641325674240868, "learning_rate": 8.106512655790937e-06, "loss": 0.0279, "step": 2236 }, { "epoch": 1.074447646493756, "grad_norm": 1.161998559638778, "learning_rate": 8.10432247119007e-06, "loss": 0.0321, "step": 2237 }, { "epoch": 1.07492795389049, "grad_norm": 0.3770143763346995, "learning_rate": 8.102131316884742e-06, "loss": 0.0282, "step": 2238 }, { "epoch": 1.0754082612872238, "grad_norm": 1.0941124923225543, "learning_rate": 8.099939193559416e-06, "loss": 0.0282, "step": 2239 }, { "epoch": 1.0758885686839577, "grad_norm": 0.4061255490476273, "learning_rate": 8.097746101898844e-06, "loss": 0.0334, "step": 2240 }, { "epoch": 1.0763688760806915, "grad_norm": 0.4079742485433284, "learning_rate": 8.095552042588091e-06, "loss": 0.0344, "step": 2241 }, { "epoch": 1.0768491834774256, "grad_norm": 0.4219136908707164, "learning_rate": 8.093357016312518e-06, "loss": 0.0386, "step": 2242 }, { "epoch": 1.0773294908741595, "grad_norm": 0.5186414712281636, "learning_rate": 8.091161023757792e-06, "loss": 0.0418, "step": 2243 }, { "epoch": 1.0778097982708934, "grad_norm": 0.9256604051736174, "learning_rate": 8.088964065609881e-06, "loss": 0.0269, "step": 2244 }, { "epoch": 1.0782901056676273, "grad_norm": 0.5085831132188349, "learning_rate": 8.086766142555054e-06, "loss": 0.0278, "step": 2245 }, { "epoch": 1.0787704130643612, "grad_norm": 0.6983484405632742, "learning_rate": 8.08456725527988e-06, "loss": 0.0524, "step": 2246 }, { "epoch": 1.079250720461095, "grad_norm": 0.5384875060604798, "learning_rate": 8.08236740447123e-06, "loss": 0.038, "step": 2247 }, { "epoch": 1.079731027857829, "grad_norm": 0.45102326834264717, "learning_rate": 8.080166590816278e-06, "loss": 0.0306, "step": 2248 }, { "epoch": 1.080211335254563, "grad_norm": 0.6150742252142519, "learning_rate": 8.077964815002497e-06, "loss": 0.0453, "step": 2249 }, { "epoch": 1.080691642651297, "grad_norm": 0.7521374737324324, "learning_rate": 8.075762077717662e-06, "loss": 0.0336, "step": 2250 }, { "epoch": 1.0811719500480308, "grad_norm": 0.500831526567554, "learning_rate": 8.073558379649845e-06, "loss": 0.0445, "step": 2251 }, { "epoch": 1.0816522574447647, "grad_norm": 0.4399577310772139, "learning_rate": 8.071353721487421e-06, "loss": 0.0367, "step": 2252 }, { "epoch": 1.0821325648414986, "grad_norm": 0.6064759907115551, "learning_rate": 8.069148103919064e-06, "loss": 0.0474, "step": 2253 }, { "epoch": 1.0826128722382324, "grad_norm": 0.43261608322137696, "learning_rate": 8.06694152763375e-06, "loss": 0.0309, "step": 2254 }, { "epoch": 1.0830931796349663, "grad_norm": 0.34544250495307816, "learning_rate": 8.064733993320751e-06, "loss": 0.0311, "step": 2255 }, { "epoch": 1.0835734870317002, "grad_norm": 0.41663658460388087, "learning_rate": 8.062525501669638e-06, "loss": 0.0257, "step": 2256 }, { "epoch": 1.084053794428434, "grad_norm": 0.8570052725985697, "learning_rate": 8.060316053370282e-06, "loss": 0.0442, "step": 2257 }, { "epoch": 1.0845341018251682, "grad_norm": 0.6830595804257297, "learning_rate": 8.058105649112859e-06, "loss": 0.0513, "step": 2258 }, { "epoch": 1.085014409221902, "grad_norm": 0.605543687102496, "learning_rate": 8.055894289587833e-06, "loss": 0.0472, "step": 2259 }, { "epoch": 1.085494716618636, "grad_norm": 0.3218080458907461, "learning_rate": 8.053681975485973e-06, "loss": 0.0358, "step": 2260 }, { "epoch": 1.0859750240153698, "grad_norm": 0.33207095453314117, "learning_rate": 8.051468707498348e-06, "loss": 0.0256, "step": 2261 }, { "epoch": 1.0864553314121037, "grad_norm": 0.558221044914241, "learning_rate": 8.049254486316316e-06, "loss": 0.0246, "step": 2262 }, { "epoch": 1.0869356388088376, "grad_norm": 0.4701763944281842, "learning_rate": 8.047039312631542e-06, "loss": 0.0283, "step": 2263 }, { "epoch": 1.0874159462055715, "grad_norm": 0.6589241295805947, "learning_rate": 8.044823187135984e-06, "loss": 0.0437, "step": 2264 }, { "epoch": 1.0878962536023056, "grad_norm": 0.4612982612804883, "learning_rate": 8.042606110521897e-06, "loss": 0.0365, "step": 2265 }, { "epoch": 1.0883765609990395, "grad_norm": 0.5972014572574613, "learning_rate": 8.040388083481838e-06, "loss": 0.0484, "step": 2266 }, { "epoch": 1.0888568683957733, "grad_norm": 0.5678773407692689, "learning_rate": 8.038169106708656e-06, "loss": 0.0346, "step": 2267 }, { "epoch": 1.0893371757925072, "grad_norm": 0.4375411388102669, "learning_rate": 8.035949180895495e-06, "loss": 0.026, "step": 2268 }, { "epoch": 1.089817483189241, "grad_norm": 0.43856494522390793, "learning_rate": 8.0337283067358e-06, "loss": 0.0252, "step": 2269 }, { "epoch": 1.090297790585975, "grad_norm": 0.5326148477250409, "learning_rate": 8.031506484923312e-06, "loss": 0.0336, "step": 2270 }, { "epoch": 1.0907780979827089, "grad_norm": 0.505624538892781, "learning_rate": 8.029283716152065e-06, "loss": 0.0401, "step": 2271 }, { "epoch": 1.0912584053794427, "grad_norm": 0.78546233963862, "learning_rate": 8.027060001116393e-06, "loss": 0.0551, "step": 2272 }, { "epoch": 1.0917387127761768, "grad_norm": 0.34328643301065775, "learning_rate": 8.02483534051092e-06, "loss": 0.0301, "step": 2273 }, { "epoch": 1.0922190201729107, "grad_norm": 0.30241459573674484, "learning_rate": 8.02260973503057e-06, "loss": 0.0224, "step": 2274 }, { "epoch": 1.0926993275696446, "grad_norm": 0.5272917882242916, "learning_rate": 8.020383185370559e-06, "loss": 0.0381, "step": 2275 }, { "epoch": 1.0931796349663785, "grad_norm": 0.3626572283876546, "learning_rate": 8.018155692226403e-06, "loss": 0.03, "step": 2276 }, { "epoch": 1.0936599423631124, "grad_norm": 0.4121981651828316, "learning_rate": 8.015927256293906e-06, "loss": 0.028, "step": 2277 }, { "epoch": 1.0941402497598463, "grad_norm": 0.5109205352364851, "learning_rate": 8.01369787826917e-06, "loss": 0.0281, "step": 2278 }, { "epoch": 1.0946205571565801, "grad_norm": 0.44072750736049787, "learning_rate": 8.011467558848594e-06, "loss": 0.0261, "step": 2279 }, { "epoch": 1.0951008645533142, "grad_norm": 0.38429670441283004, "learning_rate": 8.009236298728864e-06, "loss": 0.0287, "step": 2280 }, { "epoch": 1.0955811719500481, "grad_norm": 0.333548569621216, "learning_rate": 8.007004098606967e-06, "loss": 0.0233, "step": 2281 }, { "epoch": 1.096061479346782, "grad_norm": 0.5217629458993167, "learning_rate": 8.004770959180179e-06, "loss": 0.0529, "step": 2282 }, { "epoch": 1.0965417867435159, "grad_norm": 0.4886825391475661, "learning_rate": 8.00253688114607e-06, "loss": 0.0423, "step": 2283 }, { "epoch": 1.0970220941402498, "grad_norm": 0.3889650652886195, "learning_rate": 8.000301865202507e-06, "loss": 0.0297, "step": 2284 }, { "epoch": 1.0975024015369836, "grad_norm": 0.4255838607337438, "learning_rate": 7.998065912047645e-06, "loss": 0.0351, "step": 2285 }, { "epoch": 1.0979827089337175, "grad_norm": 0.3468387046634431, "learning_rate": 7.995829022379937e-06, "loss": 0.0331, "step": 2286 }, { "epoch": 1.0984630163304514, "grad_norm": 0.541388456477309, "learning_rate": 7.993591196898119e-06, "loss": 0.0333, "step": 2287 }, { "epoch": 1.0989433237271853, "grad_norm": 0.3640246957604723, "learning_rate": 7.991352436301233e-06, "loss": 0.0299, "step": 2288 }, { "epoch": 1.0994236311239194, "grad_norm": 0.39031296923135117, "learning_rate": 7.989112741288601e-06, "loss": 0.0336, "step": 2289 }, { "epoch": 1.0999039385206533, "grad_norm": 0.6614301082931587, "learning_rate": 7.986872112559845e-06, "loss": 0.0353, "step": 2290 }, { "epoch": 1.1003842459173871, "grad_norm": 0.36989736322957784, "learning_rate": 7.984630550814872e-06, "loss": 0.0258, "step": 2291 }, { "epoch": 1.100864553314121, "grad_norm": 0.45370608998514333, "learning_rate": 7.982388056753886e-06, "loss": 0.0428, "step": 2292 }, { "epoch": 1.101344860710855, "grad_norm": 0.363610968555276, "learning_rate": 7.980144631077383e-06, "loss": 0.0279, "step": 2293 }, { "epoch": 1.1018251681075888, "grad_norm": 0.3824128619184399, "learning_rate": 7.977900274486141e-06, "loss": 0.0324, "step": 2294 }, { "epoch": 1.1023054755043227, "grad_norm": 0.8080081303764828, "learning_rate": 7.975654987681236e-06, "loss": 0.0397, "step": 2295 }, { "epoch": 1.1027857829010568, "grad_norm": 0.3688010630311016, "learning_rate": 7.973408771364039e-06, "loss": 0.0332, "step": 2296 }, { "epoch": 1.1032660902977907, "grad_norm": 0.43206250551112707, "learning_rate": 7.9711616262362e-06, "loss": 0.0336, "step": 2297 }, { "epoch": 1.1037463976945245, "grad_norm": 0.4580561104578234, "learning_rate": 7.96891355299967e-06, "loss": 0.0328, "step": 2298 }, { "epoch": 1.1042267050912584, "grad_norm": 0.6252831477527122, "learning_rate": 7.96666455235668e-06, "loss": 0.0408, "step": 2299 }, { "epoch": 1.1047070124879923, "grad_norm": 0.33348355854177336, "learning_rate": 7.964414625009758e-06, "loss": 0.0268, "step": 2300 }, { "epoch": 1.1051873198847262, "grad_norm": 0.5533479401081703, "learning_rate": 7.962163771661718e-06, "loss": 0.03, "step": 2301 }, { "epoch": 1.10566762728146, "grad_norm": 0.36577074600957327, "learning_rate": 7.959911993015665e-06, "loss": 0.0276, "step": 2302 }, { "epoch": 1.106147934678194, "grad_norm": 0.4316472606715181, "learning_rate": 7.957659289774992e-06, "loss": 0.0395, "step": 2303 }, { "epoch": 1.106628242074928, "grad_norm": 0.3800380203785241, "learning_rate": 7.955405662643384e-06, "loss": 0.0314, "step": 2304 }, { "epoch": 1.107108549471662, "grad_norm": 0.4843606581996306, "learning_rate": 7.953151112324807e-06, "loss": 0.0434, "step": 2305 }, { "epoch": 1.1075888568683958, "grad_norm": 0.3565081004990375, "learning_rate": 7.950895639523524e-06, "loss": 0.034, "step": 2306 }, { "epoch": 1.1080691642651297, "grad_norm": 0.5188756888870497, "learning_rate": 7.948639244944078e-06, "loss": 0.0453, "step": 2307 }, { "epoch": 1.1085494716618636, "grad_norm": 0.4080267827192745, "learning_rate": 7.94638192929131e-06, "loss": 0.0275, "step": 2308 }, { "epoch": 1.1090297790585975, "grad_norm": 0.3574840729538061, "learning_rate": 7.944123693270338e-06, "loss": 0.0292, "step": 2309 }, { "epoch": 1.1095100864553313, "grad_norm": 0.6187360776084617, "learning_rate": 7.941864537586576e-06, "loss": 0.0394, "step": 2310 }, { "epoch": 1.1099903938520654, "grad_norm": 0.3433745678372005, "learning_rate": 7.93960446294572e-06, "loss": 0.0222, "step": 2311 }, { "epoch": 1.1104707012487993, "grad_norm": 0.3835800795714836, "learning_rate": 7.937343470053753e-06, "loss": 0.0395, "step": 2312 }, { "epoch": 1.1109510086455332, "grad_norm": 0.43988773736126635, "learning_rate": 7.935081559616951e-06, "loss": 0.0318, "step": 2313 }, { "epoch": 1.111431316042267, "grad_norm": 0.9000652881059054, "learning_rate": 7.932818732341868e-06, "loss": 0.0356, "step": 2314 }, { "epoch": 1.111911623439001, "grad_norm": 0.4395385787136166, "learning_rate": 7.930554988935351e-06, "loss": 0.0337, "step": 2315 }, { "epoch": 1.1123919308357348, "grad_norm": 0.3318835066071892, "learning_rate": 7.928290330104531e-06, "loss": 0.0243, "step": 2316 }, { "epoch": 1.1128722382324687, "grad_norm": 0.4547390514198411, "learning_rate": 7.926024756556822e-06, "loss": 0.0414, "step": 2317 }, { "epoch": 1.1133525456292026, "grad_norm": 0.5289233449092288, "learning_rate": 7.923758268999931e-06, "loss": 0.0356, "step": 2318 }, { "epoch": 1.1138328530259365, "grad_norm": 0.43381512380484083, "learning_rate": 7.921490868141843e-06, "loss": 0.0351, "step": 2319 }, { "epoch": 1.1143131604226706, "grad_norm": 0.39487491965822513, "learning_rate": 7.91922255469083e-06, "loss": 0.0232, "step": 2320 }, { "epoch": 1.1147934678194045, "grad_norm": 0.3337376828357569, "learning_rate": 7.916953329355455e-06, "loss": 0.0275, "step": 2321 }, { "epoch": 1.1152737752161384, "grad_norm": 0.6862646175337912, "learning_rate": 7.914683192844556e-06, "loss": 0.0307, "step": 2322 }, { "epoch": 1.1157540826128722, "grad_norm": 0.32532119228091205, "learning_rate": 7.912412145867266e-06, "loss": 0.0263, "step": 2323 }, { "epoch": 1.1162343900096061, "grad_norm": 0.3535253178962557, "learning_rate": 7.91014018913299e-06, "loss": 0.0367, "step": 2324 }, { "epoch": 1.11671469740634, "grad_norm": 0.42627737377647384, "learning_rate": 7.907867323351433e-06, "loss": 0.0369, "step": 2325 }, { "epoch": 1.1171950048030739, "grad_norm": 0.4165428516762115, "learning_rate": 7.90559354923257e-06, "loss": 0.0395, "step": 2326 }, { "epoch": 1.117675312199808, "grad_norm": 0.5341520310125699, "learning_rate": 7.903318867486667e-06, "loss": 0.0429, "step": 2327 }, { "epoch": 1.1181556195965419, "grad_norm": 0.36017266938079623, "learning_rate": 7.90104327882427e-06, "loss": 0.0272, "step": 2328 }, { "epoch": 1.1186359269932757, "grad_norm": 0.5473539207357297, "learning_rate": 7.898766783956213e-06, "loss": 0.0416, "step": 2329 }, { "epoch": 1.1191162343900096, "grad_norm": 0.32433698907988967, "learning_rate": 7.896489383593606e-06, "loss": 0.0239, "step": 2330 }, { "epoch": 1.1195965417867435, "grad_norm": 0.3929213640534066, "learning_rate": 7.89421107844785e-06, "loss": 0.0323, "step": 2331 }, { "epoch": 1.1200768491834774, "grad_norm": 0.36661532024406557, "learning_rate": 7.891931869230621e-06, "loss": 0.0422, "step": 2332 }, { "epoch": 1.1205571565802113, "grad_norm": 0.2867953144217533, "learning_rate": 7.889651756653882e-06, "loss": 0.031, "step": 2333 }, { "epoch": 1.1210374639769451, "grad_norm": 0.3022836940397149, "learning_rate": 7.887370741429877e-06, "loss": 0.0268, "step": 2334 }, { "epoch": 1.1215177713736793, "grad_norm": 0.27364041653472126, "learning_rate": 7.885088824271133e-06, "loss": 0.0225, "step": 2335 }, { "epoch": 1.1219980787704131, "grad_norm": 0.46297016498534943, "learning_rate": 7.882806005890458e-06, "loss": 0.0419, "step": 2336 }, { "epoch": 1.122478386167147, "grad_norm": 0.373688053566492, "learning_rate": 7.88052228700094e-06, "loss": 0.0335, "step": 2337 }, { "epoch": 1.122958693563881, "grad_norm": 0.35958762118698195, "learning_rate": 7.878237668315951e-06, "loss": 0.0295, "step": 2338 }, { "epoch": 1.1234390009606148, "grad_norm": 0.4357983966007902, "learning_rate": 7.87595215054914e-06, "loss": 0.0313, "step": 2339 }, { "epoch": 1.1239193083573487, "grad_norm": 0.3967179688427042, "learning_rate": 7.873665734414445e-06, "loss": 0.0336, "step": 2340 }, { "epoch": 1.1243996157540825, "grad_norm": 0.7445667642848869, "learning_rate": 7.871378420626072e-06, "loss": 0.0597, "step": 2341 }, { "epoch": 1.1248799231508164, "grad_norm": 0.38955934397676056, "learning_rate": 7.869090209898518e-06, "loss": 0.0384, "step": 2342 }, { "epoch": 1.1253602305475505, "grad_norm": 0.345113774658829, "learning_rate": 7.866801102946558e-06, "loss": 0.0193, "step": 2343 }, { "epoch": 1.1258405379442844, "grad_norm": 0.3939234910432558, "learning_rate": 7.864511100485246e-06, "loss": 0.0392, "step": 2344 }, { "epoch": 1.1263208453410183, "grad_norm": 0.40275835553972805, "learning_rate": 7.862220203229911e-06, "loss": 0.0304, "step": 2345 }, { "epoch": 1.1268011527377522, "grad_norm": 0.34843217454647263, "learning_rate": 7.85992841189617e-06, "loss": 0.03, "step": 2346 }, { "epoch": 1.127281460134486, "grad_norm": 0.2890039598318389, "learning_rate": 7.857635727199915e-06, "loss": 0.0194, "step": 2347 }, { "epoch": 1.12776176753122, "grad_norm": 0.3981185658924181, "learning_rate": 7.855342149857315e-06, "loss": 0.0296, "step": 2348 }, { "epoch": 1.1282420749279538, "grad_norm": 0.3906510389359068, "learning_rate": 7.853047680584821e-06, "loss": 0.0274, "step": 2349 }, { "epoch": 1.1287223823246877, "grad_norm": 0.34744562989888017, "learning_rate": 7.850752320099165e-06, "loss": 0.0261, "step": 2350 }, { "epoch": 1.1292026897214218, "grad_norm": 0.5123711712409632, "learning_rate": 7.848456069117349e-06, "loss": 0.0516, "step": 2351 }, { "epoch": 1.1296829971181557, "grad_norm": 0.37561743206319553, "learning_rate": 7.84615892835666e-06, "loss": 0.0318, "step": 2352 }, { "epoch": 1.1301633045148896, "grad_norm": 0.38200270339855635, "learning_rate": 7.843860898534661e-06, "loss": 0.0288, "step": 2353 }, { "epoch": 1.1306436119116234, "grad_norm": 0.3338472930179495, "learning_rate": 7.841561980369197e-06, "loss": 0.0232, "step": 2354 }, { "epoch": 1.1311239193083573, "grad_norm": 0.3921217276946738, "learning_rate": 7.83926217457838e-06, "loss": 0.0286, "step": 2355 }, { "epoch": 1.1316042267050912, "grad_norm": 0.3180842439848397, "learning_rate": 7.836961481880611e-06, "loss": 0.0249, "step": 2356 }, { "epoch": 1.132084534101825, "grad_norm": 0.5596647300664854, "learning_rate": 7.83465990299456e-06, "loss": 0.0273, "step": 2357 }, { "epoch": 1.1325648414985592, "grad_norm": 0.36029319441170915, "learning_rate": 7.832357438639175e-06, "loss": 0.0288, "step": 2358 }, { "epoch": 1.133045148895293, "grad_norm": 0.4159033930545626, "learning_rate": 7.830054089533687e-06, "loss": 0.0293, "step": 2359 }, { "epoch": 1.133525456292027, "grad_norm": 0.4063642951749275, "learning_rate": 7.827749856397595e-06, "loss": 0.0411, "step": 2360 }, { "epoch": 1.1340057636887608, "grad_norm": 0.4090551001531113, "learning_rate": 7.825444739950678e-06, "loss": 0.0311, "step": 2361 }, { "epoch": 1.1344860710854947, "grad_norm": 0.4509197831911849, "learning_rate": 7.823138740912992e-06, "loss": 0.0279, "step": 2362 }, { "epoch": 1.1349663784822286, "grad_norm": 0.3613381995718407, "learning_rate": 7.820831860004867e-06, "loss": 0.0375, "step": 2363 }, { "epoch": 1.1354466858789625, "grad_norm": 0.41285343844138966, "learning_rate": 7.818524097946906e-06, "loss": 0.042, "step": 2364 }, { "epoch": 1.1359269932756964, "grad_norm": 0.3311546206310933, "learning_rate": 7.816215455459994e-06, "loss": 0.0273, "step": 2365 }, { "epoch": 1.1364073006724302, "grad_norm": 0.33850950761061577, "learning_rate": 7.813905933265284e-06, "loss": 0.0277, "step": 2366 }, { "epoch": 1.1368876080691643, "grad_norm": 0.3843644072695301, "learning_rate": 7.81159553208421e-06, "loss": 0.0287, "step": 2367 }, { "epoch": 1.1373679154658982, "grad_norm": 0.44336772396820107, "learning_rate": 7.809284252638474e-06, "loss": 0.0284, "step": 2368 }, { "epoch": 1.137848222862632, "grad_norm": 0.691014926166663, "learning_rate": 7.806972095650057e-06, "loss": 0.0391, "step": 2369 }, { "epoch": 1.138328530259366, "grad_norm": 0.4566750491145922, "learning_rate": 7.804659061841216e-06, "loss": 0.0418, "step": 2370 }, { "epoch": 1.1388088376560999, "grad_norm": 0.4434065595782408, "learning_rate": 7.802345151934473e-06, "loss": 0.0337, "step": 2371 }, { "epoch": 1.1392891450528337, "grad_norm": 0.41106146075874245, "learning_rate": 7.800030366652633e-06, "loss": 0.0305, "step": 2372 }, { "epoch": 1.1397694524495678, "grad_norm": 0.33346335800271754, "learning_rate": 7.79771470671877e-06, "loss": 0.0242, "step": 2373 }, { "epoch": 1.1402497598463017, "grad_norm": 0.4724726504496481, "learning_rate": 7.795398172856234e-06, "loss": 0.0347, "step": 2374 }, { "epoch": 1.1407300672430356, "grad_norm": 0.4154733387206092, "learning_rate": 7.79308076578864e-06, "loss": 0.0252, "step": 2375 }, { "epoch": 1.1412103746397695, "grad_norm": 0.5407866126060925, "learning_rate": 7.790762486239891e-06, "loss": 0.0354, "step": 2376 }, { "epoch": 1.1416906820365034, "grad_norm": 0.5507199392301321, "learning_rate": 7.788443334934148e-06, "loss": 0.0304, "step": 2377 }, { "epoch": 1.1421709894332372, "grad_norm": 0.630786976037147, "learning_rate": 7.78612331259585e-06, "loss": 0.0327, "step": 2378 }, { "epoch": 1.1426512968299711, "grad_norm": 0.49270545213242467, "learning_rate": 7.783802419949706e-06, "loss": 0.0398, "step": 2379 }, { "epoch": 1.143131604226705, "grad_norm": 0.3198651734297585, "learning_rate": 7.781480657720704e-06, "loss": 0.0245, "step": 2380 }, { "epoch": 1.143611911623439, "grad_norm": 0.4049722516955576, "learning_rate": 7.779158026634096e-06, "loss": 0.0345, "step": 2381 }, { "epoch": 1.144092219020173, "grad_norm": 0.36892885469981307, "learning_rate": 7.776834527415403e-06, "loss": 0.0249, "step": 2382 }, { "epoch": 1.1445725264169069, "grad_norm": 0.4674032046214744, "learning_rate": 7.774510160790427e-06, "loss": 0.0379, "step": 2383 }, { "epoch": 1.1450528338136408, "grad_norm": 0.5981601559426172, "learning_rate": 7.772184927485235e-06, "loss": 0.033, "step": 2384 }, { "epoch": 1.1455331412103746, "grad_norm": 0.37507719554486524, "learning_rate": 7.769858828226165e-06, "loss": 0.0249, "step": 2385 }, { "epoch": 1.1460134486071085, "grad_norm": 0.905029163141749, "learning_rate": 7.767531863739826e-06, "loss": 0.0438, "step": 2386 }, { "epoch": 1.1464937560038424, "grad_norm": 0.5851325002322023, "learning_rate": 7.765204034753098e-06, "loss": 0.0397, "step": 2387 }, { "epoch": 1.1469740634005763, "grad_norm": 0.2841895222411613, "learning_rate": 7.76287534199313e-06, "loss": 0.0163, "step": 2388 }, { "epoch": 1.1474543707973104, "grad_norm": 0.2842068980580675, "learning_rate": 7.76054578618734e-06, "loss": 0.0261, "step": 2389 }, { "epoch": 1.1479346781940443, "grad_norm": 0.4495098779135193, "learning_rate": 7.758215368063418e-06, "loss": 0.0256, "step": 2390 }, { "epoch": 1.1484149855907781, "grad_norm": 0.3442267620329867, "learning_rate": 7.755884088349324e-06, "loss": 0.0236, "step": 2391 }, { "epoch": 1.148895292987512, "grad_norm": 0.5458880230475651, "learning_rate": 7.753551947773282e-06, "loss": 0.0786, "step": 2392 }, { "epoch": 1.149375600384246, "grad_norm": 0.6157443206300256, "learning_rate": 7.751218947063788e-06, "loss": 0.0326, "step": 2393 }, { "epoch": 1.1498559077809798, "grad_norm": 0.4006081822103283, "learning_rate": 7.74888508694961e-06, "loss": 0.0403, "step": 2394 }, { "epoch": 1.1503362151777137, "grad_norm": 0.5819915511575596, "learning_rate": 7.746550368159778e-06, "loss": 0.054, "step": 2395 }, { "epoch": 1.1508165225744476, "grad_norm": 0.2833566184286292, "learning_rate": 7.744214791423597e-06, "loss": 0.0225, "step": 2396 }, { "epoch": 1.1512968299711814, "grad_norm": 0.5953075576231542, "learning_rate": 7.741878357470634e-06, "loss": 0.0512, "step": 2397 }, { "epoch": 1.1517771373679155, "grad_norm": 0.38402265897070875, "learning_rate": 7.739541067030726e-06, "loss": 0.0256, "step": 2398 }, { "epoch": 1.1522574447646494, "grad_norm": 0.40593681912525176, "learning_rate": 7.73720292083398e-06, "loss": 0.0317, "step": 2399 }, { "epoch": 1.1527377521613833, "grad_norm": 0.3597794019869164, "learning_rate": 7.73486391961077e-06, "loss": 0.0276, "step": 2400 }, { "epoch": 1.1532180595581172, "grad_norm": 0.409941608804132, "learning_rate": 7.732524064091729e-06, "loss": 0.0314, "step": 2401 }, { "epoch": 1.153698366954851, "grad_norm": 0.43908591052721524, "learning_rate": 7.730183355007767e-06, "loss": 0.0362, "step": 2402 }, { "epoch": 1.154178674351585, "grad_norm": 0.595461176307672, "learning_rate": 7.727841793090058e-06, "loss": 0.0378, "step": 2403 }, { "epoch": 1.154658981748319, "grad_norm": 0.40203636522366276, "learning_rate": 7.725499379070039e-06, "loss": 0.038, "step": 2404 }, { "epoch": 1.155139289145053, "grad_norm": 0.4036375362142536, "learning_rate": 7.723156113679415e-06, "loss": 0.0292, "step": 2405 }, { "epoch": 1.1556195965417868, "grad_norm": 0.9620310461628728, "learning_rate": 7.72081199765016e-06, "loss": 0.0379, "step": 2406 }, { "epoch": 1.1560999039385207, "grad_norm": 0.5427325029430672, "learning_rate": 7.718467031714506e-06, "loss": 0.0354, "step": 2407 }, { "epoch": 1.1565802113352546, "grad_norm": 0.3295342015472228, "learning_rate": 7.71612121660496e-06, "loss": 0.0283, "step": 2408 }, { "epoch": 1.1570605187319885, "grad_norm": 0.4598217627622351, "learning_rate": 7.713774553054289e-06, "loss": 0.0344, "step": 2409 }, { "epoch": 1.1575408261287223, "grad_norm": 0.4401095501987976, "learning_rate": 7.711427041795525e-06, "loss": 0.0329, "step": 2410 }, { "epoch": 1.1580211335254562, "grad_norm": 0.4300024127161751, "learning_rate": 7.709078683561965e-06, "loss": 0.0358, "step": 2411 }, { "epoch": 1.15850144092219, "grad_norm": 0.2937252637413703, "learning_rate": 7.706729479087171e-06, "loss": 0.0209, "step": 2412 }, { "epoch": 1.1589817483189242, "grad_norm": 0.47557904321613953, "learning_rate": 7.704379429104973e-06, "loss": 0.0265, "step": 2413 }, { "epoch": 1.159462055715658, "grad_norm": 0.46781403061599747, "learning_rate": 7.702028534349457e-06, "loss": 0.039, "step": 2414 }, { "epoch": 1.159942363112392, "grad_norm": 0.43489943273068127, "learning_rate": 7.69967679555498e-06, "loss": 0.0351, "step": 2415 }, { "epoch": 1.1604226705091258, "grad_norm": 0.5514888794461481, "learning_rate": 7.697324213456159e-06, "loss": 0.0487, "step": 2416 }, { "epoch": 1.1609029779058597, "grad_norm": 0.422019823868367, "learning_rate": 7.694970788787877e-06, "loss": 0.0291, "step": 2417 }, { "epoch": 1.1613832853025936, "grad_norm": 0.5038413332528168, "learning_rate": 7.692616522285278e-06, "loss": 0.0496, "step": 2418 }, { "epoch": 1.1618635926993275, "grad_norm": 0.6699567766729324, "learning_rate": 7.690261414683768e-06, "loss": 0.0297, "step": 2419 }, { "epoch": 1.1623439000960616, "grad_norm": 0.38043806604258107, "learning_rate": 7.687905466719022e-06, "loss": 0.0323, "step": 2420 }, { "epoch": 1.1628242074927955, "grad_norm": 0.3914611539380208, "learning_rate": 7.685548679126967e-06, "loss": 0.0401, "step": 2421 }, { "epoch": 1.1633045148895294, "grad_norm": 0.45568325828138156, "learning_rate": 7.683191052643802e-06, "loss": 0.0427, "step": 2422 }, { "epoch": 1.1637848222862632, "grad_norm": 0.4676088744859122, "learning_rate": 7.680832588005985e-06, "loss": 0.0411, "step": 2423 }, { "epoch": 1.1642651296829971, "grad_norm": 0.45421067379493585, "learning_rate": 7.678473285950233e-06, "loss": 0.03, "step": 2424 }, { "epoch": 1.164745437079731, "grad_norm": 0.32219502387578125, "learning_rate": 7.676113147213526e-06, "loss": 0.0281, "step": 2425 }, { "epoch": 1.1652257444764649, "grad_norm": 0.3856998667599777, "learning_rate": 7.673752172533108e-06, "loss": 0.0309, "step": 2426 }, { "epoch": 1.1657060518731988, "grad_norm": 0.5020275739731586, "learning_rate": 7.671390362646482e-06, "loss": 0.0433, "step": 2427 }, { "epoch": 1.1661863592699326, "grad_norm": 0.35810691820615814, "learning_rate": 7.669027718291413e-06, "loss": 0.0258, "step": 2428 }, { "epoch": 1.1666666666666667, "grad_norm": 0.414390114672223, "learning_rate": 7.666664240205922e-06, "loss": 0.0308, "step": 2429 }, { "epoch": 1.1671469740634006, "grad_norm": 0.36060018657955184, "learning_rate": 7.664299929128296e-06, "loss": 0.0325, "step": 2430 }, { "epoch": 1.1676272814601345, "grad_norm": 0.39743555828886684, "learning_rate": 7.661934785797083e-06, "loss": 0.0401, "step": 2431 }, { "epoch": 1.1681075888568684, "grad_norm": 0.3585049744568902, "learning_rate": 7.659568810951086e-06, "loss": 0.0374, "step": 2432 }, { "epoch": 1.1685878962536023, "grad_norm": 0.583205745491999, "learning_rate": 7.657202005329371e-06, "loss": 0.0484, "step": 2433 }, { "epoch": 1.1690682036503361, "grad_norm": 0.624131879847695, "learning_rate": 7.65483436967126e-06, "loss": 0.035, "step": 2434 }, { "epoch": 1.1695485110470702, "grad_norm": 0.3785298821127292, "learning_rate": 7.652465904716339e-06, "loss": 0.0419, "step": 2435 }, { "epoch": 1.1700288184438041, "grad_norm": 0.5888337550214988, "learning_rate": 7.650096611204452e-06, "loss": 0.0671, "step": 2436 }, { "epoch": 1.170509125840538, "grad_norm": 0.3912399077618036, "learning_rate": 7.6477264898757e-06, "loss": 0.0359, "step": 2437 }, { "epoch": 1.170989433237272, "grad_norm": 0.4806420238993042, "learning_rate": 7.645355541470441e-06, "loss": 0.0348, "step": 2438 }, { "epoch": 1.1714697406340058, "grad_norm": 0.4455573783323548, "learning_rate": 7.642983766729297e-06, "loss": 0.0397, "step": 2439 }, { "epoch": 1.1719500480307397, "grad_norm": 0.4916600258187127, "learning_rate": 7.640611166393142e-06, "loss": 0.0442, "step": 2440 }, { "epoch": 1.1724303554274735, "grad_norm": 0.45519613776310275, "learning_rate": 7.638237741203113e-06, "loss": 0.0307, "step": 2441 }, { "epoch": 1.1729106628242074, "grad_norm": 0.5507147440643265, "learning_rate": 7.6358634919006e-06, "loss": 0.0429, "step": 2442 }, { "epoch": 1.1733909702209413, "grad_norm": 0.3952068960311734, "learning_rate": 7.633488419227256e-06, "loss": 0.0231, "step": 2443 }, { "epoch": 1.1738712776176754, "grad_norm": 1.0086346909387622, "learning_rate": 7.631112523924986e-06, "loss": 0.0442, "step": 2444 }, { "epoch": 1.1743515850144093, "grad_norm": 0.7153718371899989, "learning_rate": 7.628735806735954e-06, "loss": 0.0418, "step": 2445 }, { "epoch": 1.1748318924111432, "grad_norm": 0.6188127559600229, "learning_rate": 7.6263582684025805e-06, "loss": 0.0362, "step": 2446 }, { "epoch": 1.175312199807877, "grad_norm": 0.38861617826142886, "learning_rate": 7.6239799096675425e-06, "loss": 0.0385, "step": 2447 }, { "epoch": 1.175792507204611, "grad_norm": 0.5544832025118211, "learning_rate": 7.621600731273774e-06, "loss": 0.0464, "step": 2448 }, { "epoch": 1.1762728146013448, "grad_norm": 0.42227774690004555, "learning_rate": 7.619220733964465e-06, "loss": 0.0297, "step": 2449 }, { "epoch": 1.1767531219980787, "grad_norm": 0.3099597447135775, "learning_rate": 7.616839918483061e-06, "loss": 0.0255, "step": 2450 }, { "epoch": 1.1772334293948128, "grad_norm": 0.5322318857625717, "learning_rate": 7.614458285573262e-06, "loss": 0.0263, "step": 2451 }, { "epoch": 1.1777137367915467, "grad_norm": 0.4172811212713302, "learning_rate": 7.612075835979023e-06, "loss": 0.0233, "step": 2452 }, { "epoch": 1.1781940441882806, "grad_norm": 0.5067613506593434, "learning_rate": 7.609692570444558e-06, "loss": 0.0391, "step": 2453 }, { "epoch": 1.1786743515850144, "grad_norm": 0.3986949222270931, "learning_rate": 7.607308489714332e-06, "loss": 0.0318, "step": 2454 }, { "epoch": 1.1791546589817483, "grad_norm": 0.4366425208439947, "learning_rate": 7.604923594533067e-06, "loss": 0.0311, "step": 2455 }, { "epoch": 1.1796349663784822, "grad_norm": 0.6413891115610021, "learning_rate": 7.602537885645735e-06, "loss": 0.0253, "step": 2456 }, { "epoch": 1.180115273775216, "grad_norm": 0.4389191962267233, "learning_rate": 7.600151363797569e-06, "loss": 0.0357, "step": 2457 }, { "epoch": 1.18059558117195, "grad_norm": 0.5682847765205594, "learning_rate": 7.597764029734051e-06, "loss": 0.0347, "step": 2458 }, { "epoch": 1.1810758885686838, "grad_norm": 0.551053582917074, "learning_rate": 7.595375884200917e-06, "loss": 0.0345, "step": 2459 }, { "epoch": 1.181556195965418, "grad_norm": 0.4342418830003905, "learning_rate": 7.59298692794416e-06, "loss": 0.0339, "step": 2460 }, { "epoch": 1.1820365033621518, "grad_norm": 0.34219348953526746, "learning_rate": 7.590597161710023e-06, "loss": 0.028, "step": 2461 }, { "epoch": 1.1825168107588857, "grad_norm": 0.5919641615807635, "learning_rate": 7.588206586245001e-06, "loss": 0.035, "step": 2462 }, { "epoch": 1.1829971181556196, "grad_norm": 0.27855634331739165, "learning_rate": 7.585815202295845e-06, "loss": 0.0285, "step": 2463 }, { "epoch": 1.1834774255523535, "grad_norm": 0.48803756726465664, "learning_rate": 7.583423010609558e-06, "loss": 0.0373, "step": 2464 }, { "epoch": 1.1839577329490873, "grad_norm": 0.37762379715566896, "learning_rate": 7.581030011933394e-06, "loss": 0.0271, "step": 2465 }, { "epoch": 1.1844380403458212, "grad_norm": 0.4365301314414535, "learning_rate": 7.57863620701486e-06, "loss": 0.0387, "step": 2466 }, { "epoch": 1.1849183477425553, "grad_norm": 0.3842989665489749, "learning_rate": 7.576241596601712e-06, "loss": 0.0365, "step": 2467 }, { "epoch": 1.1853986551392892, "grad_norm": 0.30935133033723167, "learning_rate": 7.573846181441964e-06, "loss": 0.0224, "step": 2468 }, { "epoch": 1.185878962536023, "grad_norm": 0.32680365340459383, "learning_rate": 7.571449962283874e-06, "loss": 0.0212, "step": 2469 }, { "epoch": 1.186359269932757, "grad_norm": 0.3523506808924624, "learning_rate": 7.569052939875954e-06, "loss": 0.0345, "step": 2470 }, { "epoch": 1.1868395773294909, "grad_norm": 0.5103417278090306, "learning_rate": 7.566655114966971e-06, "loss": 0.0354, "step": 2471 }, { "epoch": 1.1873198847262247, "grad_norm": 0.889847313657782, "learning_rate": 7.5642564883059376e-06, "loss": 0.0377, "step": 2472 }, { "epoch": 1.1878001921229586, "grad_norm": 0.47244346469206266, "learning_rate": 7.56185706064212e-06, "loss": 0.0411, "step": 2473 }, { "epoch": 1.1882804995196925, "grad_norm": 0.4111151628594079, "learning_rate": 7.5594568327250275e-06, "loss": 0.032, "step": 2474 }, { "epoch": 1.1887608069164266, "grad_norm": 0.5192796009944411, "learning_rate": 7.557055805304432e-06, "loss": 0.0382, "step": 2475 }, { "epoch": 1.1892411143131605, "grad_norm": 0.4412499923832414, "learning_rate": 7.554653979130342e-06, "loss": 0.0247, "step": 2476 }, { "epoch": 1.1897214217098944, "grad_norm": 0.5455703543637357, "learning_rate": 7.5522513549530264e-06, "loss": 0.0406, "step": 2477 }, { "epoch": 1.1902017291066282, "grad_norm": 0.578708320230323, "learning_rate": 7.549847933522996e-06, "loss": 0.0392, "step": 2478 }, { "epoch": 1.1906820365033621, "grad_norm": 0.4432237986235382, "learning_rate": 7.547443715591014e-06, "loss": 0.0322, "step": 2479 }, { "epoch": 1.191162343900096, "grad_norm": 0.5151778002177907, "learning_rate": 7.54503870190809e-06, "loss": 0.0275, "step": 2480 }, { "epoch": 1.19164265129683, "grad_norm": 0.6652691695787023, "learning_rate": 7.5426328932254855e-06, "loss": 0.0381, "step": 2481 }, { "epoch": 1.192122958693564, "grad_norm": 0.41380998589755885, "learning_rate": 7.540226290294708e-06, "loss": 0.0269, "step": 2482 }, { "epoch": 1.1926032660902979, "grad_norm": 0.43638492545272434, "learning_rate": 7.537818893867513e-06, "loss": 0.0286, "step": 2483 }, { "epoch": 1.1930835734870318, "grad_norm": 0.3768097436189252, "learning_rate": 7.535410704695907e-06, "loss": 0.0263, "step": 2484 }, { "epoch": 1.1935638808837656, "grad_norm": 0.3861770478061206, "learning_rate": 7.533001723532136e-06, "loss": 0.0335, "step": 2485 }, { "epoch": 1.1940441882804995, "grad_norm": 0.42918222522366517, "learning_rate": 7.530591951128707e-06, "loss": 0.0622, "step": 2486 }, { "epoch": 1.1945244956772334, "grad_norm": 0.46081378478395063, "learning_rate": 7.52818138823836e-06, "loss": 0.0269, "step": 2487 }, { "epoch": 1.1950048030739673, "grad_norm": 0.3715986732924598, "learning_rate": 7.525770035614093e-06, "loss": 0.0258, "step": 2488 }, { "epoch": 1.1954851104707012, "grad_norm": 0.7044742014737356, "learning_rate": 7.52335789400914e-06, "loss": 0.0309, "step": 2489 }, { "epoch": 1.195965417867435, "grad_norm": 0.3891801811161489, "learning_rate": 7.52094496417699e-06, "loss": 0.0273, "step": 2490 }, { "epoch": 1.1964457252641691, "grad_norm": 0.46278966248909253, "learning_rate": 7.5185312468713785e-06, "loss": 0.0393, "step": 2491 }, { "epoch": 1.196926032660903, "grad_norm": 0.4890898235928551, "learning_rate": 7.51611674284628e-06, "loss": 0.0325, "step": 2492 }, { "epoch": 1.197406340057637, "grad_norm": 0.5046403344584116, "learning_rate": 7.513701452855921e-06, "loss": 0.0456, "step": 2493 }, { "epoch": 1.1978866474543708, "grad_norm": 0.6095352851600957, "learning_rate": 7.511285377654771e-06, "loss": 0.0413, "step": 2494 }, { "epoch": 1.1983669548511047, "grad_norm": 0.2922692070628431, "learning_rate": 7.508868517997544e-06, "loss": 0.0232, "step": 2495 }, { "epoch": 1.1988472622478386, "grad_norm": 0.5168164184185329, "learning_rate": 7.506450874639202e-06, "loss": 0.0301, "step": 2496 }, { "epoch": 1.1993275696445724, "grad_norm": 0.6287428527140615, "learning_rate": 7.504032448334946e-06, "loss": 0.0339, "step": 2497 }, { "epoch": 1.1998078770413065, "grad_norm": 0.3987888783684513, "learning_rate": 7.501613239840229e-06, "loss": 0.0296, "step": 2498 }, { "epoch": 1.2002881844380404, "grad_norm": 0.735192650988037, "learning_rate": 7.499193249910746e-06, "loss": 0.0296, "step": 2499 }, { "epoch": 1.2007684918347743, "grad_norm": 0.3299278872824402, "learning_rate": 7.49677247930243e-06, "loss": 0.0329, "step": 2500 }, { "epoch": 1.2012487992315082, "grad_norm": 0.5791096873906328, "learning_rate": 7.494350928771466e-06, "loss": 0.0444, "step": 2501 }, { "epoch": 1.201729106628242, "grad_norm": 0.344684539714487, "learning_rate": 7.491928599074281e-06, "loss": 0.0244, "step": 2502 }, { "epoch": 1.202209414024976, "grad_norm": 0.37089245939662396, "learning_rate": 7.489505490967538e-06, "loss": 0.0336, "step": 2503 }, { "epoch": 1.2026897214217098, "grad_norm": 0.39470374942676023, "learning_rate": 7.487081605208157e-06, "loss": 0.0233, "step": 2504 }, { "epoch": 1.2031700288184437, "grad_norm": 0.5215168405731028, "learning_rate": 7.484656942553286e-06, "loss": 0.0347, "step": 2505 }, { "epoch": 1.2036503362151778, "grad_norm": 0.3631322154130452, "learning_rate": 7.4822315037603245e-06, "loss": 0.0257, "step": 2506 }, { "epoch": 1.2041306436119117, "grad_norm": 0.35859475167635774, "learning_rate": 7.479805289586913e-06, "loss": 0.0298, "step": 2507 }, { "epoch": 1.2046109510086456, "grad_norm": 0.46149610107216354, "learning_rate": 7.477378300790935e-06, "loss": 0.041, "step": 2508 }, { "epoch": 1.2050912584053795, "grad_norm": 0.4539051434032434, "learning_rate": 7.474950538130513e-06, "loss": 0.0275, "step": 2509 }, { "epoch": 1.2055715658021133, "grad_norm": 0.4374021214238443, "learning_rate": 7.472522002364013e-06, "loss": 0.0373, "step": 2510 }, { "epoch": 1.2060518731988472, "grad_norm": 0.3953416823788581, "learning_rate": 7.470092694250043e-06, "loss": 0.0218, "step": 2511 }, { "epoch": 1.206532180595581, "grad_norm": 0.3546083739512667, "learning_rate": 7.467662614547451e-06, "loss": 0.03, "step": 2512 }, { "epoch": 1.2070124879923152, "grad_norm": 0.3889623392275641, "learning_rate": 7.465231764015326e-06, "loss": 0.0316, "step": 2513 }, { "epoch": 1.207492795389049, "grad_norm": 0.5392184818914386, "learning_rate": 7.462800143413001e-06, "loss": 0.0461, "step": 2514 }, { "epoch": 1.207973102785783, "grad_norm": 0.4842412231696042, "learning_rate": 7.460367753500045e-06, "loss": 0.0371, "step": 2515 }, { "epoch": 1.2084534101825168, "grad_norm": 0.4041183344416958, "learning_rate": 7.4579345950362695e-06, "loss": 0.0215, "step": 2516 }, { "epoch": 1.2089337175792507, "grad_norm": 0.3103442033441776, "learning_rate": 7.455500668781725e-06, "loss": 0.0185, "step": 2517 }, { "epoch": 1.2094140249759846, "grad_norm": 0.4237163707507035, "learning_rate": 7.453065975496706e-06, "loss": 0.0375, "step": 2518 }, { "epoch": 1.2098943323727185, "grad_norm": 0.46062942498059556, "learning_rate": 7.45063051594174e-06, "loss": 0.0418, "step": 2519 }, { "epoch": 1.2103746397694524, "grad_norm": 0.3979234248274906, "learning_rate": 7.4481942908775985e-06, "loss": 0.0338, "step": 2520 }, { "epoch": 1.2108549471661862, "grad_norm": 0.532099857246016, "learning_rate": 7.445757301065291e-06, "loss": 0.0401, "step": 2521 }, { "epoch": 1.2113352545629203, "grad_norm": 0.38263776848905523, "learning_rate": 7.443319547266064e-06, "loss": 0.0289, "step": 2522 }, { "epoch": 1.2118155619596542, "grad_norm": 0.43633975656029783, "learning_rate": 7.440881030241407e-06, "loss": 0.0357, "step": 2523 }, { "epoch": 1.2122958693563881, "grad_norm": 0.3883671427263807, "learning_rate": 7.438441750753045e-06, "loss": 0.028, "step": 2524 }, { "epoch": 1.212776176753122, "grad_norm": 0.39002194568390597, "learning_rate": 7.436001709562937e-06, "loss": 0.0275, "step": 2525 }, { "epoch": 1.2132564841498559, "grad_norm": 0.3970895964480513, "learning_rate": 7.43356090743329e-06, "loss": 0.0251, "step": 2526 }, { "epoch": 1.2137367915465898, "grad_norm": 0.6377061411632488, "learning_rate": 7.431119345126542e-06, "loss": 0.0585, "step": 2527 }, { "epoch": 1.2142170989433236, "grad_norm": 0.32809513857482375, "learning_rate": 7.428677023405366e-06, "loss": 0.0231, "step": 2528 }, { "epoch": 1.2146974063400577, "grad_norm": 0.32697362069876706, "learning_rate": 7.426233943032679e-06, "loss": 0.0261, "step": 2529 }, { "epoch": 1.2151777137367916, "grad_norm": 0.4317029099118679, "learning_rate": 7.4237901047716306e-06, "loss": 0.0386, "step": 2530 }, { "epoch": 1.2156580211335255, "grad_norm": 0.3933600040512097, "learning_rate": 7.4213455093856094e-06, "loss": 0.0227, "step": 2531 }, { "epoch": 1.2161383285302594, "grad_norm": 0.5704184986865041, "learning_rate": 7.418900157638238e-06, "loss": 0.0329, "step": 2532 }, { "epoch": 1.2166186359269933, "grad_norm": 0.3956530449423525, "learning_rate": 7.416454050293376e-06, "loss": 0.0315, "step": 2533 }, { "epoch": 1.2170989433237271, "grad_norm": 0.8718461293268891, "learning_rate": 7.414007188115121e-06, "loss": 0.0242, "step": 2534 }, { "epoch": 1.217579250720461, "grad_norm": 0.37602328948229724, "learning_rate": 7.411559571867806e-06, "loss": 0.0285, "step": 2535 }, { "epoch": 1.218059558117195, "grad_norm": 0.3812841309758368, "learning_rate": 7.409111202315996e-06, "loss": 0.0331, "step": 2536 }, { "epoch": 1.218539865513929, "grad_norm": 0.3840386185823107, "learning_rate": 7.406662080224496e-06, "loss": 0.0284, "step": 2537 }, { "epoch": 1.219020172910663, "grad_norm": 0.30919574089565943, "learning_rate": 7.404212206358343e-06, "loss": 0.0212, "step": 2538 }, { "epoch": 1.2195004803073968, "grad_norm": 0.37481691908305437, "learning_rate": 7.40176158148281e-06, "loss": 0.0279, "step": 2539 }, { "epoch": 1.2199807877041307, "grad_norm": 0.5191420141412276, "learning_rate": 7.3993102063634055e-06, "loss": 0.0345, "step": 2540 }, { "epoch": 1.2204610951008645, "grad_norm": 0.362958175014462, "learning_rate": 7.3968580817658696e-06, "loss": 0.0322, "step": 2541 }, { "epoch": 1.2209414024975984, "grad_norm": 0.4427558300372852, "learning_rate": 7.394405208456179e-06, "loss": 0.028, "step": 2542 }, { "epoch": 1.2214217098943323, "grad_norm": 0.4474223375207176, "learning_rate": 7.391951587200543e-06, "loss": 0.0273, "step": 2543 }, { "epoch": 1.2219020172910664, "grad_norm": 0.5002398340666806, "learning_rate": 7.389497218765404e-06, "loss": 0.0425, "step": 2544 }, { "epoch": 1.2223823246878003, "grad_norm": 0.36228420369640385, "learning_rate": 7.387042103917443e-06, "loss": 0.0283, "step": 2545 }, { "epoch": 1.2228626320845342, "grad_norm": 0.38577700658024644, "learning_rate": 7.384586243423566e-06, "loss": 0.0212, "step": 2546 }, { "epoch": 1.223342939481268, "grad_norm": 0.4858903367119634, "learning_rate": 7.3821296380509145e-06, "loss": 0.0384, "step": 2547 }, { "epoch": 1.223823246878002, "grad_norm": 0.39271100252780944, "learning_rate": 7.379672288566869e-06, "loss": 0.0346, "step": 2548 }, { "epoch": 1.2243035542747358, "grad_norm": 0.3466801197709512, "learning_rate": 7.377214195739034e-06, "loss": 0.0309, "step": 2549 }, { "epoch": 1.2247838616714697, "grad_norm": 0.6090419397059588, "learning_rate": 7.374755360335253e-06, "loss": 0.0419, "step": 2550 }, { "epoch": 1.2252641690682036, "grad_norm": 0.37876811135442856, "learning_rate": 7.372295783123593e-06, "loss": 0.0341, "step": 2551 }, { "epoch": 1.2257444764649374, "grad_norm": 0.38986678372487205, "learning_rate": 7.369835464872361e-06, "loss": 0.0296, "step": 2552 }, { "epoch": 1.2262247838616716, "grad_norm": 0.3620059870954491, "learning_rate": 7.367374406350094e-06, "loss": 0.0288, "step": 2553 }, { "epoch": 1.2267050912584054, "grad_norm": 0.4586548302292265, "learning_rate": 7.364912608325555e-06, "loss": 0.0327, "step": 2554 }, { "epoch": 1.2271853986551393, "grad_norm": 0.3269381152697729, "learning_rate": 7.362450071567744e-06, "loss": 0.0229, "step": 2555 }, { "epoch": 1.2276657060518732, "grad_norm": 0.8860426371109142, "learning_rate": 7.359986796845888e-06, "loss": 0.0505, "step": 2556 }, { "epoch": 1.228146013448607, "grad_norm": 0.4391563141390525, "learning_rate": 7.3575227849294475e-06, "loss": 0.0351, "step": 2557 }, { "epoch": 1.228626320845341, "grad_norm": 0.43212747149070707, "learning_rate": 7.355058036588111e-06, "loss": 0.0338, "step": 2558 }, { "epoch": 1.2291066282420748, "grad_norm": 0.3662882031219981, "learning_rate": 7.352592552591796e-06, "loss": 0.0245, "step": 2559 }, { "epoch": 1.229586935638809, "grad_norm": 0.37058260645463087, "learning_rate": 7.350126333710653e-06, "loss": 0.0291, "step": 2560 }, { "epoch": 1.2300672430355428, "grad_norm": 0.45082499363265327, "learning_rate": 7.3476593807150606e-06, "loss": 0.0485, "step": 2561 }, { "epoch": 1.2305475504322767, "grad_norm": 0.3220648453832103, "learning_rate": 7.345191694375626e-06, "loss": 0.0292, "step": 2562 }, { "epoch": 1.2310278578290106, "grad_norm": 0.5030246439109646, "learning_rate": 7.342723275463188e-06, "loss": 0.0358, "step": 2563 }, { "epoch": 1.2315081652257445, "grad_norm": 0.5868734043625342, "learning_rate": 7.340254124748811e-06, "loss": 0.027, "step": 2564 }, { "epoch": 1.2319884726224783, "grad_norm": 0.4700389532667564, "learning_rate": 7.337784243003788e-06, "loss": 0.0506, "step": 2565 }, { "epoch": 1.2324687800192122, "grad_norm": 0.35490462183924243, "learning_rate": 7.335313630999643e-06, "loss": 0.0291, "step": 2566 }, { "epoch": 1.232949087415946, "grad_norm": 0.38881945592086803, "learning_rate": 7.332842289508127e-06, "loss": 0.0323, "step": 2567 }, { "epoch": 1.23342939481268, "grad_norm": 0.28903444477986073, "learning_rate": 7.330370219301222e-06, "loss": 0.0233, "step": 2568 }, { "epoch": 1.233909702209414, "grad_norm": 0.3838283978782329, "learning_rate": 7.327897421151126e-06, "loss": 0.0349, "step": 2569 }, { "epoch": 1.234390009606148, "grad_norm": 0.3992218295333178, "learning_rate": 7.3254238958302805e-06, "loss": 0.0298, "step": 2570 }, { "epoch": 1.2348703170028819, "grad_norm": 0.3551034236734177, "learning_rate": 7.322949644111343e-06, "loss": 0.032, "step": 2571 }, { "epoch": 1.2353506243996157, "grad_norm": 0.43628684623838837, "learning_rate": 7.320474666767201e-06, "loss": 0.029, "step": 2572 }, { "epoch": 1.2358309317963496, "grad_norm": 0.5713799427339364, "learning_rate": 7.3179989645709715e-06, "loss": 0.0419, "step": 2573 }, { "epoch": 1.2363112391930835, "grad_norm": 0.3236154503063247, "learning_rate": 7.315522538295993e-06, "loss": 0.0293, "step": 2574 }, { "epoch": 1.2367915465898176, "grad_norm": 0.42170898330714474, "learning_rate": 7.3130453887158335e-06, "loss": 0.0282, "step": 2575 }, { "epoch": 1.2372718539865515, "grad_norm": 0.4713245733992388, "learning_rate": 7.3105675166042854e-06, "loss": 0.0394, "step": 2576 }, { "epoch": 1.2377521613832854, "grad_norm": 0.6328231076727163, "learning_rate": 7.3080889227353675e-06, "loss": 0.0334, "step": 2577 }, { "epoch": 1.2382324687800192, "grad_norm": 0.41537444428538967, "learning_rate": 7.305609607883325e-06, "loss": 0.0343, "step": 2578 }, { "epoch": 1.2387127761767531, "grad_norm": 0.46728893039200914, "learning_rate": 7.303129572822626e-06, "loss": 0.0271, "step": 2579 }, { "epoch": 1.239193083573487, "grad_norm": 0.4764851833211291, "learning_rate": 7.300648818327964e-06, "loss": 0.0526, "step": 2580 }, { "epoch": 1.239673390970221, "grad_norm": 0.2877441196553634, "learning_rate": 7.298167345174262e-06, "loss": 0.0272, "step": 2581 }, { "epoch": 1.2401536983669548, "grad_norm": 0.4080410434456886, "learning_rate": 7.295685154136659e-06, "loss": 0.0353, "step": 2582 }, { "epoch": 1.2406340057636887, "grad_norm": 0.41496870399042607, "learning_rate": 7.293202245990526e-06, "loss": 0.0366, "step": 2583 }, { "epoch": 1.2411143131604228, "grad_norm": 0.34357188525908244, "learning_rate": 7.290718621511452e-06, "loss": 0.0279, "step": 2584 }, { "epoch": 1.2415946205571566, "grad_norm": 0.43260972026689254, "learning_rate": 7.288234281475255e-06, "loss": 0.0278, "step": 2585 }, { "epoch": 1.2420749279538905, "grad_norm": 0.44623083035611033, "learning_rate": 7.285749226657974e-06, "loss": 0.0349, "step": 2586 }, { "epoch": 1.2425552353506244, "grad_norm": 0.36980670326968385, "learning_rate": 7.283263457835871e-06, "loss": 0.0358, "step": 2587 }, { "epoch": 1.2430355427473583, "grad_norm": 0.47967418322077227, "learning_rate": 7.280776975785429e-06, "loss": 0.0272, "step": 2588 }, { "epoch": 1.2435158501440922, "grad_norm": 0.5215949433659128, "learning_rate": 7.278289781283358e-06, "loss": 0.0251, "step": 2589 }, { "epoch": 1.243996157540826, "grad_norm": 0.3552640348106672, "learning_rate": 7.2758018751065915e-06, "loss": 0.0246, "step": 2590 }, { "epoch": 1.2444764649375601, "grad_norm": 0.3354633348765226, "learning_rate": 7.273313258032279e-06, "loss": 0.0322, "step": 2591 }, { "epoch": 1.244956772334294, "grad_norm": 0.3843479056375301, "learning_rate": 7.270823930837796e-06, "loss": 0.0341, "step": 2592 }, { "epoch": 1.245437079731028, "grad_norm": 0.47449248797804777, "learning_rate": 7.268333894300741e-06, "loss": 0.0411, "step": 2593 }, { "epoch": 1.2459173871277618, "grad_norm": 0.42843421536800386, "learning_rate": 7.265843149198931e-06, "loss": 0.0332, "step": 2594 }, { "epoch": 1.2463976945244957, "grad_norm": 0.36459728610802966, "learning_rate": 7.263351696310408e-06, "loss": 0.0338, "step": 2595 }, { "epoch": 1.2468780019212296, "grad_norm": 0.507911971734832, "learning_rate": 7.260859536413429e-06, "loss": 0.0358, "step": 2596 }, { "epoch": 1.2473583093179634, "grad_norm": 0.2707751049729295, "learning_rate": 7.258366670286481e-06, "loss": 0.0292, "step": 2597 }, { "epoch": 1.2478386167146973, "grad_norm": 0.2893162535369668, "learning_rate": 7.255873098708262e-06, "loss": 0.0268, "step": 2598 }, { "epoch": 1.2483189241114312, "grad_norm": 0.4642048128134999, "learning_rate": 7.253378822457696e-06, "loss": 0.0278, "step": 2599 }, { "epoch": 1.2487992315081653, "grad_norm": 0.3448212831669026, "learning_rate": 7.250883842313927e-06, "loss": 0.0369, "step": 2600 }, { "epoch": 1.2492795389048992, "grad_norm": 0.2503606400659592, "learning_rate": 7.248388159056317e-06, "loss": 0.0194, "step": 2601 }, { "epoch": 1.249759846301633, "grad_norm": 0.29981970525831303, "learning_rate": 7.2458917734644486e-06, "loss": 0.0184, "step": 2602 }, { "epoch": 1.250240153698367, "grad_norm": 0.5598888200247213, "learning_rate": 7.243394686318124e-06, "loss": 0.0375, "step": 2603 }, { "epoch": 1.2507204610951008, "grad_norm": 0.31130302353418193, "learning_rate": 7.2408968983973634e-06, "loss": 0.0207, "step": 2604 }, { "epoch": 1.2512007684918347, "grad_norm": 0.46789107222542337, "learning_rate": 7.238398410482408e-06, "loss": 0.0345, "step": 2605 }, { "epoch": 1.2516810758885688, "grad_norm": 0.3951716757752129, "learning_rate": 7.235899223353713e-06, "loss": 0.0405, "step": 2606 }, { "epoch": 1.2521613832853027, "grad_norm": 0.3810429482440259, "learning_rate": 7.2333993377919585e-06, "loss": 0.0226, "step": 2607 }, { "epoch": 1.2526416906820366, "grad_norm": 0.3451585313920329, "learning_rate": 7.230898754578038e-06, "loss": 0.0247, "step": 2608 }, { "epoch": 1.2531219980787704, "grad_norm": 1.1538097513850596, "learning_rate": 7.228397474493067e-06, "loss": 0.0696, "step": 2609 }, { "epoch": 1.2536023054755043, "grad_norm": 0.5150205961025892, "learning_rate": 7.225895498318373e-06, "loss": 0.0458, "step": 2610 }, { "epoch": 1.2540826128722382, "grad_norm": 0.3978177298928555, "learning_rate": 7.223392826835506e-06, "loss": 0.0287, "step": 2611 }, { "epoch": 1.254562920268972, "grad_norm": 0.4303203840546095, "learning_rate": 7.220889460826231e-06, "loss": 0.0355, "step": 2612 }, { "epoch": 1.255043227665706, "grad_norm": 0.44911764375307034, "learning_rate": 7.21838540107253e-06, "loss": 0.0398, "step": 2613 }, { "epoch": 1.2555235350624399, "grad_norm": 0.3579478068056246, "learning_rate": 7.215880648356602e-06, "loss": 0.0245, "step": 2614 }, { "epoch": 1.2560038424591737, "grad_norm": 0.47844382701206495, "learning_rate": 7.2133752034608615e-06, "loss": 0.0306, "step": 2615 }, { "epoch": 1.2564841498559078, "grad_norm": 0.36527036915060085, "learning_rate": 7.210869067167942e-06, "loss": 0.0248, "step": 2616 }, { "epoch": 1.2569644572526417, "grad_norm": 0.5055702116678806, "learning_rate": 7.20836224026069e-06, "loss": 0.0617, "step": 2617 }, { "epoch": 1.2574447646493756, "grad_norm": 0.39113927427751083, "learning_rate": 7.20585472352217e-06, "loss": 0.0299, "step": 2618 }, { "epoch": 1.2579250720461095, "grad_norm": 0.34593803787082733, "learning_rate": 7.20334651773566e-06, "loss": 0.0254, "step": 2619 }, { "epoch": 1.2584053794428434, "grad_norm": 0.524280507604877, "learning_rate": 7.200837623684654e-06, "loss": 0.0373, "step": 2620 }, { "epoch": 1.2588856868395775, "grad_norm": 0.3513419638292414, "learning_rate": 7.19832804215286e-06, "loss": 0.0332, "step": 2621 }, { "epoch": 1.2593659942363113, "grad_norm": 0.4294502978775908, "learning_rate": 7.195817773924205e-06, "loss": 0.0371, "step": 2622 }, { "epoch": 1.2598463016330452, "grad_norm": 0.4149092115844474, "learning_rate": 7.193306819782826e-06, "loss": 0.0237, "step": 2623 }, { "epoch": 1.260326609029779, "grad_norm": 0.48205805380322014, "learning_rate": 7.190795180513073e-06, "loss": 0.0256, "step": 2624 }, { "epoch": 1.260806916426513, "grad_norm": 0.9616476961422092, "learning_rate": 7.188282856899516e-06, "loss": 0.0486, "step": 2625 }, { "epoch": 1.2612872238232469, "grad_norm": 0.6011840600047883, "learning_rate": 7.185769849726933e-06, "loss": 0.0473, "step": 2626 }, { "epoch": 1.2617675312199808, "grad_norm": 0.34034689626166237, "learning_rate": 7.183256159780321e-06, "loss": 0.029, "step": 2627 }, { "epoch": 1.2622478386167146, "grad_norm": 0.3920239138169121, "learning_rate": 7.180741787844883e-06, "loss": 0.0361, "step": 2628 }, { "epoch": 1.2627281460134485, "grad_norm": 0.49128285381418685, "learning_rate": 7.178226734706042e-06, "loss": 0.0327, "step": 2629 }, { "epoch": 1.2632084534101824, "grad_norm": 0.44205135164265036, "learning_rate": 7.175711001149431e-06, "loss": 0.0308, "step": 2630 }, { "epoch": 1.2636887608069165, "grad_norm": 0.6018050276574136, "learning_rate": 7.1731945879608955e-06, "loss": 0.0404, "step": 2631 }, { "epoch": 1.2641690682036504, "grad_norm": 0.9281944322517798, "learning_rate": 7.170677495926492e-06, "loss": 0.0443, "step": 2632 }, { "epoch": 1.2646493756003843, "grad_norm": 0.4243092720944124, "learning_rate": 7.168159725832492e-06, "loss": 0.0343, "step": 2633 }, { "epoch": 1.2651296829971181, "grad_norm": 0.36924534510316237, "learning_rate": 7.1656412784653765e-06, "loss": 0.0304, "step": 2634 }, { "epoch": 1.265609990393852, "grad_norm": 0.38192345059486504, "learning_rate": 7.163122154611838e-06, "loss": 0.0232, "step": 2635 }, { "epoch": 1.266090297790586, "grad_norm": 0.4430422846034388, "learning_rate": 7.160602355058782e-06, "loss": 0.0296, "step": 2636 }, { "epoch": 1.26657060518732, "grad_norm": 0.41133247697775455, "learning_rate": 7.158081880593325e-06, "loss": 0.0272, "step": 2637 }, { "epoch": 1.267050912584054, "grad_norm": 0.31727676749063993, "learning_rate": 7.155560732002792e-06, "loss": 0.0256, "step": 2638 }, { "epoch": 1.2675312199807878, "grad_norm": 0.7718554960988421, "learning_rate": 7.15303891007472e-06, "loss": 0.0456, "step": 2639 }, { "epoch": 1.2680115273775217, "grad_norm": 0.4849475256274669, "learning_rate": 7.150516415596859e-06, "loss": 0.0249, "step": 2640 }, { "epoch": 1.2684918347742555, "grad_norm": 0.5427408389711995, "learning_rate": 7.147993249357163e-06, "loss": 0.0223, "step": 2641 }, { "epoch": 1.2689721421709894, "grad_norm": 0.3949693305162911, "learning_rate": 7.145469412143801e-06, "loss": 0.0297, "step": 2642 }, { "epoch": 1.2694524495677233, "grad_norm": 0.6403622949422496, "learning_rate": 7.142944904745149e-06, "loss": 0.0598, "step": 2643 }, { "epoch": 1.2699327569644572, "grad_norm": 0.4930530657320328, "learning_rate": 7.1404197279497945e-06, "loss": 0.0382, "step": 2644 }, { "epoch": 1.270413064361191, "grad_norm": 0.4753802649303946, "learning_rate": 7.137893882546534e-06, "loss": 0.0475, "step": 2645 }, { "epoch": 1.270893371757925, "grad_norm": 0.8453554589570575, "learning_rate": 7.135367369324369e-06, "loss": 0.031, "step": 2646 }, { "epoch": 1.271373679154659, "grad_norm": 0.33631717710784154, "learning_rate": 7.132840189072513e-06, "loss": 0.0367, "step": 2647 }, { "epoch": 1.271853986551393, "grad_norm": 0.3389568635356958, "learning_rate": 7.130312342580389e-06, "loss": 0.0279, "step": 2648 }, { "epoch": 1.2723342939481268, "grad_norm": 0.5755319159504408, "learning_rate": 7.127783830637625e-06, "loss": 0.0429, "step": 2649 }, { "epoch": 1.2728146013448607, "grad_norm": 0.6319511334090538, "learning_rate": 7.125254654034056e-06, "loss": 0.0317, "step": 2650 }, { "epoch": 1.2732949087415946, "grad_norm": 0.4055197399077354, "learning_rate": 7.122724813559732e-06, "loss": 0.0368, "step": 2651 }, { "epoch": 1.2737752161383284, "grad_norm": 0.4194543855037022, "learning_rate": 7.1201943100049e-06, "loss": 0.0283, "step": 2652 }, { "epoch": 1.2742555235350626, "grad_norm": 0.24552395262283644, "learning_rate": 7.117663144160022e-06, "loss": 0.0213, "step": 2653 }, { "epoch": 1.2747358309317964, "grad_norm": 0.3183156772762976, "learning_rate": 7.115131316815763e-06, "loss": 0.0272, "step": 2654 }, { "epoch": 1.2752161383285303, "grad_norm": 0.3310053904694869, "learning_rate": 7.112598828762998e-06, "loss": 0.0303, "step": 2655 }, { "epoch": 1.2756964457252642, "grad_norm": 0.4399054346338161, "learning_rate": 7.110065680792803e-06, "loss": 0.0283, "step": 2656 }, { "epoch": 1.276176753121998, "grad_norm": 0.4362417955347966, "learning_rate": 7.107531873696465e-06, "loss": 0.0353, "step": 2657 }, { "epoch": 1.276657060518732, "grad_norm": 0.38543398300465576, "learning_rate": 7.104997408265477e-06, "loss": 0.0256, "step": 2658 }, { "epoch": 1.2771373679154658, "grad_norm": 0.35862837789194746, "learning_rate": 7.102462285291534e-06, "loss": 0.0314, "step": 2659 }, { "epoch": 1.2776176753121997, "grad_norm": 1.1321553070711072, "learning_rate": 7.099926505566537e-06, "loss": 0.04, "step": 2660 }, { "epoch": 1.2780979827089336, "grad_norm": 0.3136588773732061, "learning_rate": 7.097390069882595e-06, "loss": 0.0255, "step": 2661 }, { "epoch": 1.2785782901056677, "grad_norm": 0.3228329484083442, "learning_rate": 7.094852979032021e-06, "loss": 0.0269, "step": 2662 }, { "epoch": 1.2790585975024016, "grad_norm": 0.3823956992833706, "learning_rate": 7.092315233807331e-06, "loss": 0.025, "step": 2663 }, { "epoch": 1.2795389048991355, "grad_norm": 0.5345544781267894, "learning_rate": 7.089776835001246e-06, "loss": 0.0289, "step": 2664 }, { "epoch": 1.2800192122958693, "grad_norm": 0.3825773916625585, "learning_rate": 7.087237783406692e-06, "loss": 0.029, "step": 2665 }, { "epoch": 1.2804995196926032, "grad_norm": 0.44781616166215205, "learning_rate": 7.084698079816799e-06, "loss": 0.0341, "step": 2666 }, { "epoch": 1.280979827089337, "grad_norm": 0.2958516899298896, "learning_rate": 7.082157725024901e-06, "loss": 0.021, "step": 2667 }, { "epoch": 1.2814601344860712, "grad_norm": 0.4331191004571764, "learning_rate": 7.079616719824532e-06, "loss": 0.0309, "step": 2668 }, { "epoch": 1.281940441882805, "grad_norm": 0.30474017331831055, "learning_rate": 7.0770750650094335e-06, "loss": 0.0259, "step": 2669 }, { "epoch": 1.282420749279539, "grad_norm": 0.492995893466332, "learning_rate": 7.074532761373547e-06, "loss": 0.0364, "step": 2670 }, { "epoch": 1.2829010566762729, "grad_norm": 0.3660792544753867, "learning_rate": 7.071989809711018e-06, "loss": 0.0307, "step": 2671 }, { "epoch": 1.2833813640730067, "grad_norm": 0.4650647064353154, "learning_rate": 7.069446210816197e-06, "loss": 0.034, "step": 2672 }, { "epoch": 1.2838616714697406, "grad_norm": 0.41807658492437555, "learning_rate": 7.06690196548363e-06, "loss": 0.0301, "step": 2673 }, { "epoch": 1.2843419788664745, "grad_norm": 0.3729303339587027, "learning_rate": 7.06435707450807e-06, "loss": 0.0293, "step": 2674 }, { "epoch": 1.2848222862632084, "grad_norm": 0.35486056577139347, "learning_rate": 7.061811538684473e-06, "loss": 0.0321, "step": 2675 }, { "epoch": 1.2853025936599423, "grad_norm": 0.3703349449946021, "learning_rate": 7.059265358807991e-06, "loss": 0.0255, "step": 2676 }, { "epoch": 1.2857829010566761, "grad_norm": 0.39677551483943063, "learning_rate": 7.056718535673981e-06, "loss": 0.0259, "step": 2677 }, { "epoch": 1.2862632084534102, "grad_norm": 0.3535355251327798, "learning_rate": 7.054171070078001e-06, "loss": 0.0316, "step": 2678 }, { "epoch": 1.2867435158501441, "grad_norm": 0.38800205143788935, "learning_rate": 7.051622962815806e-06, "loss": 0.0394, "step": 2679 }, { "epoch": 1.287223823246878, "grad_norm": 0.30274050216770004, "learning_rate": 7.049074214683358e-06, "loss": 0.0232, "step": 2680 }, { "epoch": 1.2877041306436119, "grad_norm": 0.42804919646865397, "learning_rate": 7.046524826476815e-06, "loss": 0.023, "step": 2681 }, { "epoch": 1.2881844380403458, "grad_norm": 0.5487255136722479, "learning_rate": 7.043974798992532e-06, "loss": 0.041, "step": 2682 }, { "epoch": 1.2886647454370797, "grad_norm": 1.0302950600469856, "learning_rate": 7.041424133027068e-06, "loss": 0.0415, "step": 2683 }, { "epoch": 1.2891450528338138, "grad_norm": 0.391377865119486, "learning_rate": 7.038872829377184e-06, "loss": 0.0219, "step": 2684 }, { "epoch": 1.2896253602305476, "grad_norm": 0.30667703630184534, "learning_rate": 7.0363208888398326e-06, "loss": 0.0328, "step": 2685 }, { "epoch": 1.2901056676272815, "grad_norm": 0.6259423976707589, "learning_rate": 7.033768312212172e-06, "loss": 0.0366, "step": 2686 }, { "epoch": 1.2905859750240154, "grad_norm": 0.47912433163289414, "learning_rate": 7.031215100291555e-06, "loss": 0.029, "step": 2687 }, { "epoch": 1.2910662824207493, "grad_norm": 0.31622164841869743, "learning_rate": 7.028661253875536e-06, "loss": 0.0299, "step": 2688 }, { "epoch": 1.2915465898174832, "grad_norm": 0.5146729672271773, "learning_rate": 7.026106773761864e-06, "loss": 0.0383, "step": 2689 }, { "epoch": 1.292026897214217, "grad_norm": 0.4247612683767634, "learning_rate": 7.023551660748489e-06, "loss": 0.0384, "step": 2690 }, { "epoch": 1.292507204610951, "grad_norm": 0.9689986264029096, "learning_rate": 7.020995915633557e-06, "loss": 0.0402, "step": 2691 }, { "epoch": 1.2929875120076848, "grad_norm": 0.3301195989232945, "learning_rate": 7.018439539215414e-06, "loss": 0.0311, "step": 2692 }, { "epoch": 1.293467819404419, "grad_norm": 0.38927422608342704, "learning_rate": 7.015882532292598e-06, "loss": 0.0377, "step": 2693 }, { "epoch": 1.2939481268011528, "grad_norm": 0.7123704275157317, "learning_rate": 7.013324895663849e-06, "loss": 0.0416, "step": 2694 }, { "epoch": 1.2944284341978867, "grad_norm": 0.5449396712144299, "learning_rate": 7.010766630128103e-06, "loss": 0.034, "step": 2695 }, { "epoch": 1.2949087415946205, "grad_norm": 0.35782709207064894, "learning_rate": 7.00820773648449e-06, "loss": 0.0287, "step": 2696 }, { "epoch": 1.2953890489913544, "grad_norm": 0.26646069381293774, "learning_rate": 7.005648215532338e-06, "loss": 0.0207, "step": 2697 }, { "epoch": 1.2958693563880883, "grad_norm": 0.39726019552798875, "learning_rate": 7.003088068071169e-06, "loss": 0.0306, "step": 2698 }, { "epoch": 1.2963496637848224, "grad_norm": 0.30236840106709867, "learning_rate": 7.0005272949007055e-06, "loss": 0.0286, "step": 2699 }, { "epoch": 1.2968299711815563, "grad_norm": 0.4102625289249289, "learning_rate": 6.997965896820859e-06, "loss": 0.0317, "step": 2700 }, { "epoch": 1.2973102785782902, "grad_norm": 0.8682308646556751, "learning_rate": 6.995403874631741e-06, "loss": 0.0388, "step": 2701 }, { "epoch": 1.297790585975024, "grad_norm": 0.33110766929067287, "learning_rate": 6.9928412291336564e-06, "loss": 0.0311, "step": 2702 }, { "epoch": 1.298270893371758, "grad_norm": 0.3971918901087091, "learning_rate": 6.9902779611271056e-06, "loss": 0.0314, "step": 2703 }, { "epoch": 1.2987512007684918, "grad_norm": 0.5511979977427979, "learning_rate": 6.987714071412781e-06, "loss": 0.0478, "step": 2704 }, { "epoch": 1.2992315081652257, "grad_norm": 0.34445094091872547, "learning_rate": 6.98514956079157e-06, "loss": 0.0262, "step": 2705 }, { "epoch": 1.2997118155619596, "grad_norm": 0.5733894636381583, "learning_rate": 6.982584430064556e-06, "loss": 0.0505, "step": 2706 }, { "epoch": 1.3001921229586935, "grad_norm": 0.3780284278483849, "learning_rate": 6.980018680033016e-06, "loss": 0.0282, "step": 2707 }, { "epoch": 1.3006724303554273, "grad_norm": 0.3539323118976981, "learning_rate": 6.977452311498418e-06, "loss": 0.0327, "step": 2708 }, { "epoch": 1.3011527377521614, "grad_norm": 0.5313843754018146, "learning_rate": 6.974885325262425e-06, "loss": 0.0414, "step": 2709 }, { "epoch": 1.3016330451488953, "grad_norm": 0.9347771640621206, "learning_rate": 6.972317722126891e-06, "loss": 0.045, "step": 2710 }, { "epoch": 1.3021133525456292, "grad_norm": 0.2673954979606252, "learning_rate": 6.969749502893868e-06, "loss": 0.0264, "step": 2711 }, { "epoch": 1.302593659942363, "grad_norm": 0.3442011509085709, "learning_rate": 6.967180668365591e-06, "loss": 0.0414, "step": 2712 }, { "epoch": 1.303073967339097, "grad_norm": 0.5244270812369923, "learning_rate": 6.964611219344498e-06, "loss": 0.0352, "step": 2713 }, { "epoch": 1.3035542747358309, "grad_norm": 0.3491954232720679, "learning_rate": 6.9620411566332116e-06, "loss": 0.0248, "step": 2714 }, { "epoch": 1.304034582132565, "grad_norm": 0.45172236284953454, "learning_rate": 6.959470481034547e-06, "loss": 0.0413, "step": 2715 }, { "epoch": 1.3045148895292988, "grad_norm": 0.48763321987703145, "learning_rate": 6.956899193351514e-06, "loss": 0.0289, "step": 2716 }, { "epoch": 1.3049951969260327, "grad_norm": 0.7772462140194594, "learning_rate": 6.954327294387313e-06, "loss": 0.0328, "step": 2717 }, { "epoch": 1.3054755043227666, "grad_norm": 0.6498848629905595, "learning_rate": 6.9517547849453315e-06, "loss": 0.0367, "step": 2718 }, { "epoch": 1.3059558117195005, "grad_norm": 0.3726219048894061, "learning_rate": 6.94918166582915e-06, "loss": 0.0374, "step": 2719 }, { "epoch": 1.3064361191162344, "grad_norm": 0.3724177489811474, "learning_rate": 6.946607937842541e-06, "loss": 0.0394, "step": 2720 }, { "epoch": 1.3069164265129682, "grad_norm": 0.5084969399940996, "learning_rate": 6.944033601789467e-06, "loss": 0.0376, "step": 2721 }, { "epoch": 1.3073967339097021, "grad_norm": 0.2753632445522617, "learning_rate": 6.9414586584740785e-06, "loss": 0.0226, "step": 2722 }, { "epoch": 1.307877041306436, "grad_norm": 0.37538942747756915, "learning_rate": 6.938883108700715e-06, "loss": 0.0306, "step": 2723 }, { "epoch": 1.30835734870317, "grad_norm": 0.3704040086081312, "learning_rate": 6.936306953273908e-06, "loss": 0.0272, "step": 2724 }, { "epoch": 1.308837656099904, "grad_norm": 0.397406505497129, "learning_rate": 6.933730192998378e-06, "loss": 0.0281, "step": 2725 }, { "epoch": 1.3093179634966379, "grad_norm": 0.527497267218537, "learning_rate": 6.931152828679033e-06, "loss": 0.0431, "step": 2726 }, { "epoch": 1.3097982708933718, "grad_norm": 0.4501262061364447, "learning_rate": 6.92857486112097e-06, "loss": 0.0401, "step": 2727 }, { "epoch": 1.3102785782901056, "grad_norm": 0.3870323590718999, "learning_rate": 6.925996291129475e-06, "loss": 0.0283, "step": 2728 }, { "epoch": 1.3107588856868395, "grad_norm": 0.5449386005515444, "learning_rate": 6.923417119510023e-06, "loss": 0.0269, "step": 2729 }, { "epoch": 1.3112391930835736, "grad_norm": 0.32210822624799645, "learning_rate": 6.920837347068274e-06, "loss": 0.0283, "step": 2730 }, { "epoch": 1.3117195004803075, "grad_norm": 0.2890421689968733, "learning_rate": 6.918256974610079e-06, "loss": 0.0237, "step": 2731 }, { "epoch": 1.3121998078770414, "grad_norm": 0.4572604834085909, "learning_rate": 6.915676002941473e-06, "loss": 0.0465, "step": 2732 }, { "epoch": 1.3126801152737753, "grad_norm": 0.2987375957397587, "learning_rate": 6.913094432868683e-06, "loss": 0.0275, "step": 2733 }, { "epoch": 1.3131604226705091, "grad_norm": 0.4932797353307123, "learning_rate": 6.910512265198117e-06, "loss": 0.0436, "step": 2734 }, { "epoch": 1.313640730067243, "grad_norm": 0.38888636172443364, "learning_rate": 6.907929500736377e-06, "loss": 0.028, "step": 2735 }, { "epoch": 1.314121037463977, "grad_norm": 0.714041805963483, "learning_rate": 6.905346140290243e-06, "loss": 0.047, "step": 2736 }, { "epoch": 1.3146013448607108, "grad_norm": 0.38420017096511405, "learning_rate": 6.902762184666687e-06, "loss": 0.0248, "step": 2737 }, { "epoch": 1.3150816522574447, "grad_norm": 0.2877749002803969, "learning_rate": 6.900177634672863e-06, "loss": 0.0243, "step": 2738 }, { "epoch": 1.3155619596541785, "grad_norm": 0.6136271736461263, "learning_rate": 6.897592491116117e-06, "loss": 0.0422, "step": 2739 }, { "epoch": 1.3160422670509127, "grad_norm": 0.5628681994584901, "learning_rate": 6.8950067548039755e-06, "loss": 0.0513, "step": 2740 }, { "epoch": 1.3165225744476465, "grad_norm": 0.39348763344036025, "learning_rate": 6.892420426544149e-06, "loss": 0.0369, "step": 2741 }, { "epoch": 1.3170028818443804, "grad_norm": 0.8901763661309178, "learning_rate": 6.889833507144534e-06, "loss": 0.0366, "step": 2742 }, { "epoch": 1.3174831892411143, "grad_norm": 0.43361191589470294, "learning_rate": 6.887245997413215e-06, "loss": 0.0304, "step": 2743 }, { "epoch": 1.3179634966378482, "grad_norm": 0.37380705497932304, "learning_rate": 6.884657898158458e-06, "loss": 0.0352, "step": 2744 }, { "epoch": 1.318443804034582, "grad_norm": 0.4060314045621303, "learning_rate": 6.8820692101887135e-06, "loss": 0.0282, "step": 2745 }, { "epoch": 1.3189241114313162, "grad_norm": 0.4059680550379342, "learning_rate": 6.879479934312616e-06, "loss": 0.0325, "step": 2746 }, { "epoch": 1.31940441882805, "grad_norm": 0.3397237620983705, "learning_rate": 6.876890071338983e-06, "loss": 0.0316, "step": 2747 }, { "epoch": 1.319884726224784, "grad_norm": 0.4058765066978659, "learning_rate": 6.874299622076817e-06, "loss": 0.0383, "step": 2748 }, { "epoch": 1.3203650336215178, "grad_norm": 0.35084140294000193, "learning_rate": 6.871708587335302e-06, "loss": 0.0274, "step": 2749 }, { "epoch": 1.3208453410182517, "grad_norm": 0.4520667400139173, "learning_rate": 6.869116967923806e-06, "loss": 0.0289, "step": 2750 }, { "epoch": 1.3213256484149856, "grad_norm": 0.3285420300844678, "learning_rate": 6.866524764651878e-06, "loss": 0.0323, "step": 2751 }, { "epoch": 1.3218059558117194, "grad_norm": 0.45258555632756126, "learning_rate": 6.863931978329252e-06, "loss": 0.0337, "step": 2752 }, { "epoch": 1.3222862632084533, "grad_norm": 0.4401222451201768, "learning_rate": 6.861338609765842e-06, "loss": 0.0289, "step": 2753 }, { "epoch": 1.3227665706051872, "grad_norm": 0.3103211974531629, "learning_rate": 6.8587446597717465e-06, "loss": 0.0249, "step": 2754 }, { "epoch": 1.3232468780019213, "grad_norm": 0.37115498693801663, "learning_rate": 6.856150129157241e-06, "loss": 0.0291, "step": 2755 }, { "epoch": 1.3237271853986552, "grad_norm": 0.6594626629671632, "learning_rate": 6.853555018732785e-06, "loss": 0.0368, "step": 2756 }, { "epoch": 1.324207492795389, "grad_norm": 1.1961535814568451, "learning_rate": 6.850959329309022e-06, "loss": 0.0527, "step": 2757 }, { "epoch": 1.324687800192123, "grad_norm": 0.5053495802158154, "learning_rate": 6.848363061696772e-06, "loss": 0.045, "step": 2758 }, { "epoch": 1.3251681075888568, "grad_norm": 0.4548061426290774, "learning_rate": 6.845766216707037e-06, "loss": 0.0375, "step": 2759 }, { "epoch": 1.3256484149855907, "grad_norm": 0.3426512876131701, "learning_rate": 6.843168795150998e-06, "loss": 0.0219, "step": 2760 }, { "epoch": 1.3261287223823248, "grad_norm": 0.3837383135364955, "learning_rate": 6.84057079784002e-06, "loss": 0.0429, "step": 2761 }, { "epoch": 1.3266090297790587, "grad_norm": 0.3262241686852066, "learning_rate": 6.837972225585646e-06, "loss": 0.0319, "step": 2762 }, { "epoch": 1.3270893371757926, "grad_norm": 0.4116986274583845, "learning_rate": 6.8353730791995975e-06, "loss": 0.0352, "step": 2763 }, { "epoch": 1.3275696445725265, "grad_norm": 0.3766248127087878, "learning_rate": 6.832773359493774e-06, "loss": 0.0248, "step": 2764 }, { "epoch": 1.3280499519692603, "grad_norm": 0.31496961276018093, "learning_rate": 6.830173067280257e-06, "loss": 0.0228, "step": 2765 }, { "epoch": 1.3285302593659942, "grad_norm": 0.4111827615689983, "learning_rate": 6.827572203371307e-06, "loss": 0.026, "step": 2766 }, { "epoch": 1.329010566762728, "grad_norm": 0.6311320160100057, "learning_rate": 6.82497076857936e-06, "loss": 0.0769, "step": 2767 }, { "epoch": 1.329490874159462, "grad_norm": 0.5327162287648995, "learning_rate": 6.822368763717033e-06, "loss": 0.0395, "step": 2768 }, { "epoch": 1.3299711815561959, "grad_norm": 0.45783319879268114, "learning_rate": 6.819766189597122e-06, "loss": 0.0589, "step": 2769 }, { "epoch": 1.3304514889529298, "grad_norm": 0.36506792163583973, "learning_rate": 6.817163047032598e-06, "loss": 0.0414, "step": 2770 }, { "epoch": 1.3309317963496639, "grad_norm": 0.2750183833311742, "learning_rate": 6.814559336836608e-06, "loss": 0.0249, "step": 2771 }, { "epoch": 1.3314121037463977, "grad_norm": 0.32922963425279644, "learning_rate": 6.811955059822485e-06, "loss": 0.0286, "step": 2772 }, { "epoch": 1.3318924111431316, "grad_norm": 0.35278601908270185, "learning_rate": 6.809350216803729e-06, "loss": 0.0216, "step": 2773 }, { "epoch": 1.3323727185398655, "grad_norm": 0.45202856276454373, "learning_rate": 6.80674480859402e-06, "loss": 0.0312, "step": 2774 }, { "epoch": 1.3328530259365994, "grad_norm": 0.48112150595991565, "learning_rate": 6.804138836007217e-06, "loss": 0.0391, "step": 2775 }, { "epoch": 1.3333333333333333, "grad_norm": 0.31965729716272656, "learning_rate": 6.801532299857355e-06, "loss": 0.0322, "step": 2776 }, { "epoch": 1.3338136407300674, "grad_norm": 0.8541077833322179, "learning_rate": 6.798925200958642e-06, "loss": 0.0312, "step": 2777 }, { "epoch": 1.3342939481268012, "grad_norm": 0.4681945623810459, "learning_rate": 6.7963175401254635e-06, "loss": 0.0294, "step": 2778 }, { "epoch": 1.3347742555235351, "grad_norm": 0.4834259934318795, "learning_rate": 6.793709318172382e-06, "loss": 0.0315, "step": 2779 }, { "epoch": 1.335254562920269, "grad_norm": 0.5373080322117605, "learning_rate": 6.791100535914132e-06, "loss": 0.04, "step": 2780 }, { "epoch": 1.3357348703170029, "grad_norm": 0.4913096954537291, "learning_rate": 6.788491194165629e-06, "loss": 0.0284, "step": 2781 }, { "epoch": 1.3362151777137368, "grad_norm": 0.36133141921390804, "learning_rate": 6.785881293741953e-06, "loss": 0.0284, "step": 2782 }, { "epoch": 1.3366954851104706, "grad_norm": 0.41754321447982085, "learning_rate": 6.783270835458369e-06, "loss": 0.0453, "step": 2783 }, { "epoch": 1.3371757925072045, "grad_norm": 0.3632087642046544, "learning_rate": 6.780659820130311e-06, "loss": 0.0266, "step": 2784 }, { "epoch": 1.3376560999039384, "grad_norm": 0.42299171728530327, "learning_rate": 6.778048248573388e-06, "loss": 0.0249, "step": 2785 }, { "epoch": 1.3381364073006723, "grad_norm": 0.376034451620374, "learning_rate": 6.775436121603382e-06, "loss": 0.0336, "step": 2786 }, { "epoch": 1.3386167146974064, "grad_norm": 0.3519022805031083, "learning_rate": 6.772823440036249e-06, "loss": 0.0279, "step": 2787 }, { "epoch": 1.3390970220941403, "grad_norm": 0.42146095987592863, "learning_rate": 6.770210204688119e-06, "loss": 0.0418, "step": 2788 }, { "epoch": 1.3395773294908742, "grad_norm": 0.34808552419194894, "learning_rate": 6.767596416375295e-06, "loss": 0.0284, "step": 2789 }, { "epoch": 1.340057636887608, "grad_norm": 0.629145840593023, "learning_rate": 6.7649820759142525e-06, "loss": 0.0403, "step": 2790 }, { "epoch": 1.340537944284342, "grad_norm": 0.36889534904706384, "learning_rate": 6.7623671841216375e-06, "loss": 0.0186, "step": 2791 }, { "epoch": 1.341018251681076, "grad_norm": 0.3050558384183464, "learning_rate": 6.759751741814271e-06, "loss": 0.026, "step": 2792 }, { "epoch": 1.34149855907781, "grad_norm": 0.3266793305868794, "learning_rate": 6.7571357498091435e-06, "loss": 0.0258, "step": 2793 }, { "epoch": 1.3419788664745438, "grad_norm": 0.6775956881285454, "learning_rate": 6.754519208923421e-06, "loss": 0.0562, "step": 2794 }, { "epoch": 1.3424591738712777, "grad_norm": 0.4367921556336218, "learning_rate": 6.751902119974437e-06, "loss": 0.032, "step": 2795 }, { "epoch": 1.3429394812680115, "grad_norm": 0.401491380423153, "learning_rate": 6.749284483779698e-06, "loss": 0.0365, "step": 2796 }, { "epoch": 1.3434197886647454, "grad_norm": 0.3633995060787739, "learning_rate": 6.746666301156879e-06, "loss": 0.0311, "step": 2797 }, { "epoch": 1.3439000960614793, "grad_norm": 0.3694026306855599, "learning_rate": 6.7440475729238295e-06, "loss": 0.0341, "step": 2798 }, { "epoch": 1.3443804034582132, "grad_norm": 0.37149588507448256, "learning_rate": 6.7414282998985705e-06, "loss": 0.0428, "step": 2799 }, { "epoch": 1.344860710854947, "grad_norm": 0.36988005093609544, "learning_rate": 6.738808482899285e-06, "loss": 0.0322, "step": 2800 }, { "epoch": 1.345341018251681, "grad_norm": 0.40212086660690877, "learning_rate": 6.736188122744336e-06, "loss": 0.0346, "step": 2801 }, { "epoch": 1.345821325648415, "grad_norm": 0.396336477922158, "learning_rate": 6.733567220252248e-06, "loss": 0.0348, "step": 2802 }, { "epoch": 1.346301633045149, "grad_norm": 0.4060664298982189, "learning_rate": 6.7309457762417214e-06, "loss": 0.0351, "step": 2803 }, { "epoch": 1.3467819404418828, "grad_norm": 0.49053222348550485, "learning_rate": 6.7283237915316225e-06, "loss": 0.034, "step": 2804 }, { "epoch": 1.3472622478386167, "grad_norm": 0.5760451605722786, "learning_rate": 6.725701266940985e-06, "loss": 0.0293, "step": 2805 }, { "epoch": 1.3477425552353506, "grad_norm": 0.3330481855241032, "learning_rate": 6.723078203289013e-06, "loss": 0.0272, "step": 2806 }, { "epoch": 1.3482228626320845, "grad_norm": 0.43315711697748055, "learning_rate": 6.720454601395081e-06, "loss": 0.0394, "step": 2807 }, { "epoch": 1.3487031700288186, "grad_norm": 0.5108691984370773, "learning_rate": 6.7178304620787275e-06, "loss": 0.0288, "step": 2808 }, { "epoch": 1.3491834774255524, "grad_norm": 0.39131201150487754, "learning_rate": 6.715205786159662e-06, "loss": 0.0375, "step": 2809 }, { "epoch": 1.3496637848222863, "grad_norm": 0.26443698987720277, "learning_rate": 6.712580574457761e-06, "loss": 0.0244, "step": 2810 }, { "epoch": 1.3501440922190202, "grad_norm": 0.3573007720990793, "learning_rate": 6.7099548277930675e-06, "loss": 0.0339, "step": 2811 }, { "epoch": 1.350624399615754, "grad_norm": 0.35077746947991995, "learning_rate": 6.7073285469857915e-06, "loss": 0.0211, "step": 2812 }, { "epoch": 1.351104707012488, "grad_norm": 0.34974992418945977, "learning_rate": 6.704701732856313e-06, "loss": 0.0286, "step": 2813 }, { "epoch": 1.3515850144092219, "grad_norm": 0.352112441158966, "learning_rate": 6.702074386225175e-06, "loss": 0.0263, "step": 2814 }, { "epoch": 1.3520653218059557, "grad_norm": 0.3458254175413886, "learning_rate": 6.699446507913083e-06, "loss": 0.0338, "step": 2815 }, { "epoch": 1.3525456292026896, "grad_norm": 0.2953825898816924, "learning_rate": 6.696818098740921e-06, "loss": 0.0256, "step": 2816 }, { "epoch": 1.3530259365994235, "grad_norm": 0.3653080365612704, "learning_rate": 6.694189159529729e-06, "loss": 0.0321, "step": 2817 }, { "epoch": 1.3535062439961576, "grad_norm": 0.4774373963547389, "learning_rate": 6.691559691100712e-06, "loss": 0.0378, "step": 2818 }, { "epoch": 1.3539865513928915, "grad_norm": 0.4116369008596725, "learning_rate": 6.688929694275245e-06, "loss": 0.0429, "step": 2819 }, { "epoch": 1.3544668587896254, "grad_norm": 0.41537213554288466, "learning_rate": 6.686299169874866e-06, "loss": 0.0272, "step": 2820 }, { "epoch": 1.3549471661863592, "grad_norm": 0.39654696519716176, "learning_rate": 6.68366811872128e-06, "loss": 0.0329, "step": 2821 }, { "epoch": 1.3554274735830931, "grad_norm": 0.37354058817141583, "learning_rate": 6.681036541636352e-06, "loss": 0.0288, "step": 2822 }, { "epoch": 1.3559077809798272, "grad_norm": 0.37667870025095906, "learning_rate": 6.678404439442115e-06, "loss": 0.0278, "step": 2823 }, { "epoch": 1.356388088376561, "grad_norm": 0.4168191192902928, "learning_rate": 6.675771812960764e-06, "loss": 0.0267, "step": 2824 }, { "epoch": 1.356868395773295, "grad_norm": 0.6788529972807211, "learning_rate": 6.67313866301466e-06, "loss": 0.0474, "step": 2825 }, { "epoch": 1.3573487031700289, "grad_norm": 0.41472319799898266, "learning_rate": 6.6705049904263265e-06, "loss": 0.0388, "step": 2826 }, { "epoch": 1.3578290105667628, "grad_norm": 0.3669455639643798, "learning_rate": 6.667870796018447e-06, "loss": 0.0268, "step": 2827 }, { "epoch": 1.3583093179634966, "grad_norm": 0.43494990503675257, "learning_rate": 6.665236080613875e-06, "loss": 0.0254, "step": 2828 }, { "epoch": 1.3587896253602305, "grad_norm": 0.47924289386546226, "learning_rate": 6.66260084503562e-06, "loss": 0.0353, "step": 2829 }, { "epoch": 1.3592699327569644, "grad_norm": 0.43971366267803413, "learning_rate": 6.6599650901068566e-06, "loss": 0.0377, "step": 2830 }, { "epoch": 1.3597502401536983, "grad_norm": 0.3668322617069702, "learning_rate": 6.6573288166509255e-06, "loss": 0.0296, "step": 2831 }, { "epoch": 1.3602305475504322, "grad_norm": 0.4117679345852698, "learning_rate": 6.654692025491321e-06, "loss": 0.0323, "step": 2832 }, { "epoch": 1.3607108549471663, "grad_norm": 0.3457791857965604, "learning_rate": 6.652054717451703e-06, "loss": 0.0242, "step": 2833 }, { "epoch": 1.3611911623439001, "grad_norm": 0.49879686059491674, "learning_rate": 6.649416893355897e-06, "loss": 0.0375, "step": 2834 }, { "epoch": 1.361671469740634, "grad_norm": 0.48587995314563753, "learning_rate": 6.646778554027888e-06, "loss": 0.032, "step": 2835 }, { "epoch": 1.362151777137368, "grad_norm": 0.4196048419511372, "learning_rate": 6.6441397002918165e-06, "loss": 0.0295, "step": 2836 }, { "epoch": 1.3626320845341018, "grad_norm": 0.313795915054782, "learning_rate": 6.641500332971986e-06, "loss": 0.0355, "step": 2837 }, { "epoch": 1.3631123919308357, "grad_norm": 0.33703315808551054, "learning_rate": 6.638860452892866e-06, "loss": 0.0257, "step": 2838 }, { "epoch": 1.3635926993275698, "grad_norm": 0.37449846418231164, "learning_rate": 6.636220060879079e-06, "loss": 0.0265, "step": 2839 }, { "epoch": 1.3640730067243036, "grad_norm": 0.33584448318600435, "learning_rate": 6.633579157755413e-06, "loss": 0.0322, "step": 2840 }, { "epoch": 1.3645533141210375, "grad_norm": 0.40061776481215994, "learning_rate": 6.63093774434681e-06, "loss": 0.0302, "step": 2841 }, { "epoch": 1.3650336215177714, "grad_norm": 0.3493651392119964, "learning_rate": 6.628295821478377e-06, "loss": 0.0343, "step": 2842 }, { "epoch": 1.3655139289145053, "grad_norm": 0.3302785719812673, "learning_rate": 6.625653389975377e-06, "loss": 0.0255, "step": 2843 }, { "epoch": 1.3659942363112392, "grad_norm": 0.3459388727593669, "learning_rate": 6.623010450663232e-06, "loss": 0.0293, "step": 2844 }, { "epoch": 1.366474543707973, "grad_norm": 0.48051192066271675, "learning_rate": 6.620367004367523e-06, "loss": 0.0245, "step": 2845 }, { "epoch": 1.366954851104707, "grad_norm": 0.6283674920727298, "learning_rate": 6.61772305191399e-06, "loss": 0.0379, "step": 2846 }, { "epoch": 1.3674351585014408, "grad_norm": 0.3504720056887704, "learning_rate": 6.615078594128531e-06, "loss": 0.0254, "step": 2847 }, { "epoch": 1.3679154658981747, "grad_norm": 0.32015716766346763, "learning_rate": 6.6124336318372e-06, "loss": 0.0291, "step": 2848 }, { "epoch": 1.3683957732949088, "grad_norm": 0.34417175386829446, "learning_rate": 6.609788165866212e-06, "loss": 0.0327, "step": 2849 }, { "epoch": 1.3688760806916427, "grad_norm": 0.3902517610342385, "learning_rate": 6.607142197041936e-06, "loss": 0.0284, "step": 2850 }, { "epoch": 1.3693563880883766, "grad_norm": 0.3988118246687258, "learning_rate": 6.604495726190899e-06, "loss": 0.0318, "step": 2851 }, { "epoch": 1.3698366954851104, "grad_norm": 0.39362589128706227, "learning_rate": 6.6018487541397855e-06, "loss": 0.0237, "step": 2852 }, { "epoch": 1.3703170028818443, "grad_norm": 0.37959072523751136, "learning_rate": 6.599201281715438e-06, "loss": 0.0376, "step": 2853 }, { "epoch": 1.3707973102785782, "grad_norm": 0.4771191328123765, "learning_rate": 6.596553309744854e-06, "loss": 0.0262, "step": 2854 }, { "epoch": 1.3712776176753123, "grad_norm": 0.26893975483688526, "learning_rate": 6.593904839055183e-06, "loss": 0.0197, "step": 2855 }, { "epoch": 1.3717579250720462, "grad_norm": 0.48912516198755485, "learning_rate": 6.591255870473736e-06, "loss": 0.0291, "step": 2856 }, { "epoch": 1.37223823246878, "grad_norm": 0.4511329607032363, "learning_rate": 6.58860640482798e-06, "loss": 0.0255, "step": 2857 }, { "epoch": 1.372718539865514, "grad_norm": 1.3461610887013964, "learning_rate": 6.585956442945531e-06, "loss": 0.0319, "step": 2858 }, { "epoch": 1.3731988472622478, "grad_norm": 0.5793771474653331, "learning_rate": 6.5833059856541645e-06, "loss": 0.0339, "step": 2859 }, { "epoch": 1.3736791546589817, "grad_norm": 0.5452786701856912, "learning_rate": 6.580655033781812e-06, "loss": 0.0219, "step": 2860 }, { "epoch": 1.3741594620557156, "grad_norm": 0.36625761445015026, "learning_rate": 6.5780035881565565e-06, "loss": 0.0288, "step": 2861 }, { "epoch": 1.3746397694524495, "grad_norm": 0.5179417248110216, "learning_rate": 6.575351649606635e-06, "loss": 0.0342, "step": 2862 }, { "epoch": 1.3751200768491834, "grad_norm": 0.347964495107041, "learning_rate": 6.572699218960442e-06, "loss": 0.0204, "step": 2863 }, { "epoch": 1.3756003842459175, "grad_norm": 0.4776612809871579, "learning_rate": 6.570046297046522e-06, "loss": 0.0465, "step": 2864 }, { "epoch": 1.3760806916426513, "grad_norm": 0.9094552747925242, "learning_rate": 6.5673928846935735e-06, "loss": 0.0576, "step": 2865 }, { "epoch": 1.3765609990393852, "grad_norm": 0.47115816034604685, "learning_rate": 6.56473898273045e-06, "loss": 0.0328, "step": 2866 }, { "epoch": 1.377041306436119, "grad_norm": 0.2913575270811946, "learning_rate": 6.56208459198616e-06, "loss": 0.0229, "step": 2867 }, { "epoch": 1.377521613832853, "grad_norm": 0.3931619478504385, "learning_rate": 6.559429713289858e-06, "loss": 0.0378, "step": 2868 }, { "epoch": 1.3780019212295869, "grad_norm": 0.42493594944264784, "learning_rate": 6.5567743474708545e-06, "loss": 0.0268, "step": 2869 }, { "epoch": 1.378482228626321, "grad_norm": 0.3372461252778451, "learning_rate": 6.554118495358614e-06, "loss": 0.0217, "step": 2870 }, { "epoch": 1.3789625360230549, "grad_norm": 0.45764865965556617, "learning_rate": 6.5514621577827505e-06, "loss": 0.0288, "step": 2871 }, { "epoch": 1.3794428434197887, "grad_norm": 0.3566013922552597, "learning_rate": 6.548805335573032e-06, "loss": 0.0326, "step": 2872 }, { "epoch": 1.3799231508165226, "grad_norm": 0.395236001720611, "learning_rate": 6.546148029559374e-06, "loss": 0.0344, "step": 2873 }, { "epoch": 1.3804034582132565, "grad_norm": 0.3732870494982421, "learning_rate": 6.543490240571846e-06, "loss": 0.0293, "step": 2874 }, { "epoch": 1.3808837656099904, "grad_norm": 0.4625951165139278, "learning_rate": 6.5408319694406676e-06, "loss": 0.0258, "step": 2875 }, { "epoch": 1.3813640730067243, "grad_norm": 0.4207463495527933, "learning_rate": 6.538173216996212e-06, "loss": 0.0324, "step": 2876 }, { "epoch": 1.3818443804034581, "grad_norm": 0.3574725502724528, "learning_rate": 6.535513984068995e-06, "loss": 0.0348, "step": 2877 }, { "epoch": 1.382324687800192, "grad_norm": 0.36846140869670807, "learning_rate": 6.5328542714896905e-06, "loss": 0.0275, "step": 2878 }, { "epoch": 1.382804995196926, "grad_norm": 0.4077308098465471, "learning_rate": 6.530194080089118e-06, "loss": 0.0268, "step": 2879 }, { "epoch": 1.38328530259366, "grad_norm": 0.4234424762481803, "learning_rate": 6.52753341069825e-06, "loss": 0.0369, "step": 2880 }, { "epoch": 1.3837656099903939, "grad_norm": 0.509467223688595, "learning_rate": 6.524872264148203e-06, "loss": 0.0341, "step": 2881 }, { "epoch": 1.3842459173871278, "grad_norm": 0.7934683498250322, "learning_rate": 6.522210641270247e-06, "loss": 0.0432, "step": 2882 }, { "epoch": 1.3847262247838616, "grad_norm": 0.42156812772555435, "learning_rate": 6.519548542895799e-06, "loss": 0.0404, "step": 2883 }, { "epoch": 1.3852065321805955, "grad_norm": 0.4402007021437428, "learning_rate": 6.516885969856425e-06, "loss": 0.0339, "step": 2884 }, { "epoch": 1.3856868395773294, "grad_norm": 0.4018768656932937, "learning_rate": 6.514222922983839e-06, "loss": 0.0414, "step": 2885 }, { "epoch": 1.3861671469740635, "grad_norm": 0.35734072467699723, "learning_rate": 6.511559403109905e-06, "loss": 0.0293, "step": 2886 }, { "epoch": 1.3866474543707974, "grad_norm": 0.3599397209375232, "learning_rate": 6.508895411066629e-06, "loss": 0.0307, "step": 2887 }, { "epoch": 1.3871277617675313, "grad_norm": 0.32718764737162565, "learning_rate": 6.5062309476861714e-06, "loss": 0.0295, "step": 2888 }, { "epoch": 1.3876080691642652, "grad_norm": 0.5855293815929146, "learning_rate": 6.503566013800836e-06, "loss": 0.0331, "step": 2889 }, { "epoch": 1.388088376560999, "grad_norm": 0.44521223785465636, "learning_rate": 6.5009006102430775e-06, "loss": 0.0301, "step": 2890 }, { "epoch": 1.388568683957733, "grad_norm": 0.35035914491200976, "learning_rate": 6.498234737845488e-06, "loss": 0.0331, "step": 2891 }, { "epoch": 1.3890489913544668, "grad_norm": 0.5233262059935054, "learning_rate": 6.495568397440814e-06, "loss": 0.0415, "step": 2892 }, { "epoch": 1.3895292987512007, "grad_norm": 0.3606549691749029, "learning_rate": 6.492901589861952e-06, "loss": 0.0302, "step": 2893 }, { "epoch": 1.3900096061479346, "grad_norm": 0.46468655504534323, "learning_rate": 6.490234315941933e-06, "loss": 0.0304, "step": 2894 }, { "epoch": 1.3904899135446687, "grad_norm": 0.41507899900863676, "learning_rate": 6.4875665765139426e-06, "loss": 0.0204, "step": 2895 }, { "epoch": 1.3909702209414025, "grad_norm": 0.5036561901827242, "learning_rate": 6.4848983724113065e-06, "loss": 0.0475, "step": 2896 }, { "epoch": 1.3914505283381364, "grad_norm": 0.5410504613226156, "learning_rate": 6.4822297044675e-06, "loss": 0.0487, "step": 2897 }, { "epoch": 1.3919308357348703, "grad_norm": 0.6016337718232381, "learning_rate": 6.479560573516139e-06, "loss": 0.0232, "step": 2898 }, { "epoch": 1.3924111431316042, "grad_norm": 0.3417058070494167, "learning_rate": 6.476890980390988e-06, "loss": 0.0227, "step": 2899 }, { "epoch": 1.392891450528338, "grad_norm": 0.30928858757254957, "learning_rate": 6.474220925925955e-06, "loss": 0.0243, "step": 2900 }, { "epoch": 1.3933717579250722, "grad_norm": 0.3360126913931893, "learning_rate": 6.4715504109550876e-06, "loss": 0.0258, "step": 2901 }, { "epoch": 1.393852065321806, "grad_norm": 0.8285417270470168, "learning_rate": 6.468879436312584e-06, "loss": 0.051, "step": 2902 }, { "epoch": 1.39433237271854, "grad_norm": 0.32910421012934066, "learning_rate": 6.466208002832781e-06, "loss": 0.0258, "step": 2903 }, { "epoch": 1.3948126801152738, "grad_norm": 0.3462778193127048, "learning_rate": 6.46353611135016e-06, "loss": 0.0349, "step": 2904 }, { "epoch": 1.3952929875120077, "grad_norm": 0.34838771020348513, "learning_rate": 6.460863762699348e-06, "loss": 0.0312, "step": 2905 }, { "epoch": 1.3957732949087416, "grad_norm": 0.4536225235106763, "learning_rate": 6.458190957715112e-06, "loss": 0.0382, "step": 2906 }, { "epoch": 1.3962536023054755, "grad_norm": 0.5020777398406953, "learning_rate": 6.455517697232362e-06, "loss": 0.0452, "step": 2907 }, { "epoch": 1.3967339097022093, "grad_norm": 1.5909208917937527, "learning_rate": 6.4528439820861525e-06, "loss": 0.032, "step": 2908 }, { "epoch": 1.3972142170989432, "grad_norm": 0.33642467943800947, "learning_rate": 6.450169813111674e-06, "loss": 0.03, "step": 2909 }, { "epoch": 1.397694524495677, "grad_norm": 0.4681643928653143, "learning_rate": 6.4474951911442655e-06, "loss": 0.0285, "step": 2910 }, { "epoch": 1.3981748318924112, "grad_norm": 0.2967729001106742, "learning_rate": 6.444820117019404e-06, "loss": 0.0256, "step": 2911 }, { "epoch": 1.398655139289145, "grad_norm": 0.527471116532171, "learning_rate": 6.442144591572712e-06, "loss": 0.0472, "step": 2912 }, { "epoch": 1.399135446685879, "grad_norm": 0.42717906793239, "learning_rate": 6.439468615639947e-06, "loss": 0.0362, "step": 2913 }, { "epoch": 1.3996157540826129, "grad_norm": 0.3372590340985268, "learning_rate": 6.436792190057006e-06, "loss": 0.0319, "step": 2914 }, { "epoch": 1.4000960614793467, "grad_norm": 0.3115422325820803, "learning_rate": 6.4341153156599375e-06, "loss": 0.0288, "step": 2915 }, { "epoch": 1.4005763688760806, "grad_norm": 0.46724069702463444, "learning_rate": 6.431437993284919e-06, "loss": 0.0338, "step": 2916 }, { "epoch": 1.4010566762728147, "grad_norm": 0.3497123018744295, "learning_rate": 6.428760223768274e-06, "loss": 0.024, "step": 2917 }, { "epoch": 1.4015369836695486, "grad_norm": 0.3015682750870508, "learning_rate": 6.426082007946459e-06, "loss": 0.0269, "step": 2918 }, { "epoch": 1.4020172910662825, "grad_norm": 0.5523743403274974, "learning_rate": 6.42340334665608e-06, "loss": 0.0351, "step": 2919 }, { "epoch": 1.4024975984630164, "grad_norm": 0.45789035449846094, "learning_rate": 6.4207242407338745e-06, "loss": 0.0305, "step": 2920 }, { "epoch": 1.4029779058597502, "grad_norm": 0.2537912784307932, "learning_rate": 6.41804469101672e-06, "loss": 0.0203, "step": 2921 }, { "epoch": 1.4034582132564841, "grad_norm": 0.38616289882859195, "learning_rate": 6.415364698341635e-06, "loss": 0.0282, "step": 2922 }, { "epoch": 1.403938520653218, "grad_norm": 0.31610433396844906, "learning_rate": 6.412684263545775e-06, "loss": 0.0228, "step": 2923 }, { "epoch": 1.4044188280499519, "grad_norm": 0.3247495467798641, "learning_rate": 6.410003387466433e-06, "loss": 0.0282, "step": 2924 }, { "epoch": 1.4048991354466858, "grad_norm": 0.24504590883418131, "learning_rate": 6.407322070941041e-06, "loss": 0.0229, "step": 2925 }, { "epoch": 1.4053794428434199, "grad_norm": 0.5342071248399537, "learning_rate": 6.40464031480717e-06, "loss": 0.0417, "step": 2926 }, { "epoch": 1.4058597502401537, "grad_norm": 0.7615210138893405, "learning_rate": 6.401958119902523e-06, "loss": 0.0292, "step": 2927 }, { "epoch": 1.4063400576368876, "grad_norm": 0.5974938729153428, "learning_rate": 6.399275487064944e-06, "loss": 0.0278, "step": 2928 }, { "epoch": 1.4068203650336215, "grad_norm": 0.26646951389626067, "learning_rate": 6.396592417132416e-06, "loss": 0.0229, "step": 2929 }, { "epoch": 1.4073006724303554, "grad_norm": 0.9102507381915027, "learning_rate": 6.393908910943053e-06, "loss": 0.0354, "step": 2930 }, { "epoch": 1.4077809798270893, "grad_norm": 0.3717109837566094, "learning_rate": 6.39122496933511e-06, "loss": 0.0292, "step": 2931 }, { "epoch": 1.4082612872238234, "grad_norm": 0.36040445734015164, "learning_rate": 6.388540593146975e-06, "loss": 0.0268, "step": 2932 }, { "epoch": 1.4087415946205573, "grad_norm": 0.31355792703545343, "learning_rate": 6.385855783217172e-06, "loss": 0.0232, "step": 2933 }, { "epoch": 1.4092219020172911, "grad_norm": 0.3303067953345841, "learning_rate": 6.3831705403843645e-06, "loss": 0.0324, "step": 2934 }, { "epoch": 1.409702209414025, "grad_norm": 0.3354034739588526, "learning_rate": 6.380484865487346e-06, "loss": 0.0257, "step": 2935 }, { "epoch": 1.410182516810759, "grad_norm": 0.39771626926553977, "learning_rate": 6.3777987593650445e-06, "loss": 0.025, "step": 2936 }, { "epoch": 1.4106628242074928, "grad_norm": 0.30673840123286783, "learning_rate": 6.37511222285653e-06, "loss": 0.0252, "step": 2937 }, { "epoch": 1.4111431316042267, "grad_norm": 0.2791698245841723, "learning_rate": 6.372425256801e-06, "loss": 0.0192, "step": 2938 }, { "epoch": 1.4116234390009605, "grad_norm": 0.3531801271767405, "learning_rate": 6.369737862037788e-06, "loss": 0.0368, "step": 2939 }, { "epoch": 1.4121037463976944, "grad_norm": 0.38523103421480276, "learning_rate": 6.367050039406362e-06, "loss": 0.0257, "step": 2940 }, { "epoch": 1.4125840537944283, "grad_norm": 0.36554461316788134, "learning_rate": 6.364361789746324e-06, "loss": 0.0329, "step": 2941 }, { "epoch": 1.4130643611911624, "grad_norm": 0.4708313953314256, "learning_rate": 6.361673113897408e-06, "loss": 0.0345, "step": 2942 }, { "epoch": 1.4135446685878963, "grad_norm": 0.3047662104102519, "learning_rate": 6.358984012699482e-06, "loss": 0.0197, "step": 2943 }, { "epoch": 1.4140249759846302, "grad_norm": 0.3753701768200154, "learning_rate": 6.356294486992548e-06, "loss": 0.02, "step": 2944 }, { "epoch": 1.414505283381364, "grad_norm": 0.48911171283617344, "learning_rate": 6.3536045376167385e-06, "loss": 0.0468, "step": 2945 }, { "epoch": 1.414985590778098, "grad_norm": 0.5106005836276446, "learning_rate": 6.35091416541232e-06, "loss": 0.0494, "step": 2946 }, { "epoch": 1.4154658981748318, "grad_norm": 0.4440966197161187, "learning_rate": 6.348223371219687e-06, "loss": 0.0368, "step": 2947 }, { "epoch": 1.415946205571566, "grad_norm": 0.3811182784440654, "learning_rate": 6.345532155879375e-06, "loss": 0.0334, "step": 2948 }, { "epoch": 1.4164265129682998, "grad_norm": 0.3675829015997376, "learning_rate": 6.3428405202320405e-06, "loss": 0.0331, "step": 2949 }, { "epoch": 1.4169068203650337, "grad_norm": 0.34166427711227043, "learning_rate": 6.340148465118478e-06, "loss": 0.0324, "step": 2950 }, { "epoch": 1.4173871277617676, "grad_norm": 0.46135243897527584, "learning_rate": 6.3374559913796105e-06, "loss": 0.0384, "step": 2951 }, { "epoch": 1.4178674351585014, "grad_norm": 0.29521768914330004, "learning_rate": 6.3347630998564934e-06, "loss": 0.0235, "step": 2952 }, { "epoch": 1.4183477425552353, "grad_norm": 0.39302794049200424, "learning_rate": 6.3320697913903095e-06, "loss": 0.023, "step": 2953 }, { "epoch": 1.4188280499519692, "grad_norm": 1.5361238353872235, "learning_rate": 6.329376066822376e-06, "loss": 0.0361, "step": 2954 }, { "epoch": 1.419308357348703, "grad_norm": 0.3888615491396321, "learning_rate": 6.326681926994136e-06, "loss": 0.0276, "step": 2955 }, { "epoch": 1.419788664745437, "grad_norm": 0.4151128449421023, "learning_rate": 6.323987372747165e-06, "loss": 0.03, "step": 2956 }, { "epoch": 1.420268972142171, "grad_norm": 0.34650741050649014, "learning_rate": 6.3212924049231676e-06, "loss": 0.0291, "step": 2957 }, { "epoch": 1.420749279538905, "grad_norm": 0.32248331603354646, "learning_rate": 6.318597024363977e-06, "loss": 0.0327, "step": 2958 }, { "epoch": 1.4212295869356388, "grad_norm": 0.30983079906052546, "learning_rate": 6.3159012319115556e-06, "loss": 0.0276, "step": 2959 }, { "epoch": 1.4217098943323727, "grad_norm": 0.2979714762514332, "learning_rate": 6.313205028407994e-06, "loss": 0.0276, "step": 2960 }, { "epoch": 1.4221902017291066, "grad_norm": 0.7239555546434696, "learning_rate": 6.310508414695511e-06, "loss": 0.0361, "step": 2961 }, { "epoch": 1.4226705091258405, "grad_norm": 0.6571687120250687, "learning_rate": 6.3078113916164565e-06, "loss": 0.0381, "step": 2962 }, { "epoch": 1.4231508165225746, "grad_norm": 0.30605782587622166, "learning_rate": 6.305113960013304e-06, "loss": 0.0304, "step": 2963 }, { "epoch": 1.4236311239193085, "grad_norm": 0.3499050417815181, "learning_rate": 6.302416120728656e-06, "loss": 0.0329, "step": 2964 }, { "epoch": 1.4241114313160423, "grad_norm": 0.5401929175398352, "learning_rate": 6.2997178746052426e-06, "loss": 0.027, "step": 2965 }, { "epoch": 1.4245917387127762, "grad_norm": 0.33765365973560296, "learning_rate": 6.297019222485924e-06, "loss": 0.0295, "step": 2966 }, { "epoch": 1.42507204610951, "grad_norm": 0.42409935486541994, "learning_rate": 6.294320165213683e-06, "loss": 0.0276, "step": 2967 }, { "epoch": 1.425552353506244, "grad_norm": 0.7348639589287866, "learning_rate": 6.29162070363163e-06, "loss": 0.0359, "step": 2968 }, { "epoch": 1.4260326609029779, "grad_norm": 0.6238323287513995, "learning_rate": 6.288920838583001e-06, "loss": 0.0513, "step": 2969 }, { "epoch": 1.4265129682997117, "grad_norm": 0.5420305091159022, "learning_rate": 6.286220570911161e-06, "loss": 0.0443, "step": 2970 }, { "epoch": 1.4269932756964456, "grad_norm": 0.926149974733824, "learning_rate": 6.2835199014596e-06, "loss": 0.0293, "step": 2971 }, { "epoch": 1.4274735830931795, "grad_norm": 0.6350357797166392, "learning_rate": 6.2808188310719295e-06, "loss": 0.0287, "step": 2972 }, { "epoch": 1.4279538904899136, "grad_norm": 0.372724963778092, "learning_rate": 6.278117360591891e-06, "loss": 0.0347, "step": 2973 }, { "epoch": 1.4284341978866475, "grad_norm": 0.4789179496462536, "learning_rate": 6.275415490863348e-06, "loss": 0.031, "step": 2974 }, { "epoch": 1.4289145052833814, "grad_norm": 1.2531965210879925, "learning_rate": 6.2727132227302915e-06, "loss": 0.0337, "step": 2975 }, { "epoch": 1.4293948126801153, "grad_norm": 0.3292578921871811, "learning_rate": 6.270010557036832e-06, "loss": 0.029, "step": 2976 }, { "epoch": 1.4298751200768491, "grad_norm": 0.43720009113080216, "learning_rate": 6.267307494627211e-06, "loss": 0.0294, "step": 2977 }, { "epoch": 1.430355427473583, "grad_norm": 0.3676666206447526, "learning_rate": 6.26460403634579e-06, "loss": 0.0265, "step": 2978 }, { "epoch": 1.4308357348703171, "grad_norm": 0.3508307063280374, "learning_rate": 6.261900183037053e-06, "loss": 0.0269, "step": 2979 }, { "epoch": 1.431316042267051, "grad_norm": 0.354909041996203, "learning_rate": 6.25919593554561e-06, "loss": 0.0213, "step": 2980 }, { "epoch": 1.4317963496637849, "grad_norm": 0.33531364657209284, "learning_rate": 6.256491294716193e-06, "loss": 0.0302, "step": 2981 }, { "epoch": 1.4322766570605188, "grad_norm": 0.3395467177642713, "learning_rate": 6.253786261393656e-06, "loss": 0.035, "step": 2982 }, { "epoch": 1.4327569644572526, "grad_norm": 0.384011884538928, "learning_rate": 6.251080836422977e-06, "loss": 0.0261, "step": 2983 }, { "epoch": 1.4332372718539865, "grad_norm": 0.3751516340713227, "learning_rate": 6.248375020649257e-06, "loss": 0.0277, "step": 2984 }, { "epoch": 1.4337175792507204, "grad_norm": 0.3365372086103453, "learning_rate": 6.2456688149177195e-06, "loss": 0.0327, "step": 2985 }, { "epoch": 1.4341978866474543, "grad_norm": 0.47156930968999555, "learning_rate": 6.242962220073703e-06, "loss": 0.0278, "step": 2986 }, { "epoch": 1.4346781940441882, "grad_norm": 0.29779505645207816, "learning_rate": 6.240255236962676e-06, "loss": 0.0356, "step": 2987 }, { "epoch": 1.435158501440922, "grad_norm": 0.45599850709170564, "learning_rate": 6.237547866430226e-06, "loss": 0.0312, "step": 2988 }, { "epoch": 1.4356388088376562, "grad_norm": 0.3524024814551884, "learning_rate": 6.23484010932206e-06, "loss": 0.0326, "step": 2989 }, { "epoch": 1.43611911623439, "grad_norm": 0.4696623109454822, "learning_rate": 6.232131966484007e-06, "loss": 0.0421, "step": 2990 }, { "epoch": 1.436599423631124, "grad_norm": 0.5001949365509526, "learning_rate": 6.2294234387620126e-06, "loss": 0.0254, "step": 2991 }, { "epoch": 1.4370797310278578, "grad_norm": 0.40870271898970234, "learning_rate": 6.226714527002149e-06, "loss": 0.0326, "step": 2992 }, { "epoch": 1.4375600384245917, "grad_norm": 0.3661778583058412, "learning_rate": 6.224005232050605e-06, "loss": 0.0288, "step": 2993 }, { "epoch": 1.4380403458213258, "grad_norm": 0.412020484956316, "learning_rate": 6.2212955547536906e-06, "loss": 0.032, "step": 2994 }, { "epoch": 1.4385206532180597, "grad_norm": 0.322044696743901, "learning_rate": 6.2185854959578304e-06, "loss": 0.03, "step": 2995 }, { "epoch": 1.4390009606147935, "grad_norm": 0.48115540280794433, "learning_rate": 6.215875056509575e-06, "loss": 0.0563, "step": 2996 }, { "epoch": 1.4394812680115274, "grad_norm": 0.4381014080043817, "learning_rate": 6.213164237255589e-06, "loss": 0.0371, "step": 2997 }, { "epoch": 1.4399615754082613, "grad_norm": 0.8968958426228555, "learning_rate": 6.210453039042658e-06, "loss": 0.03, "step": 2998 }, { "epoch": 1.4404418828049952, "grad_norm": 0.4855019284502454, "learning_rate": 6.207741462717685e-06, "loss": 0.0452, "step": 2999 }, { "epoch": 1.440922190201729, "grad_norm": 0.5000373486533403, "learning_rate": 6.205029509127691e-06, "loss": 0.0332, "step": 3000 }, { "epoch": 1.441402497598463, "grad_norm": 0.6462845411057925, "learning_rate": 6.202317179119817e-06, "loss": 0.0531, "step": 3001 }, { "epoch": 1.4418828049951968, "grad_norm": 0.8025420596144532, "learning_rate": 6.199604473541317e-06, "loss": 0.0297, "step": 3002 }, { "epoch": 1.4423631123919307, "grad_norm": 0.30643678884971803, "learning_rate": 6.19689139323957e-06, "loss": 0.0197, "step": 3003 }, { "epoch": 1.4428434197886648, "grad_norm": 0.45444229877040854, "learning_rate": 6.1941779390620626e-06, "loss": 0.0245, "step": 3004 }, { "epoch": 1.4433237271853987, "grad_norm": 0.525526396443818, "learning_rate": 6.1914641118564035e-06, "loss": 0.0291, "step": 3005 }, { "epoch": 1.4438040345821326, "grad_norm": 0.5884015313184124, "learning_rate": 6.188749912470319e-06, "loss": 0.0312, "step": 3006 }, { "epoch": 1.4442843419788665, "grad_norm": 0.5736818972866619, "learning_rate": 6.1860353417516485e-06, "loss": 0.0623, "step": 3007 }, { "epoch": 1.4447646493756003, "grad_norm": 0.5420221157760347, "learning_rate": 6.183320400548351e-06, "loss": 0.0439, "step": 3008 }, { "epoch": 1.4452449567723342, "grad_norm": 0.4708430457674903, "learning_rate": 6.1806050897084955e-06, "loss": 0.0442, "step": 3009 }, { "epoch": 1.4457252641690683, "grad_norm": 0.39871175076223225, "learning_rate": 6.1778894100802726e-06, "loss": 0.0305, "step": 3010 }, { "epoch": 1.4462055715658022, "grad_norm": 0.3523576955717442, "learning_rate": 6.1751733625119835e-06, "loss": 0.0294, "step": 3011 }, { "epoch": 1.446685878962536, "grad_norm": 0.4280219910606775, "learning_rate": 6.1724569478520495e-06, "loss": 0.0267, "step": 3012 }, { "epoch": 1.44716618635927, "grad_norm": 0.36012647187771496, "learning_rate": 6.169740166949001e-06, "loss": 0.0295, "step": 3013 }, { "epoch": 1.4476464937560038, "grad_norm": 0.6915601622654116, "learning_rate": 6.167023020651485e-06, "loss": 0.0407, "step": 3014 }, { "epoch": 1.4481268011527377, "grad_norm": 0.3309149473723583, "learning_rate": 6.1643055098082625e-06, "loss": 0.0254, "step": 3015 }, { "epoch": 1.4486071085494716, "grad_norm": 0.5274129551379748, "learning_rate": 6.161587635268211e-06, "loss": 0.0479, "step": 3016 }, { "epoch": 1.4490874159462055, "grad_norm": 0.33762885784681956, "learning_rate": 6.158869397880318e-06, "loss": 0.028, "step": 3017 }, { "epoch": 1.4495677233429394, "grad_norm": 0.3369873513634773, "learning_rate": 6.156150798493686e-06, "loss": 0.03, "step": 3018 }, { "epoch": 1.4500480307396733, "grad_norm": 0.3653889467911949, "learning_rate": 6.153431837957529e-06, "loss": 0.023, "step": 3019 }, { "epoch": 1.4505283381364074, "grad_norm": 0.351970949912574, "learning_rate": 6.150712517121175e-06, "loss": 0.0244, "step": 3020 }, { "epoch": 1.4510086455331412, "grad_norm": 1.387889450337649, "learning_rate": 6.147992836834067e-06, "loss": 0.0274, "step": 3021 }, { "epoch": 1.4514889529298751, "grad_norm": 0.37303608780900993, "learning_rate": 6.145272797945756e-06, "loss": 0.0318, "step": 3022 }, { "epoch": 1.451969260326609, "grad_norm": 0.34795192435224903, "learning_rate": 6.142552401305907e-06, "loss": 0.0244, "step": 3023 }, { "epoch": 1.4524495677233429, "grad_norm": 0.36270155509114493, "learning_rate": 6.139831647764296e-06, "loss": 0.0248, "step": 3024 }, { "epoch": 1.452929875120077, "grad_norm": 0.602096908568734, "learning_rate": 6.137110538170813e-06, "loss": 0.0244, "step": 3025 }, { "epoch": 1.4534101825168109, "grad_norm": 0.2829943485725966, "learning_rate": 6.134389073375457e-06, "loss": 0.0221, "step": 3026 }, { "epoch": 1.4538904899135447, "grad_norm": 0.39244455415821716, "learning_rate": 6.131667254228337e-06, "loss": 0.031, "step": 3027 }, { "epoch": 1.4543707973102786, "grad_norm": 0.35740803261008275, "learning_rate": 6.128945081579676e-06, "loss": 0.03, "step": 3028 }, { "epoch": 1.4548511047070125, "grad_norm": 0.4163240049154014, "learning_rate": 6.126222556279803e-06, "loss": 0.0319, "step": 3029 }, { "epoch": 1.4553314121037464, "grad_norm": 0.9144465668375049, "learning_rate": 6.123499679179161e-06, "loss": 0.036, "step": 3030 }, { "epoch": 1.4558117195004803, "grad_norm": 0.3359784172226285, "learning_rate": 6.120776451128301e-06, "loss": 0.028, "step": 3031 }, { "epoch": 1.4562920268972142, "grad_norm": 0.3322781737026661, "learning_rate": 6.118052872977884e-06, "loss": 0.0351, "step": 3032 }, { "epoch": 1.456772334293948, "grad_norm": 0.34129518778022083, "learning_rate": 6.115328945578683e-06, "loss": 0.0353, "step": 3033 }, { "epoch": 1.457252641690682, "grad_norm": 0.37558074523914864, "learning_rate": 6.112604669781572e-06, "loss": 0.0351, "step": 3034 }, { "epoch": 1.457732949087416, "grad_norm": 0.36380200967954646, "learning_rate": 6.1098800464375454e-06, "loss": 0.0318, "step": 3035 }, { "epoch": 1.45821325648415, "grad_norm": 0.4368254903969246, "learning_rate": 6.107155076397696e-06, "loss": 0.0392, "step": 3036 }, { "epoch": 1.4586935638808838, "grad_norm": 0.4768428557689771, "learning_rate": 6.1044297605132316e-06, "loss": 0.0271, "step": 3037 }, { "epoch": 1.4591738712776177, "grad_norm": 1.094443395169809, "learning_rate": 6.101704099635463e-06, "loss": 0.0362, "step": 3038 }, { "epoch": 1.4596541786743515, "grad_norm": 0.38575998085771535, "learning_rate": 6.098978094615815e-06, "loss": 0.0245, "step": 3039 }, { "epoch": 1.4601344860710854, "grad_norm": 0.5518493120234275, "learning_rate": 6.096251746305812e-06, "loss": 0.051, "step": 3040 }, { "epoch": 1.4606147934678195, "grad_norm": 0.9085676893494585, "learning_rate": 6.093525055557092e-06, "loss": 0.0353, "step": 3041 }, { "epoch": 1.4610951008645534, "grad_norm": 0.482531282736558, "learning_rate": 6.090798023221397e-06, "loss": 0.0174, "step": 3042 }, { "epoch": 1.4615754082612873, "grad_norm": 0.38339221958711955, "learning_rate": 6.088070650150577e-06, "loss": 0.029, "step": 3043 }, { "epoch": 1.4620557156580212, "grad_norm": 0.5298612195344582, "learning_rate": 6.08534293719659e-06, "loss": 0.0277, "step": 3044 }, { "epoch": 1.462536023054755, "grad_norm": 1.4899674890458396, "learning_rate": 6.0826148852114945e-06, "loss": 0.0404, "step": 3045 }, { "epoch": 1.463016330451489, "grad_norm": 0.6083198492275005, "learning_rate": 6.079886495047458e-06, "loss": 0.0478, "step": 3046 }, { "epoch": 1.4634966378482228, "grad_norm": 0.3238371658383652, "learning_rate": 6.077157767556756e-06, "loss": 0.0327, "step": 3047 }, { "epoch": 1.4639769452449567, "grad_norm": 0.3563758626690178, "learning_rate": 6.07442870359177e-06, "loss": 0.0369, "step": 3048 }, { "epoch": 1.4644572526416906, "grad_norm": 0.4241971898132114, "learning_rate": 6.071699304004979e-06, "loss": 0.0348, "step": 3049 }, { "epoch": 1.4649375600384245, "grad_norm": 0.3207959896937584, "learning_rate": 6.0689695696489745e-06, "loss": 0.0253, "step": 3050 }, { "epoch": 1.4654178674351586, "grad_norm": 0.5556046994948288, "learning_rate": 6.066239501376452e-06, "loss": 0.0235, "step": 3051 }, { "epoch": 1.4658981748318924, "grad_norm": 0.7482189993871567, "learning_rate": 6.0635091000402044e-06, "loss": 0.0405, "step": 3052 }, { "epoch": 1.4663784822286263, "grad_norm": 0.4039758447274521, "learning_rate": 6.0607783664931385e-06, "loss": 0.0305, "step": 3053 }, { "epoch": 1.4668587896253602, "grad_norm": 0.2993211815372871, "learning_rate": 6.0580473015882566e-06, "loss": 0.0215, "step": 3054 }, { "epoch": 1.467339097022094, "grad_norm": 0.30259626984847005, "learning_rate": 6.055315906178669e-06, "loss": 0.0289, "step": 3055 }, { "epoch": 1.4678194044188282, "grad_norm": 0.5213533837436859, "learning_rate": 6.052584181117589e-06, "loss": 0.0335, "step": 3056 }, { "epoch": 1.468299711815562, "grad_norm": 0.44406513880190085, "learning_rate": 6.04985212725833e-06, "loss": 0.0332, "step": 3057 }, { "epoch": 1.468780019212296, "grad_norm": 0.8565352583503593, "learning_rate": 6.04711974545431e-06, "loss": 0.0349, "step": 3058 }, { "epoch": 1.4692603266090298, "grad_norm": 0.3444494124385286, "learning_rate": 6.044387036559052e-06, "loss": 0.026, "step": 3059 }, { "epoch": 1.4697406340057637, "grad_norm": 0.33533978509258383, "learning_rate": 6.041654001426176e-06, "loss": 0.0339, "step": 3060 }, { "epoch": 1.4702209414024976, "grad_norm": 0.9136616880838273, "learning_rate": 6.038920640909408e-06, "loss": 0.0609, "step": 3061 }, { "epoch": 1.4707012487992315, "grad_norm": 1.1009636752363914, "learning_rate": 6.036186955862575e-06, "loss": 0.049, "step": 3062 }, { "epoch": 1.4711815561959654, "grad_norm": 0.6135939921579044, "learning_rate": 6.033452947139603e-06, "loss": 0.0372, "step": 3063 }, { "epoch": 1.4716618635926992, "grad_norm": 0.3907832631851922, "learning_rate": 6.03071861559452e-06, "loss": 0.0226, "step": 3064 }, { "epoch": 1.4721421709894331, "grad_norm": 0.4099570865963333, "learning_rate": 6.027983962081457e-06, "loss": 0.0301, "step": 3065 }, { "epoch": 1.4726224783861672, "grad_norm": 0.7651245241777834, "learning_rate": 6.0252489874546435e-06, "loss": 0.0373, "step": 3066 }, { "epoch": 1.473102785782901, "grad_norm": 0.6572658247485343, "learning_rate": 6.022513692568412e-06, "loss": 0.0463, "step": 3067 }, { "epoch": 1.473583093179635, "grad_norm": 0.5093046408750957, "learning_rate": 6.01977807827719e-06, "loss": 0.0299, "step": 3068 }, { "epoch": 1.4740634005763689, "grad_norm": 0.3879778056571466, "learning_rate": 6.017042145435509e-06, "loss": 0.0255, "step": 3069 }, { "epoch": 1.4745437079731027, "grad_norm": 0.45372661579173895, "learning_rate": 6.014305894897998e-06, "loss": 0.0368, "step": 3070 }, { "epoch": 1.4750240153698366, "grad_norm": 0.44551493479381876, "learning_rate": 6.011569327519387e-06, "loss": 0.042, "step": 3071 }, { "epoch": 1.4755043227665707, "grad_norm": 0.38502447689270985, "learning_rate": 6.008832444154503e-06, "loss": 0.04, "step": 3072 }, { "epoch": 1.4759846301633046, "grad_norm": 0.5760621288431558, "learning_rate": 6.006095245658275e-06, "loss": 0.0606, "step": 3073 }, { "epoch": 1.4764649375600385, "grad_norm": 0.318967723903424, "learning_rate": 6.003357732885724e-06, "loss": 0.0236, "step": 3074 }, { "epoch": 1.4769452449567724, "grad_norm": 0.4169163426137864, "learning_rate": 6.000619906691976e-06, "loss": 0.0297, "step": 3075 }, { "epoch": 1.4774255523535063, "grad_norm": 0.42803282278855265, "learning_rate": 5.997881767932252e-06, "loss": 0.0237, "step": 3076 }, { "epoch": 1.4779058597502401, "grad_norm": 0.8018663787140973, "learning_rate": 5.995143317461871e-06, "loss": 0.0524, "step": 3077 }, { "epoch": 1.478386167146974, "grad_norm": 0.3263084784998617, "learning_rate": 5.992404556136247e-06, "loss": 0.0255, "step": 3078 }, { "epoch": 1.478866474543708, "grad_norm": 0.28788418061377213, "learning_rate": 5.989665484810896e-06, "loss": 0.0213, "step": 3079 }, { "epoch": 1.4793467819404418, "grad_norm": 0.3176309116668661, "learning_rate": 5.986926104341427e-06, "loss": 0.0198, "step": 3080 }, { "epoch": 1.4798270893371757, "grad_norm": 0.5249224673952383, "learning_rate": 5.984186415583547e-06, "loss": 0.0234, "step": 3081 }, { "epoch": 1.4803073967339098, "grad_norm": 0.3446216456304679, "learning_rate": 5.981446419393058e-06, "loss": 0.0324, "step": 3082 }, { "epoch": 1.4807877041306436, "grad_norm": 0.9367640732599972, "learning_rate": 5.978706116625859e-06, "loss": 0.0604, "step": 3083 }, { "epoch": 1.4812680115273775, "grad_norm": 0.34394135777287027, "learning_rate": 5.975965508137947e-06, "loss": 0.0234, "step": 3084 }, { "epoch": 1.4817483189241114, "grad_norm": 0.49603576802159144, "learning_rate": 5.97322459478541e-06, "loss": 0.0306, "step": 3085 }, { "epoch": 1.4822286263208453, "grad_norm": 0.33940030155106093, "learning_rate": 5.970483377424433e-06, "loss": 0.0202, "step": 3086 }, { "epoch": 1.4827089337175792, "grad_norm": 0.4211086266511579, "learning_rate": 5.967741856911299e-06, "loss": 0.03, "step": 3087 }, { "epoch": 1.4831892411143133, "grad_norm": 0.4653590490979963, "learning_rate": 5.965000034102382e-06, "loss": 0.0409, "step": 3088 }, { "epoch": 1.4836695485110472, "grad_norm": 0.8134205722727028, "learning_rate": 5.96225790985415e-06, "loss": 0.0489, "step": 3089 }, { "epoch": 1.484149855907781, "grad_norm": 0.3780886115545037, "learning_rate": 5.959515485023169e-06, "loss": 0.0294, "step": 3090 }, { "epoch": 1.484630163304515, "grad_norm": 0.3280928990269333, "learning_rate": 5.9567727604660966e-06, "loss": 0.0241, "step": 3091 }, { "epoch": 1.4851104707012488, "grad_norm": 0.35655201141316967, "learning_rate": 5.9540297370396825e-06, "loss": 0.0197, "step": 3092 }, { "epoch": 1.4855907780979827, "grad_norm": 0.38019707569520633, "learning_rate": 5.951286415600772e-06, "loss": 0.0266, "step": 3093 }, { "epoch": 1.4860710854947166, "grad_norm": 0.3457159394056325, "learning_rate": 5.948542797006303e-06, "loss": 0.025, "step": 3094 }, { "epoch": 1.4865513928914504, "grad_norm": 0.3078240976971388, "learning_rate": 5.945798882113306e-06, "loss": 0.0248, "step": 3095 }, { "epoch": 1.4870317002881843, "grad_norm": 0.8500214852767574, "learning_rate": 5.943054671778904e-06, "loss": 0.0384, "step": 3096 }, { "epoch": 1.4875120076849184, "grad_norm": 0.3290256331217434, "learning_rate": 5.940310166860312e-06, "loss": 0.0306, "step": 3097 }, { "epoch": 1.4879923150816523, "grad_norm": 0.3029544830362551, "learning_rate": 5.937565368214839e-06, "loss": 0.0257, "step": 3098 }, { "epoch": 1.4884726224783862, "grad_norm": 0.3937484603990035, "learning_rate": 5.934820276699883e-06, "loss": 0.0248, "step": 3099 }, { "epoch": 1.48895292987512, "grad_norm": 0.40877123764419365, "learning_rate": 5.932074893172934e-06, "loss": 0.0332, "step": 3100 }, { "epoch": 1.489433237271854, "grad_norm": 0.46450619265358467, "learning_rate": 5.929329218491574e-06, "loss": 0.0268, "step": 3101 }, { "epoch": 1.4899135446685878, "grad_norm": 0.47162781964785017, "learning_rate": 5.926583253513477e-06, "loss": 0.0284, "step": 3102 }, { "epoch": 1.490393852065322, "grad_norm": 0.4319131458436851, "learning_rate": 5.923836999096408e-06, "loss": 0.0244, "step": 3103 }, { "epoch": 1.4908741594620558, "grad_norm": 0.41926289624108887, "learning_rate": 5.921090456098215e-06, "loss": 0.0354, "step": 3104 }, { "epoch": 1.4913544668587897, "grad_norm": 0.5966365821587173, "learning_rate": 5.918343625376847e-06, "loss": 0.0339, "step": 3105 }, { "epoch": 1.4918347742555236, "grad_norm": 0.5416002086140738, "learning_rate": 5.9155965077903375e-06, "loss": 0.035, "step": 3106 }, { "epoch": 1.4923150816522575, "grad_norm": 0.7420368761341735, "learning_rate": 5.91284910419681e-06, "loss": 0.0548, "step": 3107 }, { "epoch": 1.4927953890489913, "grad_norm": 0.31260698144155835, "learning_rate": 5.910101415454475e-06, "loss": 0.025, "step": 3108 }, { "epoch": 1.4932756964457252, "grad_norm": 0.37961422901617287, "learning_rate": 5.907353442421636e-06, "loss": 0.0265, "step": 3109 }, { "epoch": 1.493756003842459, "grad_norm": 0.3876273355907605, "learning_rate": 5.904605185956685e-06, "loss": 0.0226, "step": 3110 }, { "epoch": 1.494236311239193, "grad_norm": 0.4581188334440578, "learning_rate": 5.9018566469181e-06, "loss": 0.0316, "step": 3111 }, { "epoch": 1.4947166186359269, "grad_norm": 0.385312179558122, "learning_rate": 5.899107826164447e-06, "loss": 0.027, "step": 3112 }, { "epoch": 1.495196926032661, "grad_norm": 0.2902671054948503, "learning_rate": 5.896358724554385e-06, "loss": 0.0271, "step": 3113 }, { "epoch": 1.4956772334293948, "grad_norm": 0.3557387252429253, "learning_rate": 5.893609342946656e-06, "loss": 0.0278, "step": 3114 }, { "epoch": 1.4961575408261287, "grad_norm": 0.38738532602688996, "learning_rate": 5.890859682200088e-06, "loss": 0.0405, "step": 3115 }, { "epoch": 1.4966378482228626, "grad_norm": 0.3558247951782325, "learning_rate": 5.8881097431736035e-06, "loss": 0.0248, "step": 3116 }, { "epoch": 1.4971181556195965, "grad_norm": 0.2612759552911419, "learning_rate": 5.885359526726204e-06, "loss": 0.0222, "step": 3117 }, { "epoch": 1.4975984630163304, "grad_norm": 0.4390243888042039, "learning_rate": 5.882609033716983e-06, "loss": 0.0272, "step": 3118 }, { "epoch": 1.4980787704130645, "grad_norm": 0.39948565909969724, "learning_rate": 5.879858265005115e-06, "loss": 0.0279, "step": 3119 }, { "epoch": 1.4985590778097984, "grad_norm": 0.30154269346714657, "learning_rate": 5.877107221449868e-06, "loss": 0.0262, "step": 3120 }, { "epoch": 1.4990393852065322, "grad_norm": 0.9535181833221642, "learning_rate": 5.8743559039105924e-06, "loss": 0.0396, "step": 3121 }, { "epoch": 1.4995196926032661, "grad_norm": 1.3446910661741396, "learning_rate": 5.87160431324672e-06, "loss": 0.0424, "step": 3122 }, { "epoch": 1.5, "grad_norm": 0.43445045565672086, "learning_rate": 5.868852450317773e-06, "loss": 0.0321, "step": 3123 }, { "epoch": 1.5004803073967339, "grad_norm": 0.48746973673617766, "learning_rate": 5.866100315983359e-06, "loss": 0.0382, "step": 3124 }, { "epoch": 1.5009606147934678, "grad_norm": 0.41798195853073133, "learning_rate": 5.863347911103165e-06, "loss": 0.031, "step": 3125 }, { "epoch": 1.5014409221902016, "grad_norm": 0.3229119868605541, "learning_rate": 5.860595236536971e-06, "loss": 0.0243, "step": 3126 }, { "epoch": 1.5019212295869355, "grad_norm": 0.40539786159668356, "learning_rate": 5.857842293144632e-06, "loss": 0.0331, "step": 3127 }, { "epoch": 1.5024015369836694, "grad_norm": 0.4651145099750222, "learning_rate": 5.855089081786094e-06, "loss": 0.0346, "step": 3128 }, { "epoch": 1.5028818443804035, "grad_norm": 0.3863527379874971, "learning_rate": 5.852335603321383e-06, "loss": 0.0291, "step": 3129 }, { "epoch": 1.5033621517771374, "grad_norm": 0.7028914674817851, "learning_rate": 5.84958185861061e-06, "loss": 0.0269, "step": 3130 }, { "epoch": 1.5038424591738713, "grad_norm": 0.4528106701763775, "learning_rate": 5.846827848513968e-06, "loss": 0.0438, "step": 3131 }, { "epoch": 1.5043227665706052, "grad_norm": 0.42941778855816765, "learning_rate": 5.8440735738917345e-06, "loss": 0.037, "step": 3132 }, { "epoch": 1.5048030739673393, "grad_norm": 1.3105227252011689, "learning_rate": 5.841319035604267e-06, "loss": 0.0601, "step": 3133 }, { "epoch": 1.5052833813640731, "grad_norm": 0.311566151721558, "learning_rate": 5.8385642345120085e-06, "loss": 0.0215, "step": 3134 }, { "epoch": 1.505763688760807, "grad_norm": 0.39350789099612166, "learning_rate": 5.835809171475482e-06, "loss": 0.0334, "step": 3135 }, { "epoch": 1.506243996157541, "grad_norm": 0.407913577344384, "learning_rate": 5.833053847355295e-06, "loss": 0.0292, "step": 3136 }, { "epoch": 1.5067243035542748, "grad_norm": 0.33018101676228284, "learning_rate": 5.8302982630121296e-06, "loss": 0.0204, "step": 3137 }, { "epoch": 1.5072046109510087, "grad_norm": 0.40898613923397653, "learning_rate": 5.82754241930676e-06, "loss": 0.0403, "step": 3138 }, { "epoch": 1.5076849183477425, "grad_norm": 0.37874528703644444, "learning_rate": 5.824786317100033e-06, "loss": 0.0295, "step": 3139 }, { "epoch": 1.5081652257444764, "grad_norm": 0.4817757913266935, "learning_rate": 5.822029957252877e-06, "loss": 0.0367, "step": 3140 }, { "epoch": 1.5086455331412103, "grad_norm": 0.40538100601142224, "learning_rate": 5.8192733406263036e-06, "loss": 0.0284, "step": 3141 }, { "epoch": 1.5091258405379442, "grad_norm": 0.35940184301638667, "learning_rate": 5.816516468081406e-06, "loss": 0.0281, "step": 3142 }, { "epoch": 1.509606147934678, "grad_norm": 0.5572378069560608, "learning_rate": 5.8137593404793525e-06, "loss": 0.0529, "step": 3143 }, { "epoch": 1.510086455331412, "grad_norm": 0.3698918609333422, "learning_rate": 5.8110019586813944e-06, "loss": 0.0293, "step": 3144 }, { "epoch": 1.510566762728146, "grad_norm": 0.2622760584851391, "learning_rate": 5.808244323548859e-06, "loss": 0.0244, "step": 3145 }, { "epoch": 1.51104707012488, "grad_norm": 0.37380709505061227, "learning_rate": 5.805486435943158e-06, "loss": 0.0182, "step": 3146 }, { "epoch": 1.5115273775216138, "grad_norm": 0.36160243397447317, "learning_rate": 5.802728296725779e-06, "loss": 0.0301, "step": 3147 }, { "epoch": 1.5120076849183477, "grad_norm": 0.35058010095606223, "learning_rate": 5.7999699067582896e-06, "loss": 0.0263, "step": 3148 }, { "epoch": 1.5124879923150818, "grad_norm": 0.37533690482780846, "learning_rate": 5.797211266902331e-06, "loss": 0.0333, "step": 3149 }, { "epoch": 1.5129682997118157, "grad_norm": 0.3730923136949735, "learning_rate": 5.7944523780196275e-06, "loss": 0.0225, "step": 3150 }, { "epoch": 1.5134486071085496, "grad_norm": 0.46084601385358814, "learning_rate": 5.79169324097198e-06, "loss": 0.0306, "step": 3151 }, { "epoch": 1.5139289145052834, "grad_norm": 0.31975491939952705, "learning_rate": 5.788933856621266e-06, "loss": 0.0259, "step": 3152 }, { "epoch": 1.5144092219020173, "grad_norm": 0.4138419857242112, "learning_rate": 5.786174225829443e-06, "loss": 0.0388, "step": 3153 }, { "epoch": 1.5148895292987512, "grad_norm": 0.31235445958066965, "learning_rate": 5.78341434945854e-06, "loss": 0.024, "step": 3154 }, { "epoch": 1.515369836695485, "grad_norm": 0.2920678574197183, "learning_rate": 5.780654228370669e-06, "loss": 0.0363, "step": 3155 }, { "epoch": 1.515850144092219, "grad_norm": 0.4064717183334576, "learning_rate": 5.777893863428012e-06, "loss": 0.0302, "step": 3156 }, { "epoch": 1.5163304514889528, "grad_norm": 0.30707008904111754, "learning_rate": 5.775133255492835e-06, "loss": 0.0226, "step": 3157 }, { "epoch": 1.5168107588856867, "grad_norm": 0.38357599267905407, "learning_rate": 5.7723724054274735e-06, "loss": 0.0425, "step": 3158 }, { "epoch": 1.5172910662824206, "grad_norm": 0.4804046223810054, "learning_rate": 5.769611314094337e-06, "loss": 0.0355, "step": 3159 }, { "epoch": 1.5177713736791547, "grad_norm": 0.9488096197118809, "learning_rate": 5.766849982355921e-06, "loss": 0.0278, "step": 3160 }, { "epoch": 1.5182516810758886, "grad_norm": 0.30967447556721056, "learning_rate": 5.764088411074785e-06, "loss": 0.029, "step": 3161 }, { "epoch": 1.5187319884726225, "grad_norm": 0.34237861574605455, "learning_rate": 5.761326601113568e-06, "loss": 0.0375, "step": 3162 }, { "epoch": 1.5192122958693564, "grad_norm": 0.4504039240222762, "learning_rate": 5.758564553334984e-06, "loss": 0.0366, "step": 3163 }, { "epoch": 1.5196926032660905, "grad_norm": 0.37223702301364997, "learning_rate": 5.755802268601818e-06, "loss": 0.0301, "step": 3164 }, { "epoch": 1.5201729106628243, "grad_norm": 0.680313387793209, "learning_rate": 5.753039747776933e-06, "loss": 0.0475, "step": 3165 }, { "epoch": 1.5206532180595582, "grad_norm": 0.2882992683794043, "learning_rate": 5.7502769917232635e-06, "loss": 0.0278, "step": 3166 }, { "epoch": 1.521133525456292, "grad_norm": 0.4963114309373402, "learning_rate": 5.74751400130382e-06, "loss": 0.0379, "step": 3167 }, { "epoch": 1.521613832853026, "grad_norm": 0.29134104209542133, "learning_rate": 5.744750777381682e-06, "loss": 0.0298, "step": 3168 }, { "epoch": 1.5220941402497599, "grad_norm": 0.30868878764587354, "learning_rate": 5.741987320820005e-06, "loss": 0.0268, "step": 3169 }, { "epoch": 1.5225744476464937, "grad_norm": 0.7432825529949977, "learning_rate": 5.739223632482016e-06, "loss": 0.0384, "step": 3170 }, { "epoch": 1.5230547550432276, "grad_norm": 0.3514172245800839, "learning_rate": 5.7364597132310154e-06, "loss": 0.0306, "step": 3171 }, { "epoch": 1.5235350624399615, "grad_norm": 0.24835194482961137, "learning_rate": 5.733695563930375e-06, "loss": 0.0203, "step": 3172 }, { "epoch": 1.5240153698366954, "grad_norm": 0.439727271670017, "learning_rate": 5.730931185443538e-06, "loss": 0.0307, "step": 3173 }, { "epoch": 1.5244956772334293, "grad_norm": 0.2910897074673792, "learning_rate": 5.728166578634018e-06, "loss": 0.0231, "step": 3174 }, { "epoch": 1.5249759846301632, "grad_norm": 0.4207806119190366, "learning_rate": 5.725401744365407e-06, "loss": 0.0332, "step": 3175 }, { "epoch": 1.5254562920268973, "grad_norm": 0.3657203104161631, "learning_rate": 5.722636683501359e-06, "loss": 0.0265, "step": 3176 }, { "epoch": 1.5259365994236311, "grad_norm": 0.41797146788969963, "learning_rate": 5.719871396905603e-06, "loss": 0.0328, "step": 3177 }, { "epoch": 1.526416906820365, "grad_norm": 0.36909803936706825, "learning_rate": 5.717105885441936e-06, "loss": 0.0227, "step": 3178 }, { "epoch": 1.526897214217099, "grad_norm": 0.24596143331259568, "learning_rate": 5.714340149974231e-06, "loss": 0.0168, "step": 3179 }, { "epoch": 1.527377521613833, "grad_norm": 0.3567288759776677, "learning_rate": 5.711574191366427e-06, "loss": 0.0264, "step": 3180 }, { "epoch": 1.5278578290105669, "grad_norm": 0.4618457570065902, "learning_rate": 5.70880801048253e-06, "loss": 0.0242, "step": 3181 }, { "epoch": 1.5283381364073008, "grad_norm": 0.493502220985007, "learning_rate": 5.70604160818662e-06, "loss": 0.0334, "step": 3182 }, { "epoch": 1.5288184438040346, "grad_norm": 0.3436343394251586, "learning_rate": 5.703274985342844e-06, "loss": 0.0239, "step": 3183 }, { "epoch": 1.5292987512007685, "grad_norm": 1.249207183590463, "learning_rate": 5.70050814281542e-06, "loss": 0.0278, "step": 3184 }, { "epoch": 1.5297790585975024, "grad_norm": 0.7186881849879787, "learning_rate": 5.697741081468631e-06, "loss": 0.0325, "step": 3185 }, { "epoch": 1.5302593659942363, "grad_norm": 0.2941612770499657, "learning_rate": 5.694973802166832e-06, "loss": 0.018, "step": 3186 }, { "epoch": 1.5307396733909702, "grad_norm": 0.4708678207984971, "learning_rate": 5.692206305774444e-06, "loss": 0.0391, "step": 3187 }, { "epoch": 1.531219980787704, "grad_norm": 0.3252008774396793, "learning_rate": 5.6894385931559555e-06, "loss": 0.0289, "step": 3188 }, { "epoch": 1.531700288184438, "grad_norm": 0.610417681862749, "learning_rate": 5.686670665175925e-06, "loss": 0.0457, "step": 3189 }, { "epoch": 1.5321805955811718, "grad_norm": 0.6366606704542316, "learning_rate": 5.683902522698975e-06, "loss": 0.0441, "step": 3190 }, { "epoch": 1.532660902977906, "grad_norm": 0.3152871747926857, "learning_rate": 5.681134166589798e-06, "loss": 0.029, "step": 3191 }, { "epoch": 1.5331412103746398, "grad_norm": 0.2456177182629036, "learning_rate": 5.67836559771315e-06, "loss": 0.0189, "step": 3192 }, { "epoch": 1.5336215177713737, "grad_norm": 0.4324618351751667, "learning_rate": 5.675596816933858e-06, "loss": 0.0324, "step": 3193 }, { "epoch": 1.5341018251681076, "grad_norm": 0.3531714861791666, "learning_rate": 5.67282782511681e-06, "loss": 0.0335, "step": 3194 }, { "epoch": 1.5345821325648417, "grad_norm": 0.683506169924937, "learning_rate": 5.670058623126964e-06, "loss": 0.0326, "step": 3195 }, { "epoch": 1.5350624399615755, "grad_norm": 0.3105704948745065, "learning_rate": 5.66728921182934e-06, "loss": 0.0234, "step": 3196 }, { "epoch": 1.5355427473583094, "grad_norm": 0.44356475495001263, "learning_rate": 5.664519592089029e-06, "loss": 0.0373, "step": 3197 }, { "epoch": 1.5360230547550433, "grad_norm": 0.32059680675106494, "learning_rate": 5.661749764771182e-06, "loss": 0.026, "step": 3198 }, { "epoch": 1.5365033621517772, "grad_norm": 0.38105157366698034, "learning_rate": 5.658979730741014e-06, "loss": 0.0259, "step": 3199 }, { "epoch": 1.536983669548511, "grad_norm": 0.34815016182604097, "learning_rate": 5.656209490863809e-06, "loss": 0.032, "step": 3200 }, { "epoch": 1.537463976945245, "grad_norm": 0.3722276782828193, "learning_rate": 5.653439046004914e-06, "loss": 0.0253, "step": 3201 }, { "epoch": 1.5379442843419788, "grad_norm": 0.2973096829788525, "learning_rate": 5.65066839702974e-06, "loss": 0.023, "step": 3202 }, { "epoch": 1.5384245917387127, "grad_norm": 0.2588678783207932, "learning_rate": 5.647897544803757e-06, "loss": 0.0192, "step": 3203 }, { "epoch": 1.5389048991354466, "grad_norm": 0.27625672667966156, "learning_rate": 5.645126490192507e-06, "loss": 0.0194, "step": 3204 }, { "epoch": 1.5393852065321805, "grad_norm": 0.4494961828421391, "learning_rate": 5.64235523406159e-06, "loss": 0.0328, "step": 3205 }, { "epoch": 1.5398655139289144, "grad_norm": 0.4462900467953894, "learning_rate": 5.639583777276667e-06, "loss": 0.0358, "step": 3206 }, { "epoch": 1.5403458213256485, "grad_norm": 0.9334879764322411, "learning_rate": 5.6368121207034675e-06, "loss": 0.0391, "step": 3207 }, { "epoch": 1.5408261287223823, "grad_norm": 0.25812532856277925, "learning_rate": 5.6340402652077785e-06, "loss": 0.0237, "step": 3208 }, { "epoch": 1.5413064361191162, "grad_norm": 0.3556165519370836, "learning_rate": 5.631268211655452e-06, "loss": 0.0264, "step": 3209 }, { "epoch": 1.54178674351585, "grad_norm": 0.35699905420519074, "learning_rate": 5.6284959609124e-06, "loss": 0.0316, "step": 3210 }, { "epoch": 1.5422670509125842, "grad_norm": 0.4105306159598001, "learning_rate": 5.625723513844599e-06, "loss": 0.0255, "step": 3211 }, { "epoch": 1.542747358309318, "grad_norm": 0.8062146574532175, "learning_rate": 5.622950871318082e-06, "loss": 0.0414, "step": 3212 }, { "epoch": 1.543227665706052, "grad_norm": 0.4888447366233077, "learning_rate": 5.620178034198946e-06, "loss": 0.0394, "step": 3213 }, { "epoch": 1.5437079731027858, "grad_norm": 0.3378689619864133, "learning_rate": 5.6174050033533485e-06, "loss": 0.029, "step": 3214 }, { "epoch": 1.5441882804995197, "grad_norm": 0.36141888294957475, "learning_rate": 5.61463177964751e-06, "loss": 0.019, "step": 3215 }, { "epoch": 1.5446685878962536, "grad_norm": 0.5024434246859866, "learning_rate": 5.61185836394771e-06, "loss": 0.0343, "step": 3216 }, { "epoch": 1.5451488952929875, "grad_norm": 0.36977810472580763, "learning_rate": 5.609084757120282e-06, "loss": 0.0296, "step": 3217 }, { "epoch": 1.5456292026897214, "grad_norm": 0.43043610741844646, "learning_rate": 5.606310960031626e-06, "loss": 0.0265, "step": 3218 }, { "epoch": 1.5461095100864553, "grad_norm": 0.5049577625951819, "learning_rate": 5.6035369735482006e-06, "loss": 0.0344, "step": 3219 }, { "epoch": 1.5465898174831891, "grad_norm": 0.4882067328948608, "learning_rate": 5.600762798536522e-06, "loss": 0.0351, "step": 3220 }, { "epoch": 1.547070124879923, "grad_norm": 0.5069616982288707, "learning_rate": 5.5979884358631665e-06, "loss": 0.032, "step": 3221 }, { "epoch": 1.547550432276657, "grad_norm": 0.487908072320104, "learning_rate": 5.595213886394765e-06, "loss": 0.0351, "step": 3222 }, { "epoch": 1.548030739673391, "grad_norm": 0.7312723939393814, "learning_rate": 5.592439150998012e-06, "loss": 0.0393, "step": 3223 }, { "epoch": 1.5485110470701249, "grad_norm": 0.3150278782658502, "learning_rate": 5.58966423053966e-06, "loss": 0.0291, "step": 3224 }, { "epoch": 1.5489913544668588, "grad_norm": 0.4053441972652173, "learning_rate": 5.586889125886514e-06, "loss": 0.0261, "step": 3225 }, { "epoch": 1.5494716618635929, "grad_norm": 0.37452228764714796, "learning_rate": 5.584113837905442e-06, "loss": 0.027, "step": 3226 }, { "epoch": 1.5499519692603267, "grad_norm": 0.4653819729390256, "learning_rate": 5.581338367463368e-06, "loss": 0.0289, "step": 3227 }, { "epoch": 1.5504322766570606, "grad_norm": 0.5539566394348882, "learning_rate": 5.57856271542727e-06, "loss": 0.0406, "step": 3228 }, { "epoch": 1.5509125840537945, "grad_norm": 0.3871246348595946, "learning_rate": 5.575786882664187e-06, "loss": 0.0392, "step": 3229 }, { "epoch": 1.5513928914505284, "grad_norm": 0.4902823824311029, "learning_rate": 5.57301087004121e-06, "loss": 0.0277, "step": 3230 }, { "epoch": 1.5518731988472623, "grad_norm": 0.3835335533927022, "learning_rate": 5.57023467842549e-06, "loss": 0.0299, "step": 3231 }, { "epoch": 1.5523535062439962, "grad_norm": 0.3133799037538476, "learning_rate": 5.567458308684233e-06, "loss": 0.037, "step": 3232 }, { "epoch": 1.55283381364073, "grad_norm": 0.5461258505123334, "learning_rate": 5.564681761684697e-06, "loss": 0.0313, "step": 3233 }, { "epoch": 1.553314121037464, "grad_norm": 0.2906047660278051, "learning_rate": 5.561905038294203e-06, "loss": 0.0258, "step": 3234 }, { "epoch": 1.5537944284341978, "grad_norm": 0.3691894808281212, "learning_rate": 5.55912813938012e-06, "loss": 0.0241, "step": 3235 }, { "epoch": 1.5542747358309317, "grad_norm": 0.38323486259175005, "learning_rate": 5.556351065809873e-06, "loss": 0.0303, "step": 3236 }, { "epoch": 1.5547550432276656, "grad_norm": 0.3789163778039989, "learning_rate": 5.553573818450946e-06, "loss": 0.0312, "step": 3237 }, { "epoch": 1.5552353506243997, "grad_norm": 0.4760354448746208, "learning_rate": 5.550796398170872e-06, "loss": 0.031, "step": 3238 }, { "epoch": 1.5557156580211335, "grad_norm": 0.38913385190628674, "learning_rate": 5.548018805837244e-06, "loss": 0.0307, "step": 3239 }, { "epoch": 1.5561959654178674, "grad_norm": 0.34767584779487026, "learning_rate": 5.5452410423176995e-06, "loss": 0.0344, "step": 3240 }, { "epoch": 1.5566762728146013, "grad_norm": 0.43643813693538797, "learning_rate": 5.5424631084799385e-06, "loss": 0.0419, "step": 3241 }, { "epoch": 1.5571565802113354, "grad_norm": 0.41465602174558164, "learning_rate": 5.539685005191709e-06, "loss": 0.0302, "step": 3242 }, { "epoch": 1.5576368876080693, "grad_norm": 0.36867878220656763, "learning_rate": 5.536906733320816e-06, "loss": 0.0316, "step": 3243 }, { "epoch": 1.5581171950048032, "grad_norm": 0.4674666282069446, "learning_rate": 5.534128293735113e-06, "loss": 0.0509, "step": 3244 }, { "epoch": 1.558597502401537, "grad_norm": 0.38354704803416173, "learning_rate": 5.531349687302506e-06, "loss": 0.0289, "step": 3245 }, { "epoch": 1.559077809798271, "grad_norm": 0.3608064016301451, "learning_rate": 5.5285709148909584e-06, "loss": 0.0297, "step": 3246 }, { "epoch": 1.5595581171950048, "grad_norm": 0.3963659763665162, "learning_rate": 5.52579197736848e-06, "loss": 0.0326, "step": 3247 }, { "epoch": 1.5600384245917387, "grad_norm": 0.44093964199338676, "learning_rate": 5.523012875603135e-06, "loss": 0.0336, "step": 3248 }, { "epoch": 1.5605187319884726, "grad_norm": 0.54508975441162, "learning_rate": 5.520233610463035e-06, "loss": 0.045, "step": 3249 }, { "epoch": 1.5609990393852065, "grad_norm": 0.4107405945967228, "learning_rate": 5.517454182816348e-06, "loss": 0.0376, "step": 3250 }, { "epoch": 1.5614793467819403, "grad_norm": 0.271682360398674, "learning_rate": 5.514674593531288e-06, "loss": 0.0214, "step": 3251 }, { "epoch": 1.5619596541786742, "grad_norm": 0.95299638419253, "learning_rate": 5.511894843476128e-06, "loss": 0.0342, "step": 3252 }, { "epoch": 1.562439961575408, "grad_norm": 0.646315774490002, "learning_rate": 5.509114933519179e-06, "loss": 0.0434, "step": 3253 }, { "epoch": 1.5629202689721422, "grad_norm": 0.35865814562196896, "learning_rate": 5.506334864528808e-06, "loss": 0.0258, "step": 3254 }, { "epoch": 1.563400576368876, "grad_norm": 0.3166583548885205, "learning_rate": 5.503554637373434e-06, "loss": 0.03, "step": 3255 }, { "epoch": 1.56388088376561, "grad_norm": 0.45904021937049955, "learning_rate": 5.500774252921521e-06, "loss": 0.039, "step": 3256 }, { "epoch": 1.5643611911623438, "grad_norm": 0.38414585808590096, "learning_rate": 5.497993712041589e-06, "loss": 0.0279, "step": 3257 }, { "epoch": 1.564841498559078, "grad_norm": 0.3681161410544605, "learning_rate": 5.495213015602195e-06, "loss": 0.0227, "step": 3258 }, { "epoch": 1.5653218059558118, "grad_norm": 0.3338899740787236, "learning_rate": 5.492432164471957e-06, "loss": 0.0251, "step": 3259 }, { "epoch": 1.5658021133525457, "grad_norm": 1.042333891467213, "learning_rate": 5.489651159519534e-06, "loss": 0.0324, "step": 3260 }, { "epoch": 1.5662824207492796, "grad_norm": 0.3310411506439069, "learning_rate": 5.486870001613635e-06, "loss": 0.0296, "step": 3261 }, { "epoch": 1.5667627281460135, "grad_norm": 0.34750863812526545, "learning_rate": 5.484088691623018e-06, "loss": 0.0225, "step": 3262 }, { "epoch": 1.5672430355427474, "grad_norm": 0.36797104029934785, "learning_rate": 5.481307230416484e-06, "loss": 0.0345, "step": 3263 }, { "epoch": 1.5677233429394812, "grad_norm": 0.42490811009420576, "learning_rate": 5.478525618862887e-06, "loss": 0.045, "step": 3264 }, { "epoch": 1.5682036503362151, "grad_norm": 0.31366201766092333, "learning_rate": 5.4757438578311275e-06, "loss": 0.0232, "step": 3265 }, { "epoch": 1.568683957732949, "grad_norm": 0.29454333129313615, "learning_rate": 5.472961948190147e-06, "loss": 0.0219, "step": 3266 }, { "epoch": 1.5691642651296829, "grad_norm": 0.2878796979244289, "learning_rate": 5.470179890808939e-06, "loss": 0.0239, "step": 3267 }, { "epoch": 1.5696445725264168, "grad_norm": 0.35449196494561575, "learning_rate": 5.467397686556539e-06, "loss": 0.0267, "step": 3268 }, { "epoch": 1.5701248799231509, "grad_norm": 0.329393609842168, "learning_rate": 5.464615336302034e-06, "loss": 0.0219, "step": 3269 }, { "epoch": 1.5706051873198847, "grad_norm": 0.36150986412618963, "learning_rate": 5.461832840914551e-06, "loss": 0.0285, "step": 3270 }, { "epoch": 1.5710854947166186, "grad_norm": 0.3874596193884061, "learning_rate": 5.459050201263266e-06, "loss": 0.0302, "step": 3271 }, { "epoch": 1.5715658021133525, "grad_norm": 0.3275756521356713, "learning_rate": 5.456267418217398e-06, "loss": 0.0212, "step": 3272 }, { "epoch": 1.5720461095100866, "grad_norm": 0.38895436979874487, "learning_rate": 5.45348449264621e-06, "loss": 0.0439, "step": 3273 }, { "epoch": 1.5725264169068205, "grad_norm": 0.4312196113121505, "learning_rate": 5.450701425419014e-06, "loss": 0.0221, "step": 3274 }, { "epoch": 1.5730067243035544, "grad_norm": 0.36121561379969597, "learning_rate": 5.4479182174051616e-06, "loss": 0.0308, "step": 3275 }, { "epoch": 1.5734870317002883, "grad_norm": 0.6354514580530405, "learning_rate": 5.4451348694740495e-06, "loss": 0.0383, "step": 3276 }, { "epoch": 1.5739673390970221, "grad_norm": 1.2086886102981054, "learning_rate": 5.442351382495117e-06, "loss": 0.0415, "step": 3277 }, { "epoch": 1.574447646493756, "grad_norm": 0.25313025512553056, "learning_rate": 5.439567757337853e-06, "loss": 0.0246, "step": 3278 }, { "epoch": 1.57492795389049, "grad_norm": 0.3677233053729359, "learning_rate": 5.43678399487178e-06, "loss": 0.0352, "step": 3279 }, { "epoch": 1.5754082612872238, "grad_norm": 0.38892551937907743, "learning_rate": 5.434000095966473e-06, "loss": 0.0268, "step": 3280 }, { "epoch": 1.5758885686839577, "grad_norm": 0.5223749518482866, "learning_rate": 5.431216061491542e-06, "loss": 0.0415, "step": 3281 }, { "epoch": 1.5763688760806915, "grad_norm": 0.4271889150696393, "learning_rate": 5.4284318923166426e-06, "loss": 0.0285, "step": 3282 }, { "epoch": 1.5768491834774254, "grad_norm": 0.5102397289671419, "learning_rate": 5.425647589311473e-06, "loss": 0.0268, "step": 3283 }, { "epoch": 1.5773294908741593, "grad_norm": 0.4739870868425796, "learning_rate": 5.422863153345773e-06, "loss": 0.0309, "step": 3284 }, { "epoch": 1.5778097982708934, "grad_norm": 0.34293121445564706, "learning_rate": 5.4200785852893224e-06, "loss": 0.0256, "step": 3285 }, { "epoch": 1.5782901056676273, "grad_norm": 1.1124562140597398, "learning_rate": 5.417293886011943e-06, "loss": 0.0442, "step": 3286 }, { "epoch": 1.5787704130643612, "grad_norm": 0.33458221070388655, "learning_rate": 5.414509056383498e-06, "loss": 0.0239, "step": 3287 }, { "epoch": 1.579250720461095, "grad_norm": 0.4918258590698042, "learning_rate": 5.411724097273891e-06, "loss": 0.0365, "step": 3288 }, { "epoch": 1.5797310278578292, "grad_norm": 0.3579901179321833, "learning_rate": 5.408939009553067e-06, "loss": 0.0219, "step": 3289 }, { "epoch": 1.580211335254563, "grad_norm": 0.3340407088249012, "learning_rate": 5.40615379409101e-06, "loss": 0.0206, "step": 3290 }, { "epoch": 1.580691642651297, "grad_norm": 0.5074953148857294, "learning_rate": 5.403368451757742e-06, "loss": 0.0405, "step": 3291 }, { "epoch": 1.5811719500480308, "grad_norm": 0.4356673586446424, "learning_rate": 5.400582983423331e-06, "loss": 0.0363, "step": 3292 }, { "epoch": 1.5816522574447647, "grad_norm": 0.5510014963356703, "learning_rate": 5.39779738995788e-06, "loss": 0.0198, "step": 3293 }, { "epoch": 1.5821325648414986, "grad_norm": 0.4801622766034896, "learning_rate": 5.395011672231527e-06, "loss": 0.0382, "step": 3294 }, { "epoch": 1.5826128722382324, "grad_norm": 0.36829092245819434, "learning_rate": 5.392225831114456e-06, "loss": 0.0398, "step": 3295 }, { "epoch": 1.5830931796349663, "grad_norm": 0.3055509028220042, "learning_rate": 5.389439867476887e-06, "loss": 0.0337, "step": 3296 }, { "epoch": 1.5835734870317002, "grad_norm": 0.3084141439059703, "learning_rate": 5.386653782189077e-06, "loss": 0.0291, "step": 3297 }, { "epoch": 1.584053794428434, "grad_norm": 0.3920597900283518, "learning_rate": 5.383867576121324e-06, "loss": 0.0491, "step": 3298 }, { "epoch": 1.584534101825168, "grad_norm": 0.4518237498549912, "learning_rate": 5.381081250143958e-06, "loss": 0.0381, "step": 3299 }, { "epoch": 1.585014409221902, "grad_norm": 0.331072383841286, "learning_rate": 5.378294805127352e-06, "loss": 0.0276, "step": 3300 }, { "epoch": 1.585494716618636, "grad_norm": 0.38832654992202464, "learning_rate": 5.375508241941916e-06, "loss": 0.0286, "step": 3301 }, { "epoch": 1.5859750240153698, "grad_norm": 0.3983645916300891, "learning_rate": 5.372721561458093e-06, "loss": 0.0313, "step": 3302 }, { "epoch": 1.5864553314121037, "grad_norm": 0.5523770081310068, "learning_rate": 5.369934764546366e-06, "loss": 0.0278, "step": 3303 }, { "epoch": 1.5869356388088378, "grad_norm": 0.5032117298325557, "learning_rate": 5.367147852077252e-06, "loss": 0.0376, "step": 3304 }, { "epoch": 1.5874159462055717, "grad_norm": 0.37169316650565937, "learning_rate": 5.364360824921308e-06, "loss": 0.0405, "step": 3305 }, { "epoch": 1.5878962536023056, "grad_norm": 0.36758269473282756, "learning_rate": 5.3615736839491216e-06, "loss": 0.0228, "step": 3306 }, { "epoch": 1.5883765609990395, "grad_norm": 0.3185221419392269, "learning_rate": 5.358786430031319e-06, "loss": 0.0373, "step": 3307 }, { "epoch": 1.5888568683957733, "grad_norm": 2.082103198734496, "learning_rate": 5.355999064038562e-06, "loss": 0.0265, "step": 3308 }, { "epoch": 1.5893371757925072, "grad_norm": 0.29227965234157544, "learning_rate": 5.353211586841547e-06, "loss": 0.0156, "step": 3309 }, { "epoch": 1.589817483189241, "grad_norm": 0.3325014708822988, "learning_rate": 5.350423999311002e-06, "loss": 0.0344, "step": 3310 }, { "epoch": 1.590297790585975, "grad_norm": 0.25640171676339746, "learning_rate": 5.347636302317696e-06, "loss": 0.0196, "step": 3311 }, { "epoch": 1.5907780979827089, "grad_norm": 0.342269477563172, "learning_rate": 5.344848496732426e-06, "loss": 0.0263, "step": 3312 }, { "epoch": 1.5912584053794427, "grad_norm": 0.3715158148400332, "learning_rate": 5.342060583426025e-06, "loss": 0.0221, "step": 3313 }, { "epoch": 1.5917387127761766, "grad_norm": 0.28762218131062656, "learning_rate": 5.339272563269362e-06, "loss": 0.0234, "step": 3314 }, { "epoch": 1.5922190201729105, "grad_norm": 0.3220728314462182, "learning_rate": 5.3364844371333355e-06, "loss": 0.0314, "step": 3315 }, { "epoch": 1.5926993275696446, "grad_norm": 0.3888349522043892, "learning_rate": 5.333696205888882e-06, "loss": 0.0274, "step": 3316 }, { "epoch": 1.5931796349663785, "grad_norm": 0.4761430777255065, "learning_rate": 5.330907870406963e-06, "loss": 0.0235, "step": 3317 }, { "epoch": 1.5936599423631124, "grad_norm": 0.24934980448988034, "learning_rate": 5.3281194315585815e-06, "loss": 0.0175, "step": 3318 }, { "epoch": 1.5941402497598463, "grad_norm": 0.3086893494328813, "learning_rate": 5.3253308902147674e-06, "loss": 0.0191, "step": 3319 }, { "epoch": 1.5946205571565804, "grad_norm": 0.39070134787635047, "learning_rate": 5.322542247246583e-06, "loss": 0.037, "step": 3320 }, { "epoch": 1.5951008645533142, "grad_norm": 0.37543842792470783, "learning_rate": 5.319753503525122e-06, "loss": 0.035, "step": 3321 }, { "epoch": 1.5955811719500481, "grad_norm": 0.2165035800677036, "learning_rate": 5.316964659921515e-06, "loss": 0.0183, "step": 3322 }, { "epoch": 1.596061479346782, "grad_norm": 0.3269940294034457, "learning_rate": 5.314175717306916e-06, "loss": 0.0312, "step": 3323 }, { "epoch": 1.5965417867435159, "grad_norm": 0.28684255766457345, "learning_rate": 5.3113866765525145e-06, "loss": 0.026, "step": 3324 }, { "epoch": 1.5970220941402498, "grad_norm": 0.27182493323004153, "learning_rate": 5.308597538529532e-06, "loss": 0.0195, "step": 3325 }, { "epoch": 1.5975024015369836, "grad_norm": 0.33078279727128096, "learning_rate": 5.3058083041092145e-06, "loss": 0.0256, "step": 3326 }, { "epoch": 1.5979827089337175, "grad_norm": 0.3305944916531448, "learning_rate": 5.303018974162843e-06, "loss": 0.0261, "step": 3327 }, { "epoch": 1.5984630163304514, "grad_norm": 0.3366517847721705, "learning_rate": 5.300229549561726e-06, "loss": 0.0271, "step": 3328 }, { "epoch": 1.5989433237271853, "grad_norm": 0.29573595244039125, "learning_rate": 5.297440031177206e-06, "loss": 0.0236, "step": 3329 }, { "epoch": 1.5994236311239192, "grad_norm": 0.2895192120431485, "learning_rate": 5.2946504198806484e-06, "loss": 0.0272, "step": 3330 }, { "epoch": 1.5999039385206533, "grad_norm": 0.5814480593726495, "learning_rate": 5.29186071654345e-06, "loss": 0.0283, "step": 3331 }, { "epoch": 1.6003842459173871, "grad_norm": 0.34740114561126334, "learning_rate": 5.289070922037038e-06, "loss": 0.0384, "step": 3332 }, { "epoch": 1.600864553314121, "grad_norm": 0.3754930800369522, "learning_rate": 5.286281037232869e-06, "loss": 0.0355, "step": 3333 }, { "epoch": 1.601344860710855, "grad_norm": 0.46953220642154786, "learning_rate": 5.283491063002424e-06, "loss": 0.0391, "step": 3334 }, { "epoch": 1.601825168107589, "grad_norm": 0.6527276862985042, "learning_rate": 5.280701000217211e-06, "loss": 0.0404, "step": 3335 }, { "epoch": 1.602305475504323, "grad_norm": 0.33809690571958934, "learning_rate": 5.277910849748773e-06, "loss": 0.0284, "step": 3336 }, { "epoch": 1.6027857829010568, "grad_norm": 0.32943389669214473, "learning_rate": 5.275120612468675e-06, "loss": 0.0337, "step": 3337 }, { "epoch": 1.6032660902977907, "grad_norm": 0.3197637761006376, "learning_rate": 5.272330289248507e-06, "loss": 0.0263, "step": 3338 }, { "epoch": 1.6037463976945245, "grad_norm": 0.3420496769617597, "learning_rate": 5.269539880959893e-06, "loss": 0.025, "step": 3339 }, { "epoch": 1.6042267050912584, "grad_norm": 0.3284418713008828, "learning_rate": 5.266749388474476e-06, "loss": 0.0274, "step": 3340 }, { "epoch": 1.6047070124879923, "grad_norm": 0.32725270886832764, "learning_rate": 5.26395881266393e-06, "loss": 0.0237, "step": 3341 }, { "epoch": 1.6051873198847262, "grad_norm": 0.28279370179352586, "learning_rate": 5.261168154399953e-06, "loss": 0.0266, "step": 3342 }, { "epoch": 1.60566762728146, "grad_norm": 0.4781408760700571, "learning_rate": 5.2583774145542695e-06, "loss": 0.0377, "step": 3343 }, { "epoch": 1.606147934678194, "grad_norm": 0.3045307739678876, "learning_rate": 5.255586593998631e-06, "loss": 0.0175, "step": 3344 }, { "epoch": 1.6066282420749278, "grad_norm": 0.33139336634691174, "learning_rate": 5.252795693604811e-06, "loss": 0.0399, "step": 3345 }, { "epoch": 1.6071085494716617, "grad_norm": 0.31645571503560016, "learning_rate": 5.2500047142446075e-06, "loss": 0.0224, "step": 3346 }, { "epoch": 1.6075888568683958, "grad_norm": 0.327255442484458, "learning_rate": 5.247213656789851e-06, "loss": 0.0293, "step": 3347 }, { "epoch": 1.6080691642651297, "grad_norm": 0.3693096557704257, "learning_rate": 5.244422522112387e-06, "loss": 0.0261, "step": 3348 }, { "epoch": 1.6085494716618636, "grad_norm": 0.25988855421251716, "learning_rate": 5.241631311084089e-06, "loss": 0.0221, "step": 3349 }, { "epoch": 1.6090297790585975, "grad_norm": 0.37771770015606926, "learning_rate": 5.238840024576852e-06, "loss": 0.0355, "step": 3350 }, { "epoch": 1.6095100864553316, "grad_norm": 0.40658089241353523, "learning_rate": 5.236048663462601e-06, "loss": 0.0372, "step": 3351 }, { "epoch": 1.6099903938520654, "grad_norm": 0.6226769535681459, "learning_rate": 5.233257228613279e-06, "loss": 0.0329, "step": 3352 }, { "epoch": 1.6104707012487993, "grad_norm": 0.34251582616727966, "learning_rate": 5.23046572090085e-06, "loss": 0.0199, "step": 3353 }, { "epoch": 1.6109510086455332, "grad_norm": 0.37472317959579066, "learning_rate": 5.227674141197305e-06, "loss": 0.0293, "step": 3354 }, { "epoch": 1.611431316042267, "grad_norm": 0.32657239560374457, "learning_rate": 5.224882490374658e-06, "loss": 0.022, "step": 3355 }, { "epoch": 1.611911623439001, "grad_norm": 0.5205776790033706, "learning_rate": 5.222090769304943e-06, "loss": 0.0381, "step": 3356 }, { "epoch": 1.6123919308357348, "grad_norm": 0.2747102253301205, "learning_rate": 5.219298978860216e-06, "loss": 0.0162, "step": 3357 }, { "epoch": 1.6128722382324687, "grad_norm": 0.2960181524424586, "learning_rate": 5.2165071199125535e-06, "loss": 0.0295, "step": 3358 }, { "epoch": 1.6133525456292026, "grad_norm": 0.3696134080123318, "learning_rate": 5.213715193334058e-06, "loss": 0.0279, "step": 3359 }, { "epoch": 1.6138328530259365, "grad_norm": 0.3787472941587381, "learning_rate": 5.210923199996848e-06, "loss": 0.0299, "step": 3360 }, { "epoch": 1.6143131604226704, "grad_norm": 0.38895713674201926, "learning_rate": 5.208131140773065e-06, "loss": 0.0443, "step": 3361 }, { "epoch": 1.6147934678194045, "grad_norm": 0.86496531381608, "learning_rate": 5.2053390165348725e-06, "loss": 0.0352, "step": 3362 }, { "epoch": 1.6152737752161384, "grad_norm": 0.3794247330992399, "learning_rate": 5.202546828154452e-06, "loss": 0.0373, "step": 3363 }, { "epoch": 1.6157540826128722, "grad_norm": 0.24782123507678933, "learning_rate": 5.199754576504006e-06, "loss": 0.0238, "step": 3364 }, { "epoch": 1.6162343900096061, "grad_norm": 0.28449530126472505, "learning_rate": 5.196962262455755e-06, "loss": 0.028, "step": 3365 }, { "epoch": 1.6167146974063402, "grad_norm": 0.3131002380074779, "learning_rate": 5.194169886881945e-06, "loss": 0.0191, "step": 3366 }, { "epoch": 1.617195004803074, "grad_norm": 1.0198349157010196, "learning_rate": 5.191377450654832e-06, "loss": 0.025, "step": 3367 }, { "epoch": 1.617675312199808, "grad_norm": 0.38946103267468796, "learning_rate": 5.188584954646698e-06, "loss": 0.0332, "step": 3368 }, { "epoch": 1.6181556195965419, "grad_norm": 0.5425091710575838, "learning_rate": 5.185792399729842e-06, "loss": 0.0263, "step": 3369 }, { "epoch": 1.6186359269932757, "grad_norm": 0.4591738769120489, "learning_rate": 5.182999786776581e-06, "loss": 0.0287, "step": 3370 }, { "epoch": 1.6191162343900096, "grad_norm": 0.3660180782137811, "learning_rate": 5.180207116659249e-06, "loss": 0.0263, "step": 3371 }, { "epoch": 1.6195965417867435, "grad_norm": 0.38188207821664805, "learning_rate": 5.177414390250198e-06, "loss": 0.0294, "step": 3372 }, { "epoch": 1.6200768491834774, "grad_norm": 0.3377402563353725, "learning_rate": 5.1746216084218e-06, "loss": 0.0282, "step": 3373 }, { "epoch": 1.6205571565802113, "grad_norm": 0.6214543076015218, "learning_rate": 5.171828772046444e-06, "loss": 0.0376, "step": 3374 }, { "epoch": 1.6210374639769451, "grad_norm": 0.31621952713649343, "learning_rate": 5.1690358819965334e-06, "loss": 0.0303, "step": 3375 }, { "epoch": 1.621517771373679, "grad_norm": 0.30797333836702395, "learning_rate": 5.166242939144488e-06, "loss": 0.0225, "step": 3376 }, { "epoch": 1.621998078770413, "grad_norm": 0.37740160043097826, "learning_rate": 5.163449944362748e-06, "loss": 0.0315, "step": 3377 }, { "epoch": 1.622478386167147, "grad_norm": 0.4651745999851052, "learning_rate": 5.1606568985237685e-06, "loss": 0.0275, "step": 3378 }, { "epoch": 1.622958693563881, "grad_norm": 0.3270087864359758, "learning_rate": 5.157863802500017e-06, "loss": 0.0377, "step": 3379 }, { "epoch": 1.6234390009606148, "grad_norm": 0.38116201889036966, "learning_rate": 5.155070657163982e-06, "loss": 0.0367, "step": 3380 }, { "epoch": 1.6239193083573487, "grad_norm": 0.39589750038312604, "learning_rate": 5.152277463388164e-06, "loss": 0.0196, "step": 3381 }, { "epoch": 1.6243996157540828, "grad_norm": 0.4399648297970043, "learning_rate": 5.149484222045078e-06, "loss": 0.0283, "step": 3382 }, { "epoch": 1.6248799231508166, "grad_norm": 0.7821919769702612, "learning_rate": 5.146690934007255e-06, "loss": 0.0256, "step": 3383 }, { "epoch": 1.6253602305475505, "grad_norm": 0.5039097166102381, "learning_rate": 5.143897600147246e-06, "loss": 0.0467, "step": 3384 }, { "epoch": 1.6258405379442844, "grad_norm": 0.6586198159593792, "learning_rate": 5.141104221337608e-06, "loss": 0.0297, "step": 3385 }, { "epoch": 1.6263208453410183, "grad_norm": 0.4855350395833944, "learning_rate": 5.138310798450912e-06, "loss": 0.042, "step": 3386 }, { "epoch": 1.6268011527377522, "grad_norm": 0.28136049137645497, "learning_rate": 5.135517332359749e-06, "loss": 0.03, "step": 3387 }, { "epoch": 1.627281460134486, "grad_norm": 0.456515201179332, "learning_rate": 5.132723823936724e-06, "loss": 0.0268, "step": 3388 }, { "epoch": 1.62776176753122, "grad_norm": 0.37127724644408616, "learning_rate": 5.129930274054446e-06, "loss": 0.0274, "step": 3389 }, { "epoch": 1.6282420749279538, "grad_norm": 0.4704834709167351, "learning_rate": 5.127136683585543e-06, "loss": 0.0275, "step": 3390 }, { "epoch": 1.6287223823246877, "grad_norm": 0.3692905527760252, "learning_rate": 5.1243430534026596e-06, "loss": 0.0272, "step": 3391 }, { "epoch": 1.6292026897214216, "grad_norm": 0.45928918844909616, "learning_rate": 5.1215493843784454e-06, "loss": 0.0369, "step": 3392 }, { "epoch": 1.6296829971181557, "grad_norm": 0.3495979720464528, "learning_rate": 5.118755677385569e-06, "loss": 0.0274, "step": 3393 }, { "epoch": 1.6301633045148896, "grad_norm": 0.3956654332295499, "learning_rate": 5.115961933296702e-06, "loss": 0.0316, "step": 3394 }, { "epoch": 1.6306436119116234, "grad_norm": 0.35097117069843564, "learning_rate": 5.1131681529845345e-06, "loss": 0.0287, "step": 3395 }, { "epoch": 1.6311239193083573, "grad_norm": 0.3047543833600572, "learning_rate": 5.110374337321767e-06, "loss": 0.0278, "step": 3396 }, { "epoch": 1.6316042267050914, "grad_norm": 0.2810431454823304, "learning_rate": 5.107580487181112e-06, "loss": 0.0218, "step": 3397 }, { "epoch": 1.6320845341018253, "grad_norm": 1.024655944616989, "learning_rate": 5.104786603435288e-06, "loss": 0.0345, "step": 3398 }, { "epoch": 1.6325648414985592, "grad_norm": 0.4967627063375021, "learning_rate": 5.101992686957028e-06, "loss": 0.0317, "step": 3399 }, { "epoch": 1.633045148895293, "grad_norm": 0.4165579215207955, "learning_rate": 5.099198738619073e-06, "loss": 0.0235, "step": 3400 }, { "epoch": 1.633525456292027, "grad_norm": 0.3314460003886491, "learning_rate": 5.096404759294178e-06, "loss": 0.0256, "step": 3401 }, { "epoch": 1.6340057636887608, "grad_norm": 0.5364202395159503, "learning_rate": 5.093610749855103e-06, "loss": 0.0412, "step": 3402 }, { "epoch": 1.6344860710854947, "grad_norm": 0.28215404060873045, "learning_rate": 5.09081671117462e-06, "loss": 0.0273, "step": 3403 }, { "epoch": 1.6349663784822286, "grad_norm": 0.6748061584723536, "learning_rate": 5.088022644125507e-06, "loss": 0.0476, "step": 3404 }, { "epoch": 1.6354466858789625, "grad_norm": 0.3263616427253085, "learning_rate": 5.0852285495805565e-06, "loss": 0.0308, "step": 3405 }, { "epoch": 1.6359269932756964, "grad_norm": 0.4126264762932067, "learning_rate": 5.082434428412566e-06, "loss": 0.0345, "step": 3406 }, { "epoch": 1.6364073006724302, "grad_norm": 0.70025978768579, "learning_rate": 5.07964028149434e-06, "loss": 0.0518, "step": 3407 }, { "epoch": 1.6368876080691641, "grad_norm": 0.4487400837138861, "learning_rate": 5.0768461096986935e-06, "loss": 0.035, "step": 3408 }, { "epoch": 1.6373679154658982, "grad_norm": 0.33763106130518306, "learning_rate": 5.074051913898446e-06, "loss": 0.0228, "step": 3409 }, { "epoch": 1.637848222862632, "grad_norm": 0.4139059485661042, "learning_rate": 5.071257694966431e-06, "loss": 0.0336, "step": 3410 }, { "epoch": 1.638328530259366, "grad_norm": 0.5003664166528021, "learning_rate": 5.068463453775484e-06, "loss": 0.0244, "step": 3411 }, { "epoch": 1.6388088376560999, "grad_norm": 0.4005993664438495, "learning_rate": 5.0656691911984465e-06, "loss": 0.0284, "step": 3412 }, { "epoch": 1.639289145052834, "grad_norm": 0.4109279855941154, "learning_rate": 5.0628749081081715e-06, "loss": 0.021, "step": 3413 }, { "epoch": 1.6397694524495678, "grad_norm": 0.42061038155085634, "learning_rate": 5.060080605377516e-06, "loss": 0.0239, "step": 3414 }, { "epoch": 1.6402497598463017, "grad_norm": 0.3850046703748627, "learning_rate": 5.057286283879339e-06, "loss": 0.0319, "step": 3415 }, { "epoch": 1.6407300672430356, "grad_norm": 0.762616128517838, "learning_rate": 5.054491944486514e-06, "loss": 0.0255, "step": 3416 }, { "epoch": 1.6412103746397695, "grad_norm": 0.401999564035588, "learning_rate": 5.051697588071911e-06, "loss": 0.0402, "step": 3417 }, { "epoch": 1.6416906820365034, "grad_norm": 0.657302285581366, "learning_rate": 5.0489032155084124e-06, "loss": 0.0286, "step": 3418 }, { "epoch": 1.6421709894332372, "grad_norm": 0.36097601376390226, "learning_rate": 5.0461088276689026e-06, "loss": 0.0264, "step": 3419 }, { "epoch": 1.6426512968299711, "grad_norm": 0.3242182980316847, "learning_rate": 5.043314425426271e-06, "loss": 0.0257, "step": 3420 }, { "epoch": 1.643131604226705, "grad_norm": 0.3653828375154603, "learning_rate": 5.040520009653411e-06, "loss": 0.0277, "step": 3421 }, { "epoch": 1.643611911623439, "grad_norm": 0.3842984543604942, "learning_rate": 5.037725581223221e-06, "loss": 0.0288, "step": 3422 }, { "epoch": 1.6440922190201728, "grad_norm": 0.38358615394075884, "learning_rate": 5.0349311410086025e-06, "loss": 0.0319, "step": 3423 }, { "epoch": 1.6445725264169067, "grad_norm": 0.34264291348981935, "learning_rate": 5.032136689882462e-06, "loss": 0.0185, "step": 3424 }, { "epoch": 1.6450528338136408, "grad_norm": 0.28480047848421475, "learning_rate": 5.029342228717709e-06, "loss": 0.0258, "step": 3425 }, { "epoch": 1.6455331412103746, "grad_norm": 0.3263083505568583, "learning_rate": 5.0265477583872566e-06, "loss": 0.0327, "step": 3426 }, { "epoch": 1.6460134486071085, "grad_norm": 0.3026416486427862, "learning_rate": 5.023753279764017e-06, "loss": 0.0175, "step": 3427 }, { "epoch": 1.6464937560038426, "grad_norm": 0.347185263673907, "learning_rate": 5.02095879372091e-06, "loss": 0.0285, "step": 3428 }, { "epoch": 1.6469740634005765, "grad_norm": 0.34321948786419665, "learning_rate": 5.018164301130858e-06, "loss": 0.0266, "step": 3429 }, { "epoch": 1.6474543707973104, "grad_norm": 0.4002854030334408, "learning_rate": 5.01536980286678e-06, "loss": 0.0235, "step": 3430 }, { "epoch": 1.6479346781940443, "grad_norm": 0.2573955404850342, "learning_rate": 5.0125752998015985e-06, "loss": 0.0285, "step": 3431 }, { "epoch": 1.6484149855907781, "grad_norm": 0.4250715658642843, "learning_rate": 5.009780792808243e-06, "loss": 0.0257, "step": 3432 }, { "epoch": 1.648895292987512, "grad_norm": 0.3569499329518998, "learning_rate": 5.006986282759638e-06, "loss": 0.0235, "step": 3433 }, { "epoch": 1.649375600384246, "grad_norm": 0.2754038192533353, "learning_rate": 5.004191770528713e-06, "loss": 0.0221, "step": 3434 }, { "epoch": 1.6498559077809798, "grad_norm": 0.3078170491163154, "learning_rate": 5.001397256988393e-06, "loss": 0.0207, "step": 3435 }, { "epoch": 1.6503362151777137, "grad_norm": 0.377040224333181, "learning_rate": 4.998602743011608e-06, "loss": 0.0229, "step": 3436 }, { "epoch": 1.6508165225744476, "grad_norm": 0.3880505524265007, "learning_rate": 4.995808229471288e-06, "loss": 0.0304, "step": 3437 }, { "epoch": 1.6512968299711814, "grad_norm": 0.2900441078095584, "learning_rate": 4.993013717240363e-06, "loss": 0.0222, "step": 3438 }, { "epoch": 1.6517771373679153, "grad_norm": 0.3828600366896208, "learning_rate": 4.990219207191759e-06, "loss": 0.0242, "step": 3439 }, { "epoch": 1.6522574447646494, "grad_norm": 0.3926986483151745, "learning_rate": 4.987424700198402e-06, "loss": 0.0301, "step": 3440 }, { "epoch": 1.6527377521613833, "grad_norm": 0.3647621474794288, "learning_rate": 4.984630197133224e-06, "loss": 0.0361, "step": 3441 }, { "epoch": 1.6532180595581172, "grad_norm": 0.3714045331670584, "learning_rate": 4.981835698869145e-06, "loss": 0.0233, "step": 3442 }, { "epoch": 1.653698366954851, "grad_norm": 0.4690508076938825, "learning_rate": 4.97904120627909e-06, "loss": 0.0246, "step": 3443 }, { "epoch": 1.6541786743515852, "grad_norm": 0.29214780973607074, "learning_rate": 4.976246720235983e-06, "loss": 0.0167, "step": 3444 }, { "epoch": 1.654658981748319, "grad_norm": 0.3795991923464498, "learning_rate": 4.973452241612745e-06, "loss": 0.028, "step": 3445 }, { "epoch": 1.655139289145053, "grad_norm": 0.5995851987489194, "learning_rate": 4.9706577712822914e-06, "loss": 0.0457, "step": 3446 }, { "epoch": 1.6556195965417868, "grad_norm": 0.32799705203703855, "learning_rate": 4.96786331011754e-06, "loss": 0.0273, "step": 3447 }, { "epoch": 1.6560999039385207, "grad_norm": 0.3357235866737033, "learning_rate": 4.965068858991399e-06, "loss": 0.0252, "step": 3448 }, { "epoch": 1.6565802113352546, "grad_norm": 0.4909422291586557, "learning_rate": 4.962274418776781e-06, "loss": 0.0447, "step": 3449 }, { "epoch": 1.6570605187319885, "grad_norm": 0.6545415508900848, "learning_rate": 4.959479990346591e-06, "loss": 0.0483, "step": 3450 }, { "epoch": 1.6575408261287223, "grad_norm": 0.44476331879941605, "learning_rate": 4.95668557457373e-06, "loss": 0.0443, "step": 3451 }, { "epoch": 1.6580211335254562, "grad_norm": 0.28018344710905063, "learning_rate": 4.953891172331098e-06, "loss": 0.0249, "step": 3452 }, { "epoch": 1.65850144092219, "grad_norm": 0.3712902417389291, "learning_rate": 4.951096784491588e-06, "loss": 0.0295, "step": 3453 }, { "epoch": 1.658981748318924, "grad_norm": 0.339507193897664, "learning_rate": 4.94830241192809e-06, "loss": 0.0236, "step": 3454 }, { "epoch": 1.6594620557156579, "grad_norm": 0.2544308718860551, "learning_rate": 4.945508055513488e-06, "loss": 0.0204, "step": 3455 }, { "epoch": 1.659942363112392, "grad_norm": 0.4634287214649988, "learning_rate": 4.942713716120662e-06, "loss": 0.0379, "step": 3456 }, { "epoch": 1.6604226705091258, "grad_norm": 0.364737636273259, "learning_rate": 4.939919394622487e-06, "loss": 0.0227, "step": 3457 }, { "epoch": 1.6609029779058597, "grad_norm": 0.34521761983673577, "learning_rate": 4.9371250918918285e-06, "loss": 0.0266, "step": 3458 }, { "epoch": 1.6613832853025938, "grad_norm": 0.41212937045275727, "learning_rate": 4.9343308088015535e-06, "loss": 0.0285, "step": 3459 }, { "epoch": 1.6618635926993277, "grad_norm": 0.477373817125457, "learning_rate": 4.931536546224517e-06, "loss": 0.0377, "step": 3460 }, { "epoch": 1.6623439000960616, "grad_norm": 0.6536102547156083, "learning_rate": 4.92874230503357e-06, "loss": 0.0342, "step": 3461 }, { "epoch": 1.6628242074927955, "grad_norm": 0.3829415722916596, "learning_rate": 4.925948086101556e-06, "loss": 0.023, "step": 3462 }, { "epoch": 1.6633045148895294, "grad_norm": 0.3274573169556948, "learning_rate": 4.92315389030131e-06, "loss": 0.025, "step": 3463 }, { "epoch": 1.6637848222862632, "grad_norm": 0.36333784214997705, "learning_rate": 4.920359718505663e-06, "loss": 0.0209, "step": 3464 }, { "epoch": 1.6642651296829971, "grad_norm": 0.31305351520310537, "learning_rate": 4.917565571587435e-06, "loss": 0.0224, "step": 3465 }, { "epoch": 1.664745437079731, "grad_norm": 0.47781211292097503, "learning_rate": 4.9147714504194434e-06, "loss": 0.0385, "step": 3466 }, { "epoch": 1.6652257444764649, "grad_norm": 0.2939222430539397, "learning_rate": 4.911977355874494e-06, "loss": 0.0208, "step": 3467 }, { "epoch": 1.6657060518731988, "grad_norm": 0.3960347672700225, "learning_rate": 4.909183288825382e-06, "loss": 0.0327, "step": 3468 }, { "epoch": 1.6661863592699326, "grad_norm": 0.39482860912584333, "learning_rate": 4.906389250144898e-06, "loss": 0.0313, "step": 3469 }, { "epoch": 1.6666666666666665, "grad_norm": 0.46533201855938944, "learning_rate": 4.903595240705824e-06, "loss": 0.0398, "step": 3470 }, { "epoch": 1.6671469740634006, "grad_norm": 0.4196844411205205, "learning_rate": 4.9008012613809285e-06, "loss": 0.029, "step": 3471 }, { "epoch": 1.6676272814601345, "grad_norm": 0.967733980182863, "learning_rate": 4.898007313042975e-06, "loss": 0.0336, "step": 3472 }, { "epoch": 1.6681075888568684, "grad_norm": 0.6195631127351312, "learning_rate": 4.895213396564713e-06, "loss": 0.0296, "step": 3473 }, { "epoch": 1.6685878962536023, "grad_norm": 0.4038057660605518, "learning_rate": 4.89241951281889e-06, "loss": 0.0303, "step": 3474 }, { "epoch": 1.6690682036503364, "grad_norm": 0.3530252380539052, "learning_rate": 4.8896256626782335e-06, "loss": 0.0285, "step": 3475 }, { "epoch": 1.6695485110470702, "grad_norm": 0.4864095180850793, "learning_rate": 4.886831847015467e-06, "loss": 0.0345, "step": 3476 }, { "epoch": 1.6700288184438041, "grad_norm": 0.38628988161984606, "learning_rate": 4.884038066703301e-06, "loss": 0.0253, "step": 3477 }, { "epoch": 1.670509125840538, "grad_norm": 0.42693164348336704, "learning_rate": 4.881244322614434e-06, "loss": 0.0296, "step": 3478 }, { "epoch": 1.670989433237272, "grad_norm": 0.2575223884564015, "learning_rate": 4.878450615621555e-06, "loss": 0.022, "step": 3479 }, { "epoch": 1.6714697406340058, "grad_norm": 0.38834241163228445, "learning_rate": 4.87565694659734e-06, "loss": 0.0246, "step": 3480 }, { "epoch": 1.6719500480307397, "grad_norm": 0.31675066042333444, "learning_rate": 4.8728633164144565e-06, "loss": 0.0254, "step": 3481 }, { "epoch": 1.6724303554274735, "grad_norm": 0.496997920275487, "learning_rate": 4.870069725945556e-06, "loss": 0.0326, "step": 3482 }, { "epoch": 1.6729106628242074, "grad_norm": 0.24266466084633093, "learning_rate": 4.867276176063278e-06, "loss": 0.019, "step": 3483 }, { "epoch": 1.6733909702209413, "grad_norm": 0.40678935977650016, "learning_rate": 4.8644826676402515e-06, "loss": 0.0332, "step": 3484 }, { "epoch": 1.6738712776176752, "grad_norm": 0.33682148231188475, "learning_rate": 4.8616892015490905e-06, "loss": 0.0248, "step": 3485 }, { "epoch": 1.674351585014409, "grad_norm": 0.45965974804247106, "learning_rate": 4.858895778662396e-06, "loss": 0.0318, "step": 3486 }, { "epoch": 1.6748318924111432, "grad_norm": 0.44175200618655996, "learning_rate": 4.856102399852755e-06, "loss": 0.0249, "step": 3487 }, { "epoch": 1.675312199807877, "grad_norm": 0.3349571007176208, "learning_rate": 4.8533090659927446e-06, "loss": 0.0259, "step": 3488 }, { "epoch": 1.675792507204611, "grad_norm": 0.33964639860194756, "learning_rate": 4.850515777954924e-06, "loss": 0.0326, "step": 3489 }, { "epoch": 1.6762728146013448, "grad_norm": 0.32229862321408365, "learning_rate": 4.847722536611839e-06, "loss": 0.0308, "step": 3490 }, { "epoch": 1.676753121998079, "grad_norm": 0.32211286020846086, "learning_rate": 4.84492934283602e-06, "loss": 0.0233, "step": 3491 }, { "epoch": 1.6772334293948128, "grad_norm": 0.36702723965403933, "learning_rate": 4.842136197499985e-06, "loss": 0.0272, "step": 3492 }, { "epoch": 1.6777137367915467, "grad_norm": 0.30020641761601, "learning_rate": 4.839343101476235e-06, "loss": 0.0262, "step": 3493 }, { "epoch": 1.6781940441882806, "grad_norm": 0.5087659355505825, "learning_rate": 4.836550055637254e-06, "loss": 0.0375, "step": 3494 }, { "epoch": 1.6786743515850144, "grad_norm": 0.3128856595000371, "learning_rate": 4.8337570608555125e-06, "loss": 0.0244, "step": 3495 }, { "epoch": 1.6791546589817483, "grad_norm": 0.3410820290660685, "learning_rate": 4.830964118003468e-06, "loss": 0.0276, "step": 3496 }, { "epoch": 1.6796349663784822, "grad_norm": 0.3255137331981828, "learning_rate": 4.828171227953557e-06, "loss": 0.0314, "step": 3497 }, { "epoch": 1.680115273775216, "grad_norm": 0.328343747061433, "learning_rate": 4.825378391578201e-06, "loss": 0.0287, "step": 3498 }, { "epoch": 1.68059558117195, "grad_norm": 0.28567128371711453, "learning_rate": 4.822585609749804e-06, "loss": 0.0202, "step": 3499 }, { "epoch": 1.6810758885686838, "grad_norm": 0.3269653348804731, "learning_rate": 4.8197928833407534e-06, "loss": 0.0243, "step": 3500 }, { "epoch": 1.6815561959654177, "grad_norm": 0.29056836054314467, "learning_rate": 4.8170002132234215e-06, "loss": 0.0199, "step": 3501 }, { "epoch": 1.6820365033621518, "grad_norm": 0.4062417961150477, "learning_rate": 4.814207600270159e-06, "loss": 0.0299, "step": 3502 }, { "epoch": 1.6825168107588857, "grad_norm": 1.106136867393574, "learning_rate": 4.811415045353303e-06, "loss": 0.0294, "step": 3503 }, { "epoch": 1.6829971181556196, "grad_norm": 0.3797260213616591, "learning_rate": 4.808622549345169e-06, "loss": 0.0248, "step": 3504 }, { "epoch": 1.6834774255523535, "grad_norm": 0.3984892574071235, "learning_rate": 4.805830113118057e-06, "loss": 0.0387, "step": 3505 }, { "epoch": 1.6839577329490876, "grad_norm": 0.3449436161984806, "learning_rate": 4.803037737544247e-06, "loss": 0.0253, "step": 3506 }, { "epoch": 1.6844380403458215, "grad_norm": 0.3845095057538799, "learning_rate": 4.800245423495997e-06, "loss": 0.0285, "step": 3507 }, { "epoch": 1.6849183477425553, "grad_norm": 0.29757022065022476, "learning_rate": 4.7974531718455505e-06, "loss": 0.0237, "step": 3508 }, { "epoch": 1.6853986551392892, "grad_norm": 0.31198928831896994, "learning_rate": 4.7946609834651274e-06, "loss": 0.0196, "step": 3509 }, { "epoch": 1.685878962536023, "grad_norm": 0.3580416632228998, "learning_rate": 4.791868859226936e-06, "loss": 0.0281, "step": 3510 }, { "epoch": 1.686359269932757, "grad_norm": 0.47487537687932174, "learning_rate": 4.789076800003154e-06, "loss": 0.0307, "step": 3511 }, { "epoch": 1.6868395773294909, "grad_norm": 0.30616215867226537, "learning_rate": 4.786284806665944e-06, "loss": 0.0242, "step": 3512 }, { "epoch": 1.6873198847262247, "grad_norm": 0.3405855229064906, "learning_rate": 4.783492880087447e-06, "loss": 0.026, "step": 3513 }, { "epoch": 1.6878001921229586, "grad_norm": 0.4299147646469349, "learning_rate": 4.780701021139786e-06, "loss": 0.023, "step": 3514 }, { "epoch": 1.6882804995196925, "grad_norm": 0.33663086256974706, "learning_rate": 4.7779092306950586e-06, "loss": 0.0253, "step": 3515 }, { "epoch": 1.6887608069164264, "grad_norm": 0.33211314140357895, "learning_rate": 4.775117509625344e-06, "loss": 0.0292, "step": 3516 }, { "epoch": 1.6892411143131603, "grad_norm": 0.3912062069560051, "learning_rate": 4.7723258588026955e-06, "loss": 0.025, "step": 3517 }, { "epoch": 1.6897214217098944, "grad_norm": 0.346702646371921, "learning_rate": 4.769534279099151e-06, "loss": 0.0269, "step": 3518 }, { "epoch": 1.6902017291066282, "grad_norm": 1.0507955581233808, "learning_rate": 4.766742771386723e-06, "loss": 0.0406, "step": 3519 }, { "epoch": 1.6906820365033621, "grad_norm": 0.8523092964312573, "learning_rate": 4.763951336537401e-06, "loss": 0.0354, "step": 3520 }, { "epoch": 1.691162343900096, "grad_norm": 0.33558115998270216, "learning_rate": 4.761159975423149e-06, "loss": 0.022, "step": 3521 }, { "epoch": 1.6916426512968301, "grad_norm": 0.3741946750854459, "learning_rate": 4.7583686889159145e-06, "loss": 0.0336, "step": 3522 }, { "epoch": 1.692122958693564, "grad_norm": 0.33329520429116066, "learning_rate": 4.755577477887615e-06, "loss": 0.0279, "step": 3523 }, { "epoch": 1.6926032660902979, "grad_norm": 0.4751609254364396, "learning_rate": 4.7527863432101495e-06, "loss": 0.024, "step": 3524 }, { "epoch": 1.6930835734870318, "grad_norm": 0.34949876241143957, "learning_rate": 4.7499952857553924e-06, "loss": 0.026, "step": 3525 }, { "epoch": 1.6935638808837656, "grad_norm": 0.4092994720385161, "learning_rate": 4.747204306395191e-06, "loss": 0.0392, "step": 3526 }, { "epoch": 1.6940441882804995, "grad_norm": 0.36277593610348113, "learning_rate": 4.744413406001371e-06, "loss": 0.0275, "step": 3527 }, { "epoch": 1.6945244956772334, "grad_norm": 0.31079784256390514, "learning_rate": 4.741622585445731e-06, "loss": 0.026, "step": 3528 }, { "epoch": 1.6950048030739673, "grad_norm": 0.3837353818642693, "learning_rate": 4.73883184560005e-06, "loss": 0.0363, "step": 3529 }, { "epoch": 1.6954851104707012, "grad_norm": 0.4226276276992672, "learning_rate": 4.736041187336073e-06, "loss": 0.0226, "step": 3530 }, { "epoch": 1.695965417867435, "grad_norm": 0.36932663541845223, "learning_rate": 4.733250611525524e-06, "loss": 0.0323, "step": 3531 }, { "epoch": 1.696445725264169, "grad_norm": 0.3649519773605918, "learning_rate": 4.7304601190401076e-06, "loss": 0.0297, "step": 3532 }, { "epoch": 1.696926032660903, "grad_norm": 0.7170202126670897, "learning_rate": 4.727669710751494e-06, "loss": 0.0442, "step": 3533 }, { "epoch": 1.697406340057637, "grad_norm": 0.2983588717180754, "learning_rate": 4.724879387531327e-06, "loss": 0.0181, "step": 3534 }, { "epoch": 1.6978866474543708, "grad_norm": 0.24260190138991256, "learning_rate": 4.722089150251228e-06, "loss": 0.0206, "step": 3535 }, { "epoch": 1.6983669548511047, "grad_norm": 0.3378509934719898, "learning_rate": 4.719298999782791e-06, "loss": 0.0258, "step": 3536 }, { "epoch": 1.6988472622478388, "grad_norm": 0.34636088625700207, "learning_rate": 4.716508936997579e-06, "loss": 0.0224, "step": 3537 }, { "epoch": 1.6993275696445727, "grad_norm": 0.49707513307250756, "learning_rate": 4.7137189627671334e-06, "loss": 0.0416, "step": 3538 }, { "epoch": 1.6998078770413065, "grad_norm": 0.27775249634430255, "learning_rate": 4.7109290779629625e-06, "loss": 0.0214, "step": 3539 }, { "epoch": 1.7002881844380404, "grad_norm": 0.5108380870286501, "learning_rate": 4.708139283456551e-06, "loss": 0.0269, "step": 3540 }, { "epoch": 1.7007684918347743, "grad_norm": 0.7050045916539299, "learning_rate": 4.705349580119353e-06, "loss": 0.0189, "step": 3541 }, { "epoch": 1.7012487992315082, "grad_norm": 0.4398564191681809, "learning_rate": 4.702559968822795e-06, "loss": 0.0464, "step": 3542 }, { "epoch": 1.701729106628242, "grad_norm": 0.39993175389975383, "learning_rate": 4.699770450438275e-06, "loss": 0.0449, "step": 3543 }, { "epoch": 1.702209414024976, "grad_norm": 0.4468264667134198, "learning_rate": 4.696981025837159e-06, "loss": 0.0259, "step": 3544 }, { "epoch": 1.7026897214217098, "grad_norm": 0.4435853482763639, "learning_rate": 4.694191695890788e-06, "loss": 0.0296, "step": 3545 }, { "epoch": 1.7031700288184437, "grad_norm": 0.3624310987941425, "learning_rate": 4.691402461470469e-06, "loss": 0.0351, "step": 3546 }, { "epoch": 1.7036503362151776, "grad_norm": 0.3996923192763031, "learning_rate": 4.6886133234474854e-06, "loss": 0.0293, "step": 3547 }, { "epoch": 1.7041306436119115, "grad_norm": 0.3824154502736305, "learning_rate": 4.685824282693085e-06, "loss": 0.0331, "step": 3548 }, { "epoch": 1.7046109510086456, "grad_norm": 0.3720324764672001, "learning_rate": 4.683035340078486e-06, "loss": 0.0405, "step": 3549 }, { "epoch": 1.7050912584053795, "grad_norm": 0.904361337754094, "learning_rate": 4.680246496474879e-06, "loss": 0.0392, "step": 3550 }, { "epoch": 1.7055715658021133, "grad_norm": 0.29984111865805063, "learning_rate": 4.6774577527534195e-06, "loss": 0.023, "step": 3551 }, { "epoch": 1.7060518731988472, "grad_norm": 0.24851976131052758, "learning_rate": 4.674669109785236e-06, "loss": 0.0223, "step": 3552 }, { "epoch": 1.7065321805955813, "grad_norm": 0.2766078029114044, "learning_rate": 4.671880568441419e-06, "loss": 0.0222, "step": 3553 }, { "epoch": 1.7070124879923152, "grad_norm": 0.3881522416468508, "learning_rate": 4.669092129593037e-06, "loss": 0.0338, "step": 3554 }, { "epoch": 1.707492795389049, "grad_norm": 0.27640792611683035, "learning_rate": 4.666303794111119e-06, "loss": 0.03, "step": 3555 }, { "epoch": 1.707973102785783, "grad_norm": 0.2831006241390689, "learning_rate": 4.663515562866665e-06, "loss": 0.0202, "step": 3556 }, { "epoch": 1.7084534101825168, "grad_norm": 0.32132760223019163, "learning_rate": 4.66072743673064e-06, "loss": 0.0217, "step": 3557 }, { "epoch": 1.7089337175792507, "grad_norm": 0.26937941043877345, "learning_rate": 4.657939416573976e-06, "loss": 0.0207, "step": 3558 }, { "epoch": 1.7094140249759846, "grad_norm": 0.5434026034531317, "learning_rate": 4.655151503267577e-06, "loss": 0.0505, "step": 3559 }, { "epoch": 1.7098943323727185, "grad_norm": 0.31915314635599806, "learning_rate": 4.652363697682307e-06, "loss": 0.0223, "step": 3560 }, { "epoch": 1.7103746397694524, "grad_norm": 0.47793885093331095, "learning_rate": 4.649576000688999e-06, "loss": 0.0362, "step": 3561 }, { "epoch": 1.7108549471661862, "grad_norm": 0.3237250343435186, "learning_rate": 4.646788413158455e-06, "loss": 0.0326, "step": 3562 }, { "epoch": 1.7113352545629201, "grad_norm": 0.3611731392781741, "learning_rate": 4.644000935961439e-06, "loss": 0.0239, "step": 3563 }, { "epoch": 1.7118155619596542, "grad_norm": 0.41755390960498256, "learning_rate": 4.641213569968682e-06, "loss": 0.0302, "step": 3564 }, { "epoch": 1.7122958693563881, "grad_norm": 0.2993994935737092, "learning_rate": 4.63842631605088e-06, "loss": 0.0201, "step": 3565 }, { "epoch": 1.712776176753122, "grad_norm": 0.28751357882349343, "learning_rate": 4.635639175078694e-06, "loss": 0.0213, "step": 3566 }, { "epoch": 1.7132564841498559, "grad_norm": 0.45770068977117406, "learning_rate": 4.6328521479227495e-06, "loss": 0.0325, "step": 3567 }, { "epoch": 1.71373679154659, "grad_norm": 0.5181321091945725, "learning_rate": 4.6300652354536346e-06, "loss": 0.0443, "step": 3568 }, { "epoch": 1.7142170989433239, "grad_norm": 0.43694044343740746, "learning_rate": 4.6272784385419085e-06, "loss": 0.0246, "step": 3569 }, { "epoch": 1.7146974063400577, "grad_norm": 0.2546124017780289, "learning_rate": 4.624491758058086e-06, "loss": 0.0135, "step": 3570 }, { "epoch": 1.7151777137367916, "grad_norm": 0.4945483033762881, "learning_rate": 4.621705194872649e-06, "loss": 0.0313, "step": 3571 }, { "epoch": 1.7156580211335255, "grad_norm": 0.3301312339359128, "learning_rate": 4.618918749856044e-06, "loss": 0.0372, "step": 3572 }, { "epoch": 1.7161383285302594, "grad_norm": 0.29159703531585235, "learning_rate": 4.616132423878679e-06, "loss": 0.0243, "step": 3573 }, { "epoch": 1.7166186359269933, "grad_norm": 0.3177046104580261, "learning_rate": 4.6133462178109246e-06, "loss": 0.0227, "step": 3574 }, { "epoch": 1.7170989433237271, "grad_norm": 0.38123559234927284, "learning_rate": 4.610560132523113e-06, "loss": 0.0404, "step": 3575 }, { "epoch": 1.717579250720461, "grad_norm": 0.5935268957418899, "learning_rate": 4.607774168885545e-06, "loss": 0.0252, "step": 3576 }, { "epoch": 1.718059558117195, "grad_norm": 0.3499353703582889, "learning_rate": 4.604988327768474e-06, "loss": 0.0216, "step": 3577 }, { "epoch": 1.7185398655139288, "grad_norm": 0.2608670715621617, "learning_rate": 4.602202610042121e-06, "loss": 0.0194, "step": 3578 }, { "epoch": 1.7190201729106627, "grad_norm": 0.5711232221428885, "learning_rate": 4.59941701657667e-06, "loss": 0.0292, "step": 3579 }, { "epoch": 1.7195004803073968, "grad_norm": 0.46075530700779316, "learning_rate": 4.596631548242259e-06, "loss": 0.0327, "step": 3580 }, { "epoch": 1.7199807877041307, "grad_norm": 0.5200584067381084, "learning_rate": 4.593846205908993e-06, "loss": 0.0304, "step": 3581 }, { "epoch": 1.7204610951008645, "grad_norm": 0.47548985500266383, "learning_rate": 4.591060990446935e-06, "loss": 0.036, "step": 3582 }, { "epoch": 1.7209414024975984, "grad_norm": 0.36388451615535355, "learning_rate": 4.58827590272611e-06, "loss": 0.0281, "step": 3583 }, { "epoch": 1.7214217098943325, "grad_norm": 0.3217254829431618, "learning_rate": 4.585490943616504e-06, "loss": 0.0273, "step": 3584 }, { "epoch": 1.7219020172910664, "grad_norm": 0.3116691908300809, "learning_rate": 4.5827061139880595e-06, "loss": 0.0301, "step": 3585 }, { "epoch": 1.7223823246878003, "grad_norm": 0.7363470839579178, "learning_rate": 4.579921414710679e-06, "loss": 0.0223, "step": 3586 }, { "epoch": 1.7228626320845342, "grad_norm": 0.7009201566620327, "learning_rate": 4.5771368466542286e-06, "loss": 0.0386, "step": 3587 }, { "epoch": 1.723342939481268, "grad_norm": 0.36456885939218386, "learning_rate": 4.574352410688529e-06, "loss": 0.0286, "step": 3588 }, { "epoch": 1.723823246878002, "grad_norm": 0.6838893558823019, "learning_rate": 4.57156810768336e-06, "loss": 0.0228, "step": 3589 }, { "epoch": 1.7243035542747358, "grad_norm": 0.5451972238111695, "learning_rate": 4.568783938508459e-06, "loss": 0.0298, "step": 3590 }, { "epoch": 1.7247838616714697, "grad_norm": 0.594753385291269, "learning_rate": 4.565999904033528e-06, "loss": 0.0283, "step": 3591 }, { "epoch": 1.7252641690682036, "grad_norm": 0.39476531734312187, "learning_rate": 4.563216005128221e-06, "loss": 0.0345, "step": 3592 }, { "epoch": 1.7257444764649374, "grad_norm": 0.31036609129370213, "learning_rate": 4.56043224266215e-06, "loss": 0.0246, "step": 3593 }, { "epoch": 1.7262247838616713, "grad_norm": 0.5657833690470468, "learning_rate": 4.557648617504885e-06, "loss": 0.0407, "step": 3594 }, { "epoch": 1.7267050912584054, "grad_norm": 0.4570321594787583, "learning_rate": 4.554865130525953e-06, "loss": 0.0319, "step": 3595 }, { "epoch": 1.7271853986551393, "grad_norm": 0.47860374003500955, "learning_rate": 4.552081782594841e-06, "loss": 0.0257, "step": 3596 }, { "epoch": 1.7276657060518732, "grad_norm": 0.33073372906671894, "learning_rate": 4.549298574580987e-06, "loss": 0.028, "step": 3597 }, { "epoch": 1.728146013448607, "grad_norm": 0.3219784618552595, "learning_rate": 4.5465155073537905e-06, "loss": 0.021, "step": 3598 }, { "epoch": 1.7286263208453412, "grad_norm": 0.3773839043724425, "learning_rate": 4.543732581782603e-06, "loss": 0.0274, "step": 3599 }, { "epoch": 1.729106628242075, "grad_norm": 0.34511483448148583, "learning_rate": 4.5409497987367345e-06, "loss": 0.0241, "step": 3600 }, { "epoch": 1.729586935638809, "grad_norm": 0.46619804575766416, "learning_rate": 4.5381671590854495e-06, "loss": 0.027, "step": 3601 }, { "epoch": 1.7300672430355428, "grad_norm": 0.4275332066102676, "learning_rate": 4.5353846636979685e-06, "loss": 0.0287, "step": 3602 }, { "epoch": 1.7305475504322767, "grad_norm": 0.48806867597203707, "learning_rate": 4.532602313443463e-06, "loss": 0.0438, "step": 3603 }, { "epoch": 1.7310278578290106, "grad_norm": 0.3475059732901757, "learning_rate": 4.529820109191065e-06, "loss": 0.027, "step": 3604 }, { "epoch": 1.7315081652257445, "grad_norm": 0.2912817923890908, "learning_rate": 4.527038051809855e-06, "loss": 0.0229, "step": 3605 }, { "epoch": 1.7319884726224783, "grad_norm": 0.37248364480133006, "learning_rate": 4.524256142168874e-06, "loss": 0.0274, "step": 3606 }, { "epoch": 1.7324687800192122, "grad_norm": 0.38812495590984786, "learning_rate": 4.521474381137113e-06, "loss": 0.0291, "step": 3607 }, { "epoch": 1.732949087415946, "grad_norm": 0.6977059708369895, "learning_rate": 4.5186927695835176e-06, "loss": 0.0259, "step": 3608 }, { "epoch": 1.73342939481268, "grad_norm": 0.3961375134469626, "learning_rate": 4.515911308376985e-06, "loss": 0.0435, "step": 3609 }, { "epoch": 1.7339097022094139, "grad_norm": 0.4999146821361793, "learning_rate": 4.513129998386366e-06, "loss": 0.0359, "step": 3610 }, { "epoch": 1.734390009606148, "grad_norm": 0.31666050104965543, "learning_rate": 4.510348840480468e-06, "loss": 0.022, "step": 3611 }, { "epoch": 1.7348703170028819, "grad_norm": 0.393004216752545, "learning_rate": 4.507567835528043e-06, "loss": 0.0219, "step": 3612 }, { "epoch": 1.7353506243996157, "grad_norm": 0.45832272657347073, "learning_rate": 4.504786984397805e-06, "loss": 0.0277, "step": 3613 }, { "epoch": 1.7358309317963496, "grad_norm": 0.5399914972505444, "learning_rate": 4.502006287958413e-06, "loss": 0.0362, "step": 3614 }, { "epoch": 1.7363112391930837, "grad_norm": 0.8022175890255002, "learning_rate": 4.49922574707848e-06, "loss": 0.047, "step": 3615 }, { "epoch": 1.7367915465898176, "grad_norm": 0.2667079842013037, "learning_rate": 4.496445362626568e-06, "loss": 0.0247, "step": 3616 }, { "epoch": 1.7372718539865515, "grad_norm": 0.36214171350034235, "learning_rate": 4.493665135471194e-06, "loss": 0.0361, "step": 3617 }, { "epoch": 1.7377521613832854, "grad_norm": 0.3583954147540581, "learning_rate": 4.4908850664808245e-06, "loss": 0.0244, "step": 3618 }, { "epoch": 1.7382324687800192, "grad_norm": 0.34615632930440154, "learning_rate": 4.488105156523874e-06, "loss": 0.0244, "step": 3619 }, { "epoch": 1.7387127761767531, "grad_norm": 1.1101090549796757, "learning_rate": 4.485325406468711e-06, "loss": 0.0396, "step": 3620 }, { "epoch": 1.739193083573487, "grad_norm": 0.3247215773662764, "learning_rate": 4.482545817183653e-06, "loss": 0.0327, "step": 3621 }, { "epoch": 1.739673390970221, "grad_norm": 0.3537991548689777, "learning_rate": 4.479766389536967e-06, "loss": 0.0229, "step": 3622 }, { "epoch": 1.7401536983669548, "grad_norm": 0.3229462480759579, "learning_rate": 4.476987124396868e-06, "loss": 0.0277, "step": 3623 }, { "epoch": 1.7406340057636887, "grad_norm": 0.26192764403927354, "learning_rate": 4.4742080226315215e-06, "loss": 0.0292, "step": 3624 }, { "epoch": 1.7411143131604225, "grad_norm": 0.3954715690182722, "learning_rate": 4.471429085109043e-06, "loss": 0.0304, "step": 3625 }, { "epoch": 1.7415946205571564, "grad_norm": 0.35553875852980615, "learning_rate": 4.4686503126974955e-06, "loss": 0.0279, "step": 3626 }, { "epoch": 1.7420749279538905, "grad_norm": 0.48903546093872036, "learning_rate": 4.465871706264888e-06, "loss": 0.0355, "step": 3627 }, { "epoch": 1.7425552353506244, "grad_norm": 0.37622041450815663, "learning_rate": 4.463093266679185e-06, "loss": 0.0426, "step": 3628 }, { "epoch": 1.7430355427473583, "grad_norm": 0.2722362781564949, "learning_rate": 4.460314994808292e-06, "loss": 0.0191, "step": 3629 }, { "epoch": 1.7435158501440924, "grad_norm": 0.30252476241570014, "learning_rate": 4.457536891520063e-06, "loss": 0.0239, "step": 3630 }, { "epoch": 1.7439961575408263, "grad_norm": 0.3964243357440774, "learning_rate": 4.454758957682302e-06, "loss": 0.0277, "step": 3631 }, { "epoch": 1.7444764649375601, "grad_norm": 0.35550204531150925, "learning_rate": 4.451981194162758e-06, "loss": 0.0201, "step": 3632 }, { "epoch": 1.744956772334294, "grad_norm": 0.7595431801520169, "learning_rate": 4.449203601829129e-06, "loss": 0.0328, "step": 3633 }, { "epoch": 1.745437079731028, "grad_norm": 0.30696512078788557, "learning_rate": 4.446426181549055e-06, "loss": 0.0242, "step": 3634 }, { "epoch": 1.7459173871277618, "grad_norm": 0.4315839299305404, "learning_rate": 4.4436489341901275e-06, "loss": 0.0325, "step": 3635 }, { "epoch": 1.7463976945244957, "grad_norm": 0.35043173985937154, "learning_rate": 4.440871860619882e-06, "loss": 0.0445, "step": 3636 }, { "epoch": 1.7468780019212296, "grad_norm": 0.3302898037993392, "learning_rate": 4.438094961705798e-06, "loss": 0.0225, "step": 3637 }, { "epoch": 1.7473583093179634, "grad_norm": 0.39229788643778574, "learning_rate": 4.435318238315305e-06, "loss": 0.0236, "step": 3638 }, { "epoch": 1.7478386167146973, "grad_norm": 0.3101528140517937, "learning_rate": 4.4325416913157706e-06, "loss": 0.0232, "step": 3639 }, { "epoch": 1.7483189241114312, "grad_norm": 0.4192693398532501, "learning_rate": 4.429765321574512e-06, "loss": 0.0325, "step": 3640 }, { "epoch": 1.748799231508165, "grad_norm": 0.3869441651098886, "learning_rate": 4.426989129958791e-06, "loss": 0.0274, "step": 3641 }, { "epoch": 1.7492795389048992, "grad_norm": 0.4645948269126304, "learning_rate": 4.424213117335815e-06, "loss": 0.0308, "step": 3642 }, { "epoch": 1.749759846301633, "grad_norm": 0.3058723172034332, "learning_rate": 4.4214372845727305e-06, "loss": 0.0301, "step": 3643 }, { "epoch": 1.750240153698367, "grad_norm": 0.2949494745730839, "learning_rate": 4.418661632536633e-06, "loss": 0.0235, "step": 3644 }, { "epoch": 1.7507204610951008, "grad_norm": 0.5228878434122466, "learning_rate": 4.4158861620945584e-06, "loss": 0.0391, "step": 3645 }, { "epoch": 1.751200768491835, "grad_norm": 0.3835887943029605, "learning_rate": 4.413110874113487e-06, "loss": 0.0261, "step": 3646 }, { "epoch": 1.7516810758885688, "grad_norm": 0.4029010314444043, "learning_rate": 4.4103357694603425e-06, "loss": 0.0361, "step": 3647 }, { "epoch": 1.7521613832853027, "grad_norm": 0.4954305287847675, "learning_rate": 4.40756084900199e-06, "loss": 0.0427, "step": 3648 }, { "epoch": 1.7526416906820366, "grad_norm": 0.2836294571947477, "learning_rate": 4.404786113605236e-06, "loss": 0.0234, "step": 3649 }, { "epoch": 1.7531219980787704, "grad_norm": 0.3888595548785155, "learning_rate": 4.402011564136835e-06, "loss": 0.0393, "step": 3650 }, { "epoch": 1.7536023054755043, "grad_norm": 0.346484912278926, "learning_rate": 4.39923720146348e-06, "loss": 0.0251, "step": 3651 }, { "epoch": 1.7540826128722382, "grad_norm": 0.4619538731593112, "learning_rate": 4.396463026451801e-06, "loss": 0.0283, "step": 3652 }, { "epoch": 1.754562920268972, "grad_norm": 0.3664845051856329, "learning_rate": 4.393689039968376e-06, "loss": 0.0325, "step": 3653 }, { "epoch": 1.755043227665706, "grad_norm": 0.26652527073356225, "learning_rate": 4.3909152428797206e-06, "loss": 0.0229, "step": 3654 }, { "epoch": 1.7555235350624399, "grad_norm": 0.5921924260746761, "learning_rate": 4.388141636052293e-06, "loss": 0.0442, "step": 3655 }, { "epoch": 1.7560038424591737, "grad_norm": 0.3638652310031289, "learning_rate": 4.385368220352489e-06, "loss": 0.0267, "step": 3656 }, { "epoch": 1.7564841498559076, "grad_norm": 0.30082539070712383, "learning_rate": 4.382594996646652e-06, "loss": 0.02, "step": 3657 }, { "epoch": 1.7569644572526417, "grad_norm": 0.4369712941454417, "learning_rate": 4.379821965801056e-06, "loss": 0.0347, "step": 3658 }, { "epoch": 1.7574447646493756, "grad_norm": 0.442114891387335, "learning_rate": 4.377049128681921e-06, "loss": 0.0368, "step": 3659 }, { "epoch": 1.7579250720461095, "grad_norm": 0.6042319109180055, "learning_rate": 4.374276486155403e-06, "loss": 0.0322, "step": 3660 }, { "epoch": 1.7584053794428436, "grad_norm": 0.5233288391768507, "learning_rate": 4.371504039087602e-06, "loss": 0.0216, "step": 3661 }, { "epoch": 1.7588856868395775, "grad_norm": 0.3121681324524714, "learning_rate": 4.36873178834455e-06, "loss": 0.027, "step": 3662 }, { "epoch": 1.7593659942363113, "grad_norm": 0.417385373867633, "learning_rate": 4.3659597347922215e-06, "loss": 0.0234, "step": 3663 }, { "epoch": 1.7598463016330452, "grad_norm": 0.28603883880878606, "learning_rate": 4.363187879296534e-06, "loss": 0.0226, "step": 3664 }, { "epoch": 1.760326609029779, "grad_norm": 0.4394620959204002, "learning_rate": 4.3604162227233335e-06, "loss": 0.0341, "step": 3665 }, { "epoch": 1.760806916426513, "grad_norm": 0.45265837759354915, "learning_rate": 4.357644765938412e-06, "loss": 0.0297, "step": 3666 }, { "epoch": 1.7612872238232469, "grad_norm": 0.4977236040883337, "learning_rate": 4.354873509807493e-06, "loss": 0.0461, "step": 3667 }, { "epoch": 1.7617675312199808, "grad_norm": 0.31677516571201797, "learning_rate": 4.3521024551962435e-06, "loss": 0.0244, "step": 3668 }, { "epoch": 1.7622478386167146, "grad_norm": 0.329473109197052, "learning_rate": 4.349331602970263e-06, "loss": 0.0225, "step": 3669 }, { "epoch": 1.7627281460134485, "grad_norm": 0.7461893872185981, "learning_rate": 4.3465609539950885e-06, "loss": 0.063, "step": 3670 }, { "epoch": 1.7632084534101824, "grad_norm": 0.3283309474536216, "learning_rate": 4.343790509136191e-06, "loss": 0.025, "step": 3671 }, { "epoch": 1.7636887608069163, "grad_norm": 0.21809245274763908, "learning_rate": 4.341020269258987e-06, "loss": 0.0182, "step": 3672 }, { "epoch": 1.7641690682036504, "grad_norm": 0.3990350824372215, "learning_rate": 4.33825023522882e-06, "loss": 0.0293, "step": 3673 }, { "epoch": 1.7646493756003843, "grad_norm": 0.38631002008137466, "learning_rate": 4.335480407910973e-06, "loss": 0.0333, "step": 3674 }, { "epoch": 1.7651296829971181, "grad_norm": 0.4490941848399881, "learning_rate": 4.332710788170661e-06, "loss": 0.0346, "step": 3675 }, { "epoch": 1.765609990393852, "grad_norm": 0.31314546255425163, "learning_rate": 4.329941376873038e-06, "loss": 0.0236, "step": 3676 }, { "epoch": 1.7660902977905861, "grad_norm": 0.3196039664058349, "learning_rate": 4.327172174883192e-06, "loss": 0.0347, "step": 3677 }, { "epoch": 1.76657060518732, "grad_norm": 0.24423510579592286, "learning_rate": 4.324403183066143e-06, "loss": 0.022, "step": 3678 }, { "epoch": 1.767050912584054, "grad_norm": 0.36605071107599696, "learning_rate": 4.321634402286851e-06, "loss": 0.0292, "step": 3679 }, { "epoch": 1.7675312199807878, "grad_norm": 0.36293740058186014, "learning_rate": 4.318865833410203e-06, "loss": 0.0316, "step": 3680 }, { "epoch": 1.7680115273775217, "grad_norm": 0.308635414499735, "learning_rate": 4.3160974773010255e-06, "loss": 0.0254, "step": 3681 }, { "epoch": 1.7684918347742555, "grad_norm": 0.33032163062953057, "learning_rate": 4.313329334824076e-06, "loss": 0.0239, "step": 3682 }, { "epoch": 1.7689721421709894, "grad_norm": 0.3980287505415847, "learning_rate": 4.310561406844045e-06, "loss": 0.0323, "step": 3683 }, { "epoch": 1.7694524495677233, "grad_norm": 0.3659316690355484, "learning_rate": 4.307793694225558e-06, "loss": 0.0326, "step": 3684 }, { "epoch": 1.7699327569644572, "grad_norm": 0.25708060602449595, "learning_rate": 4.305026197833168e-06, "loss": 0.0241, "step": 3685 }, { "epoch": 1.770413064361191, "grad_norm": 0.2612189511289192, "learning_rate": 4.302258918531369e-06, "loss": 0.0209, "step": 3686 }, { "epoch": 1.770893371757925, "grad_norm": 0.2559870630900653, "learning_rate": 4.299491857184582e-06, "loss": 0.0231, "step": 3687 }, { "epoch": 1.7713736791546588, "grad_norm": 0.30250831859397215, "learning_rate": 4.296725014657157e-06, "loss": 0.0221, "step": 3688 }, { "epoch": 1.771853986551393, "grad_norm": 0.33481379895209173, "learning_rate": 4.293958391813382e-06, "loss": 0.02, "step": 3689 }, { "epoch": 1.7723342939481268, "grad_norm": 0.40252315709068365, "learning_rate": 4.291191989517472e-06, "loss": 0.0249, "step": 3690 }, { "epoch": 1.7728146013448607, "grad_norm": 0.3191154588420702, "learning_rate": 4.2884258086335755e-06, "loss": 0.0246, "step": 3691 }, { "epoch": 1.7732949087415946, "grad_norm": 0.25282433679783023, "learning_rate": 4.285659850025769e-06, "loss": 0.0151, "step": 3692 }, { "epoch": 1.7737752161383287, "grad_norm": 0.24294728652253242, "learning_rate": 4.282894114558064e-06, "loss": 0.0268, "step": 3693 }, { "epoch": 1.7742555235350626, "grad_norm": 0.31263557086627686, "learning_rate": 4.280128603094399e-06, "loss": 0.0221, "step": 3694 }, { "epoch": 1.7747358309317964, "grad_norm": 0.2708939753568489, "learning_rate": 4.277363316498643e-06, "loss": 0.023, "step": 3695 }, { "epoch": 1.7752161383285303, "grad_norm": 0.3130445700660059, "learning_rate": 4.2745982556345935e-06, "loss": 0.0198, "step": 3696 }, { "epoch": 1.7756964457252642, "grad_norm": 0.43412836569278734, "learning_rate": 4.271833421365983e-06, "loss": 0.0248, "step": 3697 }, { "epoch": 1.776176753121998, "grad_norm": 0.3497156906470842, "learning_rate": 4.269068814556465e-06, "loss": 0.024, "step": 3698 }, { "epoch": 1.776657060518732, "grad_norm": 0.4221473856845227, "learning_rate": 4.266304436069628e-06, "loss": 0.0268, "step": 3699 }, { "epoch": 1.7771373679154658, "grad_norm": 0.44744748645753285, "learning_rate": 4.263540286768986e-06, "loss": 0.0231, "step": 3700 }, { "epoch": 1.7776176753121997, "grad_norm": 0.26523079752554923, "learning_rate": 4.2607763675179855e-06, "loss": 0.0194, "step": 3701 }, { "epoch": 1.7780979827089336, "grad_norm": 0.333695912566233, "learning_rate": 4.258012679179997e-06, "loss": 0.0212, "step": 3702 }, { "epoch": 1.7785782901056675, "grad_norm": 0.359470770686518, "learning_rate": 4.255249222618319e-06, "loss": 0.026, "step": 3703 }, { "epoch": 1.7790585975024016, "grad_norm": 0.27778914797187887, "learning_rate": 4.252485998696182e-06, "loss": 0.0217, "step": 3704 }, { "epoch": 1.7795389048991355, "grad_norm": 0.3082626726990077, "learning_rate": 4.249723008276737e-06, "loss": 0.0252, "step": 3705 }, { "epoch": 1.7800192122958693, "grad_norm": 0.4763100334214595, "learning_rate": 4.246960252223068e-06, "loss": 0.0223, "step": 3706 }, { "epoch": 1.7804995196926032, "grad_norm": 0.3902417800674871, "learning_rate": 4.244197731398183e-06, "loss": 0.0359, "step": 3707 }, { "epoch": 1.7809798270893373, "grad_norm": 0.33541038158026437, "learning_rate": 4.241435446665017e-06, "loss": 0.0262, "step": 3708 }, { "epoch": 1.7814601344860712, "grad_norm": 0.3993156769605381, "learning_rate": 4.238673398886433e-06, "loss": 0.0294, "step": 3709 }, { "epoch": 1.781940441882805, "grad_norm": 0.30718486721812593, "learning_rate": 4.235911588925216e-06, "loss": 0.0245, "step": 3710 }, { "epoch": 1.782420749279539, "grad_norm": 0.27809684971705384, "learning_rate": 4.23315001764408e-06, "loss": 0.0228, "step": 3711 }, { "epoch": 1.7829010566762729, "grad_norm": 0.311520762211288, "learning_rate": 4.230388685905663e-06, "loss": 0.0284, "step": 3712 }, { "epoch": 1.7833813640730067, "grad_norm": 0.3633814922622185, "learning_rate": 4.22762759457253e-06, "loss": 0.027, "step": 3713 }, { "epoch": 1.7838616714697406, "grad_norm": 0.22664665550704607, "learning_rate": 4.2248667445071665e-06, "loss": 0.0202, "step": 3714 }, { "epoch": 1.7843419788664745, "grad_norm": 0.3974879154206644, "learning_rate": 4.222106136571989e-06, "loss": 0.0293, "step": 3715 }, { "epoch": 1.7848222862632084, "grad_norm": 0.4233629056608124, "learning_rate": 4.219345771629333e-06, "loss": 0.0219, "step": 3716 }, { "epoch": 1.7853025936599423, "grad_norm": 0.2533414519276015, "learning_rate": 4.2165856505414606e-06, "loss": 0.0192, "step": 3717 }, { "epoch": 1.7857829010566761, "grad_norm": 0.43247766043874925, "learning_rate": 4.213825774170559e-06, "loss": 0.0355, "step": 3718 }, { "epoch": 1.78626320845341, "grad_norm": 0.31626008763803737, "learning_rate": 4.211066143378735e-06, "loss": 0.0269, "step": 3719 }, { "epoch": 1.7867435158501441, "grad_norm": 0.8861101394136922, "learning_rate": 4.208306759028022e-06, "loss": 0.0239, "step": 3720 }, { "epoch": 1.787223823246878, "grad_norm": 0.565668336825216, "learning_rate": 4.205547621980375e-06, "loss": 0.0325, "step": 3721 }, { "epoch": 1.7877041306436119, "grad_norm": 0.3457102885633062, "learning_rate": 4.20278873309767e-06, "loss": 0.0242, "step": 3722 }, { "epoch": 1.7881844380403458, "grad_norm": 0.7768450391640498, "learning_rate": 4.200030093241713e-06, "loss": 0.043, "step": 3723 }, { "epoch": 1.7886647454370799, "grad_norm": 0.34890190191901876, "learning_rate": 4.197271703274222e-06, "loss": 0.0296, "step": 3724 }, { "epoch": 1.7891450528338138, "grad_norm": 0.38884090707227587, "learning_rate": 4.194513564056843e-06, "loss": 0.0251, "step": 3725 }, { "epoch": 1.7896253602305476, "grad_norm": 0.5934812150724097, "learning_rate": 4.1917556764511424e-06, "loss": 0.032, "step": 3726 }, { "epoch": 1.7901056676272815, "grad_norm": 0.31899497986126985, "learning_rate": 4.188998041318608e-06, "loss": 0.0287, "step": 3727 }, { "epoch": 1.7905859750240154, "grad_norm": 0.32634605401745126, "learning_rate": 4.186240659520649e-06, "loss": 0.0208, "step": 3728 }, { "epoch": 1.7910662824207493, "grad_norm": 0.27907541672369596, "learning_rate": 4.183483531918595e-06, "loss": 0.02, "step": 3729 }, { "epoch": 1.7915465898174832, "grad_norm": 1.2569615577852664, "learning_rate": 4.180726659373696e-06, "loss": 0.0375, "step": 3730 }, { "epoch": 1.792026897214217, "grad_norm": 0.3154683378090491, "learning_rate": 4.177970042747124e-06, "loss": 0.0189, "step": 3731 }, { "epoch": 1.792507204610951, "grad_norm": 0.5498135754248901, "learning_rate": 4.175213682899969e-06, "loss": 0.0376, "step": 3732 }, { "epoch": 1.7929875120076848, "grad_norm": 0.3021105344879481, "learning_rate": 4.172457580693242e-06, "loss": 0.0286, "step": 3733 }, { "epoch": 1.7934678194044187, "grad_norm": 0.5565596483867229, "learning_rate": 4.169701736987872e-06, "loss": 0.035, "step": 3734 }, { "epoch": 1.7939481268011528, "grad_norm": 0.3014156090109605, "learning_rate": 4.166946152644708e-06, "loss": 0.0247, "step": 3735 }, { "epoch": 1.7944284341978867, "grad_norm": 0.3213396110350875, "learning_rate": 4.164190828524519e-06, "loss": 0.0254, "step": 3736 }, { "epoch": 1.7949087415946205, "grad_norm": 0.28374550495138856, "learning_rate": 4.161435765487993e-06, "loss": 0.0216, "step": 3737 }, { "epoch": 1.7953890489913544, "grad_norm": 0.35097548533249245, "learning_rate": 4.158680964395734e-06, "loss": 0.031, "step": 3738 }, { "epoch": 1.7958693563880885, "grad_norm": 0.40620389557758446, "learning_rate": 4.155926426108268e-06, "loss": 0.0229, "step": 3739 }, { "epoch": 1.7963496637848224, "grad_norm": 0.3910603744364216, "learning_rate": 4.153172151486033e-06, "loss": 0.0396, "step": 3740 }, { "epoch": 1.7968299711815563, "grad_norm": 0.3986468487467855, "learning_rate": 4.150418141389392e-06, "loss": 0.0319, "step": 3741 }, { "epoch": 1.7973102785782902, "grad_norm": 0.4507174673185706, "learning_rate": 4.1476643966786175e-06, "loss": 0.0345, "step": 3742 }, { "epoch": 1.797790585975024, "grad_norm": 0.34457472979613507, "learning_rate": 4.144910918213908e-06, "loss": 0.0351, "step": 3743 }, { "epoch": 1.798270893371758, "grad_norm": 0.31656038647901613, "learning_rate": 4.142157706855367e-06, "loss": 0.0309, "step": 3744 }, { "epoch": 1.7987512007684918, "grad_norm": 0.3851175306876083, "learning_rate": 4.13940476346303e-06, "loss": 0.0251, "step": 3745 }, { "epoch": 1.7992315081652257, "grad_norm": 0.3322546313883673, "learning_rate": 4.1366520888968355e-06, "loss": 0.0288, "step": 3746 }, { "epoch": 1.7997118155619596, "grad_norm": 0.3153377261418856, "learning_rate": 4.133899684016644e-06, "loss": 0.0278, "step": 3747 }, { "epoch": 1.8001921229586935, "grad_norm": 0.35819899358759066, "learning_rate": 4.131147549682228e-06, "loss": 0.0326, "step": 3748 }, { "epoch": 1.8006724303554273, "grad_norm": 0.3466920576613707, "learning_rate": 4.1283956867532825e-06, "loss": 0.0306, "step": 3749 }, { "epoch": 1.8011527377521612, "grad_norm": 0.280594229037193, "learning_rate": 4.12564409608941e-06, "loss": 0.0267, "step": 3750 }, { "epoch": 1.8016330451488953, "grad_norm": 0.2773324802451537, "learning_rate": 4.122892778550132e-06, "loss": 0.0201, "step": 3751 }, { "epoch": 1.8021133525456292, "grad_norm": 0.39528816930150046, "learning_rate": 4.120141734994886e-06, "loss": 0.0366, "step": 3752 }, { "epoch": 1.802593659942363, "grad_norm": 0.27365259354728005, "learning_rate": 4.117390966283019e-06, "loss": 0.0292, "step": 3753 }, { "epoch": 1.803073967339097, "grad_norm": 0.4021196903992992, "learning_rate": 4.114640473273798e-06, "loss": 0.0252, "step": 3754 }, { "epoch": 1.803554274735831, "grad_norm": 0.2766216413167318, "learning_rate": 4.111890256826398e-06, "loss": 0.0205, "step": 3755 }, { "epoch": 1.804034582132565, "grad_norm": 0.34906673395150584, "learning_rate": 4.109140317799913e-06, "loss": 0.0259, "step": 3756 }, { "epoch": 1.8045148895292988, "grad_norm": 0.2994853404917118, "learning_rate": 4.1063906570533475e-06, "loss": 0.0235, "step": 3757 }, { "epoch": 1.8049951969260327, "grad_norm": 0.30188601180510655, "learning_rate": 4.1036412754456154e-06, "loss": 0.0311, "step": 3758 }, { "epoch": 1.8054755043227666, "grad_norm": 0.2886154811313445, "learning_rate": 4.100892173835553e-06, "loss": 0.0178, "step": 3759 }, { "epoch": 1.8059558117195005, "grad_norm": 0.3607269147992047, "learning_rate": 4.098143353081902e-06, "loss": 0.0363, "step": 3760 }, { "epoch": 1.8064361191162344, "grad_norm": 0.3626586399699536, "learning_rate": 4.095394814043316e-06, "loss": 0.031, "step": 3761 }, { "epoch": 1.8069164265129682, "grad_norm": 0.29916857450938034, "learning_rate": 4.092646557578365e-06, "loss": 0.0195, "step": 3762 }, { "epoch": 1.8073967339097021, "grad_norm": 0.3497357624012194, "learning_rate": 4.089898584545527e-06, "loss": 0.0379, "step": 3763 }, { "epoch": 1.807877041306436, "grad_norm": 0.3093472511893341, "learning_rate": 4.087150895803192e-06, "loss": 0.0231, "step": 3764 }, { "epoch": 1.8083573487031699, "grad_norm": 0.3621523127312712, "learning_rate": 4.084403492209664e-06, "loss": 0.0253, "step": 3765 }, { "epoch": 1.808837656099904, "grad_norm": 0.3079474716970158, "learning_rate": 4.081656374623153e-06, "loss": 0.0242, "step": 3766 }, { "epoch": 1.8093179634966379, "grad_norm": 0.5945099626720509, "learning_rate": 4.078909543901786e-06, "loss": 0.0277, "step": 3767 }, { "epoch": 1.8097982708933718, "grad_norm": 0.5262570118161777, "learning_rate": 4.076163000903595e-06, "loss": 0.0223, "step": 3768 }, { "epoch": 1.8102785782901056, "grad_norm": 0.3304593260056681, "learning_rate": 4.073416746486524e-06, "loss": 0.025, "step": 3769 }, { "epoch": 1.8107588856868397, "grad_norm": 0.3122387011593699, "learning_rate": 4.070670781508428e-06, "loss": 0.0248, "step": 3770 }, { "epoch": 1.8112391930835736, "grad_norm": 0.3698934564392852, "learning_rate": 4.067925106827068e-06, "loss": 0.0281, "step": 3771 }, { "epoch": 1.8117195004803075, "grad_norm": 0.2814765763303357, "learning_rate": 4.06517972330012e-06, "loss": 0.0177, "step": 3772 }, { "epoch": 1.8121998078770414, "grad_norm": 0.39514223733310333, "learning_rate": 4.062434631785162e-06, "loss": 0.0294, "step": 3773 }, { "epoch": 1.8126801152737753, "grad_norm": 0.3130897065915732, "learning_rate": 4.059689833139689e-06, "loss": 0.0236, "step": 3774 }, { "epoch": 1.8131604226705091, "grad_norm": 0.8877356556848218, "learning_rate": 4.056945328221097e-06, "loss": 0.0221, "step": 3775 }, { "epoch": 1.813640730067243, "grad_norm": 0.2839965366846324, "learning_rate": 4.054201117886695e-06, "loss": 0.0237, "step": 3776 }, { "epoch": 1.814121037463977, "grad_norm": 0.2848748645550958, "learning_rate": 4.051457202993698e-06, "loss": 0.0175, "step": 3777 }, { "epoch": 1.8146013448607108, "grad_norm": 0.46828880582303195, "learning_rate": 4.0487135843992295e-06, "loss": 0.0388, "step": 3778 }, { "epoch": 1.8150816522574447, "grad_norm": 0.317347070754061, "learning_rate": 4.04597026296032e-06, "loss": 0.024, "step": 3779 }, { "epoch": 1.8155619596541785, "grad_norm": 0.33606610600395126, "learning_rate": 4.043227239533904e-06, "loss": 0.0268, "step": 3780 }, { "epoch": 1.8160422670509124, "grad_norm": 0.39557269146750834, "learning_rate": 4.040484514976831e-06, "loss": 0.0406, "step": 3781 }, { "epoch": 1.8165225744476465, "grad_norm": 0.408364513712591, "learning_rate": 4.037742090145851e-06, "loss": 0.0236, "step": 3782 }, { "epoch": 1.8170028818443804, "grad_norm": 0.3283492347194832, "learning_rate": 4.03499996589762e-06, "loss": 0.0266, "step": 3783 }, { "epoch": 1.8174831892411143, "grad_norm": 0.4806086493799972, "learning_rate": 4.032258143088703e-06, "loss": 0.042, "step": 3784 }, { "epoch": 1.8179634966378482, "grad_norm": 0.3413971514755944, "learning_rate": 4.029516622575569e-06, "loss": 0.0255, "step": 3785 }, { "epoch": 1.8184438040345823, "grad_norm": 0.36160415603242835, "learning_rate": 4.026775405214592e-06, "loss": 0.0316, "step": 3786 }, { "epoch": 1.8189241114313162, "grad_norm": 0.35275770445724075, "learning_rate": 4.024034491862056e-06, "loss": 0.025, "step": 3787 }, { "epoch": 1.81940441882805, "grad_norm": 0.28061572307862775, "learning_rate": 4.021293883374141e-06, "loss": 0.0246, "step": 3788 }, { "epoch": 1.819884726224784, "grad_norm": 0.3847034244138903, "learning_rate": 4.018553580606943e-06, "loss": 0.0404, "step": 3789 }, { "epoch": 1.8203650336215178, "grad_norm": 0.28205919276856267, "learning_rate": 4.015813584416454e-06, "loss": 0.0174, "step": 3790 }, { "epoch": 1.8208453410182517, "grad_norm": 0.51752652122546, "learning_rate": 4.013073895658574e-06, "loss": 0.0212, "step": 3791 }, { "epoch": 1.8213256484149856, "grad_norm": 0.5677176885676724, "learning_rate": 4.010334515189106e-06, "loss": 0.0393, "step": 3792 }, { "epoch": 1.8218059558117194, "grad_norm": 0.7438369692240157, "learning_rate": 4.007595443863755e-06, "loss": 0.0223, "step": 3793 }, { "epoch": 1.8222862632084533, "grad_norm": 0.35683920463515006, "learning_rate": 4.004856682538132e-06, "loss": 0.0229, "step": 3794 }, { "epoch": 1.8227665706051872, "grad_norm": 0.44954878947547977, "learning_rate": 4.0021182320677485e-06, "loss": 0.025, "step": 3795 }, { "epoch": 1.823246878001921, "grad_norm": 0.3224265422999456, "learning_rate": 3.999380093308025e-06, "loss": 0.0234, "step": 3796 }, { "epoch": 1.8237271853986552, "grad_norm": 0.38310725788628386, "learning_rate": 3.9966422671142776e-06, "loss": 0.0312, "step": 3797 }, { "epoch": 1.824207492795389, "grad_norm": 0.7976208998414231, "learning_rate": 3.993904754341728e-06, "loss": 0.0318, "step": 3798 }, { "epoch": 1.824687800192123, "grad_norm": 0.3471420568407955, "learning_rate": 3.991167555845498e-06, "loss": 0.023, "step": 3799 }, { "epoch": 1.8251681075888568, "grad_norm": 0.33594849564837803, "learning_rate": 3.988430672480614e-06, "loss": 0.0214, "step": 3800 }, { "epoch": 1.825648414985591, "grad_norm": 0.2705193925509413, "learning_rate": 3.985694105102004e-06, "loss": 0.0215, "step": 3801 }, { "epoch": 1.8261287223823248, "grad_norm": 0.2605259637015823, "learning_rate": 3.982957854564492e-06, "loss": 0.023, "step": 3802 }, { "epoch": 1.8266090297790587, "grad_norm": 0.4439631419087424, "learning_rate": 3.980221921722811e-06, "loss": 0.0287, "step": 3803 }, { "epoch": 1.8270893371757926, "grad_norm": 0.3162963292804305, "learning_rate": 3.977486307431589e-06, "loss": 0.0233, "step": 3804 }, { "epoch": 1.8275696445725265, "grad_norm": 0.45557258170039094, "learning_rate": 3.974751012545357e-06, "loss": 0.0413, "step": 3805 }, { "epoch": 1.8280499519692603, "grad_norm": 0.36685543608188836, "learning_rate": 3.9720160379185444e-06, "loss": 0.0312, "step": 3806 }, { "epoch": 1.8285302593659942, "grad_norm": 0.4310271254521534, "learning_rate": 3.969281384405482e-06, "loss": 0.029, "step": 3807 }, { "epoch": 1.829010566762728, "grad_norm": 0.2336408991136034, "learning_rate": 3.9665470528604e-06, "loss": 0.0221, "step": 3808 }, { "epoch": 1.829490874159462, "grad_norm": 0.8779904422265264, "learning_rate": 3.963813044137427e-06, "loss": 0.045, "step": 3809 }, { "epoch": 1.8299711815561959, "grad_norm": 0.4066556028127334, "learning_rate": 3.961079359090592e-06, "loss": 0.0186, "step": 3810 }, { "epoch": 1.8304514889529298, "grad_norm": 0.33617998142517097, "learning_rate": 3.958345998573825e-06, "loss": 0.0279, "step": 3811 }, { "epoch": 1.8309317963496636, "grad_norm": 0.23884126850101794, "learning_rate": 3.955612963440949e-06, "loss": 0.0199, "step": 3812 }, { "epoch": 1.8314121037463977, "grad_norm": 0.30046742841249324, "learning_rate": 3.9528802545456904e-06, "loss": 0.0298, "step": 3813 }, { "epoch": 1.8318924111431316, "grad_norm": 0.3386885289630532, "learning_rate": 3.950147872741672e-06, "loss": 0.025, "step": 3814 }, { "epoch": 1.8323727185398655, "grad_norm": 0.333551064866117, "learning_rate": 3.9474158188824145e-06, "loss": 0.022, "step": 3815 }, { "epoch": 1.8328530259365994, "grad_norm": 0.2903793710923509, "learning_rate": 3.9446840938213334e-06, "loss": 0.0257, "step": 3816 }, { "epoch": 1.8333333333333335, "grad_norm": 0.3823197366405017, "learning_rate": 3.941952698411743e-06, "loss": 0.0287, "step": 3817 }, { "epoch": 1.8338136407300674, "grad_norm": 0.37040843093440745, "learning_rate": 3.939221633506863e-06, "loss": 0.0367, "step": 3818 }, { "epoch": 1.8342939481268012, "grad_norm": 0.30447344272841187, "learning_rate": 3.936490899959796e-06, "loss": 0.0243, "step": 3819 }, { "epoch": 1.8347742555235351, "grad_norm": 0.3217153648063274, "learning_rate": 3.93376049862355e-06, "loss": 0.0249, "step": 3820 }, { "epoch": 1.835254562920269, "grad_norm": 0.3087650030136373, "learning_rate": 3.931030430351026e-06, "loss": 0.0257, "step": 3821 }, { "epoch": 1.8357348703170029, "grad_norm": 0.38875740627024935, "learning_rate": 3.928300695995023e-06, "loss": 0.0287, "step": 3822 }, { "epoch": 1.8362151777137368, "grad_norm": 0.34510673804199987, "learning_rate": 3.925571296408233e-06, "loss": 0.0285, "step": 3823 }, { "epoch": 1.8366954851104706, "grad_norm": 0.36695695933934713, "learning_rate": 3.922842232443244e-06, "loss": 0.0289, "step": 3824 }, { "epoch": 1.8371757925072045, "grad_norm": 0.6480789807789834, "learning_rate": 3.9201135049525426e-06, "loss": 0.0338, "step": 3825 }, { "epoch": 1.8376560999039384, "grad_norm": 0.2693027651455763, "learning_rate": 3.917385114788508e-06, "loss": 0.0214, "step": 3826 }, { "epoch": 1.8381364073006723, "grad_norm": 0.40214028645184535, "learning_rate": 3.914657062803412e-06, "loss": 0.0306, "step": 3827 }, { "epoch": 1.8386167146974062, "grad_norm": 0.3705548160735817, "learning_rate": 3.9119293498494235e-06, "loss": 0.0249, "step": 3828 }, { "epoch": 1.8390970220941403, "grad_norm": 0.3326597457856765, "learning_rate": 3.909201976778605e-06, "loss": 0.0199, "step": 3829 }, { "epoch": 1.8395773294908742, "grad_norm": 0.529683498082064, "learning_rate": 3.9064749444429105e-06, "loss": 0.0517, "step": 3830 }, { "epoch": 1.840057636887608, "grad_norm": 0.2739453239221769, "learning_rate": 3.90374825369419e-06, "loss": 0.027, "step": 3831 }, { "epoch": 1.8405379442843421, "grad_norm": 0.4453050992716967, "learning_rate": 3.901021905384187e-06, "loss": 0.0494, "step": 3832 }, { "epoch": 1.841018251681076, "grad_norm": 0.31326319009933273, "learning_rate": 3.898295900364537e-06, "loss": 0.0278, "step": 3833 }, { "epoch": 1.84149855907781, "grad_norm": 0.37587083265273336, "learning_rate": 3.89557023948677e-06, "loss": 0.0357, "step": 3834 }, { "epoch": 1.8419788664745438, "grad_norm": 0.33046017958909163, "learning_rate": 3.892844923602305e-06, "loss": 0.0276, "step": 3835 }, { "epoch": 1.8424591738712777, "grad_norm": 0.45182056339290977, "learning_rate": 3.890119953562456e-06, "loss": 0.0343, "step": 3836 }, { "epoch": 1.8429394812680115, "grad_norm": 0.36749834764121453, "learning_rate": 3.887395330218429e-06, "loss": 0.034, "step": 3837 }, { "epoch": 1.8434197886647454, "grad_norm": 0.29193618714786324, "learning_rate": 3.884671054421321e-06, "loss": 0.0243, "step": 3838 }, { "epoch": 1.8439000960614793, "grad_norm": 0.30111254384090247, "learning_rate": 3.881947127022116e-06, "loss": 0.0272, "step": 3839 }, { "epoch": 1.8443804034582132, "grad_norm": 0.33503127572038627, "learning_rate": 3.8792235488717e-06, "loss": 0.0498, "step": 3840 }, { "epoch": 1.844860710854947, "grad_norm": 0.2750897028319576, "learning_rate": 3.8765003208208405e-06, "loss": 0.0169, "step": 3841 }, { "epoch": 1.845341018251681, "grad_norm": 0.3381470047983402, "learning_rate": 3.873777443720199e-06, "loss": 0.0288, "step": 3842 }, { "epoch": 1.8458213256484148, "grad_norm": 0.35404034361406816, "learning_rate": 3.871054918420326e-06, "loss": 0.0297, "step": 3843 }, { "epoch": 1.846301633045149, "grad_norm": 0.3698593438160665, "learning_rate": 3.868332745771664e-06, "loss": 0.0327, "step": 3844 }, { "epoch": 1.8467819404418828, "grad_norm": 0.351910160454034, "learning_rate": 3.865610926624544e-06, "loss": 0.0366, "step": 3845 }, { "epoch": 1.8472622478386167, "grad_norm": 0.2998021582231208, "learning_rate": 3.862889461829187e-06, "loss": 0.0217, "step": 3846 }, { "epoch": 1.8477425552353506, "grad_norm": 0.3996429985955665, "learning_rate": 3.860168352235704e-06, "loss": 0.0338, "step": 3847 }, { "epoch": 1.8482228626320847, "grad_norm": 0.32030307329334606, "learning_rate": 3.857447598694095e-06, "loss": 0.0157, "step": 3848 }, { "epoch": 1.8487031700288186, "grad_norm": 0.28932981548595954, "learning_rate": 3.854727202054246e-06, "loss": 0.0235, "step": 3849 }, { "epoch": 1.8491834774255524, "grad_norm": 0.42510912346079605, "learning_rate": 3.852007163165934e-06, "loss": 0.0429, "step": 3850 }, { "epoch": 1.8496637848222863, "grad_norm": 0.5169036037995557, "learning_rate": 3.849287482878827e-06, "loss": 0.0332, "step": 3851 }, { "epoch": 1.8501440922190202, "grad_norm": 0.29115864377184947, "learning_rate": 3.8465681620424745e-06, "loss": 0.0241, "step": 3852 }, { "epoch": 1.850624399615754, "grad_norm": 0.23568052120127375, "learning_rate": 3.843849201506318e-06, "loss": 0.0188, "step": 3853 }, { "epoch": 1.851104707012488, "grad_norm": 0.5849294795012984, "learning_rate": 3.841130602119683e-06, "loss": 0.0308, "step": 3854 }, { "epoch": 1.8515850144092219, "grad_norm": 0.32678728027694887, "learning_rate": 3.838412364731791e-06, "loss": 0.0208, "step": 3855 }, { "epoch": 1.8520653218059557, "grad_norm": 0.3028653362146022, "learning_rate": 3.835694490191738e-06, "loss": 0.0252, "step": 3856 }, { "epoch": 1.8525456292026896, "grad_norm": 0.3593186089575623, "learning_rate": 3.832976979348517e-06, "loss": 0.0275, "step": 3857 }, { "epoch": 1.8530259365994235, "grad_norm": 0.39298850421477527, "learning_rate": 3.830259833051001e-06, "loss": 0.0223, "step": 3858 }, { "epoch": 1.8535062439961574, "grad_norm": 0.28793061925562896, "learning_rate": 3.827543052147952e-06, "loss": 0.0244, "step": 3859 }, { "epoch": 1.8539865513928915, "grad_norm": 0.39024742483437797, "learning_rate": 3.824826637488017e-06, "loss": 0.0401, "step": 3860 }, { "epoch": 1.8544668587896254, "grad_norm": 0.4016447619063944, "learning_rate": 3.822110589919728e-06, "loss": 0.0329, "step": 3861 }, { "epoch": 1.8549471661863592, "grad_norm": 0.3278616014285065, "learning_rate": 3.819394910291505e-06, "loss": 0.0235, "step": 3862 }, { "epoch": 1.8554274735830933, "grad_norm": 0.2878835913125068, "learning_rate": 3.81667959945165e-06, "loss": 0.0249, "step": 3863 }, { "epoch": 1.8559077809798272, "grad_norm": 0.30390230073476815, "learning_rate": 3.8139646582483523e-06, "loss": 0.0288, "step": 3864 }, { "epoch": 1.856388088376561, "grad_norm": 0.46493930458160565, "learning_rate": 3.8112500875296825e-06, "loss": 0.0293, "step": 3865 }, { "epoch": 1.856868395773295, "grad_norm": 0.3026923805089399, "learning_rate": 3.808535888143598e-06, "loss": 0.0271, "step": 3866 }, { "epoch": 1.8573487031700289, "grad_norm": 0.30666288304379574, "learning_rate": 3.80582206093794e-06, "loss": 0.0263, "step": 3867 }, { "epoch": 1.8578290105667628, "grad_norm": 0.30397961541892105, "learning_rate": 3.803108606760431e-06, "loss": 0.0272, "step": 3868 }, { "epoch": 1.8583093179634966, "grad_norm": 0.2973570530691281, "learning_rate": 3.800395526458683e-06, "loss": 0.0312, "step": 3869 }, { "epoch": 1.8587896253602305, "grad_norm": 0.3413568622161239, "learning_rate": 3.797682820880184e-06, "loss": 0.0263, "step": 3870 }, { "epoch": 1.8592699327569644, "grad_norm": 0.3600402121493243, "learning_rate": 3.7949704908723097e-06, "loss": 0.0283, "step": 3871 }, { "epoch": 1.8597502401536983, "grad_norm": 0.8356201218388172, "learning_rate": 3.792258537282317e-06, "loss": 0.0305, "step": 3872 }, { "epoch": 1.8602305475504322, "grad_norm": 0.4373658065497144, "learning_rate": 3.7895469609573437e-06, "loss": 0.0233, "step": 3873 }, { "epoch": 1.860710854947166, "grad_norm": 0.3997870764022931, "learning_rate": 3.7868357627444143e-06, "loss": 0.029, "step": 3874 }, { "epoch": 1.8611911623439001, "grad_norm": 0.3894424413941173, "learning_rate": 3.784124943490428e-06, "loss": 0.045, "step": 3875 }, { "epoch": 1.861671469740634, "grad_norm": 0.3918975479466153, "learning_rate": 3.78141450404217e-06, "loss": 0.0346, "step": 3876 }, { "epoch": 1.862151777137368, "grad_norm": 0.28852326601846523, "learning_rate": 3.778704445246312e-06, "loss": 0.0298, "step": 3877 }, { "epoch": 1.8626320845341018, "grad_norm": 1.1455781632169493, "learning_rate": 3.7759947679493957e-06, "loss": 0.0339, "step": 3878 }, { "epoch": 1.8631123919308359, "grad_norm": 0.2837276917243785, "learning_rate": 3.773285472997852e-06, "loss": 0.0236, "step": 3879 }, { "epoch": 1.8635926993275698, "grad_norm": 0.38262537488071, "learning_rate": 3.770576561237989e-06, "loss": 0.0229, "step": 3880 }, { "epoch": 1.8640730067243036, "grad_norm": 0.3256140245496909, "learning_rate": 3.7678680335159955e-06, "loss": 0.0238, "step": 3881 }, { "epoch": 1.8645533141210375, "grad_norm": 0.6214697290845623, "learning_rate": 3.765159890677942e-06, "loss": 0.0296, "step": 3882 }, { "epoch": 1.8650336215177714, "grad_norm": 0.38124500206760514, "learning_rate": 3.7624521335697744e-06, "loss": 0.0245, "step": 3883 }, { "epoch": 1.8655139289145053, "grad_norm": 0.6878760639690874, "learning_rate": 3.759744763037324e-06, "loss": 0.0474, "step": 3884 }, { "epoch": 1.8659942363112392, "grad_norm": 0.5654443696325822, "learning_rate": 3.757037779926298e-06, "loss": 0.0269, "step": 3885 }, { "epoch": 1.866474543707973, "grad_norm": 0.36460583158840376, "learning_rate": 3.7543311850822826e-06, "loss": 0.026, "step": 3886 }, { "epoch": 1.866954851104707, "grad_norm": 0.41201166095203756, "learning_rate": 3.7516249793507444e-06, "loss": 0.0379, "step": 3887 }, { "epoch": 1.8674351585014408, "grad_norm": 0.3809971891956674, "learning_rate": 3.748919163577024e-06, "loss": 0.0237, "step": 3888 }, { "epoch": 1.8679154658981747, "grad_norm": 0.26109622219165196, "learning_rate": 3.746213738606346e-06, "loss": 0.0221, "step": 3889 }, { "epoch": 1.8683957732949086, "grad_norm": 0.4477870603998692, "learning_rate": 3.7435087052838072e-06, "loss": 0.0358, "step": 3890 }, { "epoch": 1.8688760806916427, "grad_norm": 0.42719245923983384, "learning_rate": 3.740804064454391e-06, "loss": 0.0225, "step": 3891 }, { "epoch": 1.8693563880883766, "grad_norm": 0.3324217206835271, "learning_rate": 3.7380998169629477e-06, "loss": 0.0283, "step": 3892 }, { "epoch": 1.8698366954851104, "grad_norm": 0.4624927018191462, "learning_rate": 3.7353959636542115e-06, "loss": 0.0207, "step": 3893 }, { "epoch": 1.8703170028818443, "grad_norm": 0.36341077349026807, "learning_rate": 3.73269250537279e-06, "loss": 0.032, "step": 3894 }, { "epoch": 1.8707973102785784, "grad_norm": 0.26123493653058366, "learning_rate": 3.729989442963169e-06, "loss": 0.0246, "step": 3895 }, { "epoch": 1.8712776176753123, "grad_norm": 0.43756029196519247, "learning_rate": 3.7272867772697106e-06, "loss": 0.0371, "step": 3896 }, { "epoch": 1.8717579250720462, "grad_norm": 0.42557740682453776, "learning_rate": 3.724584509136655e-06, "loss": 0.0306, "step": 3897 }, { "epoch": 1.87223823246878, "grad_norm": 0.3703611863049582, "learning_rate": 3.72188263940811e-06, "loss": 0.0317, "step": 3898 }, { "epoch": 1.872718539865514, "grad_norm": 0.33675925426184117, "learning_rate": 3.719181168928071e-06, "loss": 0.0269, "step": 3899 }, { "epoch": 1.8731988472622478, "grad_norm": 0.3356472435909721, "learning_rate": 3.7164800985404014e-06, "loss": 0.0269, "step": 3900 }, { "epoch": 1.8736791546589817, "grad_norm": 0.37640192577545567, "learning_rate": 3.7137794290888395e-06, "loss": 0.0182, "step": 3901 }, { "epoch": 1.8741594620557156, "grad_norm": 0.31443132138691954, "learning_rate": 3.711079161417e-06, "loss": 0.0212, "step": 3902 }, { "epoch": 1.8746397694524495, "grad_norm": 0.4226344164785463, "learning_rate": 3.708379296368372e-06, "loss": 0.0258, "step": 3903 }, { "epoch": 1.8751200768491834, "grad_norm": 0.33279591233743955, "learning_rate": 3.7056798347863187e-06, "loss": 0.0183, "step": 3904 }, { "epoch": 1.8756003842459172, "grad_norm": 0.37562827045873803, "learning_rate": 3.7029807775140766e-06, "loss": 0.0273, "step": 3905 }, { "epoch": 1.8760806916426513, "grad_norm": 0.3550537408993383, "learning_rate": 3.7002821253947574e-06, "loss": 0.0307, "step": 3906 }, { "epoch": 1.8765609990393852, "grad_norm": 0.23872157763149313, "learning_rate": 3.6975838792713454e-06, "loss": 0.0145, "step": 3907 }, { "epoch": 1.877041306436119, "grad_norm": 0.3659877746805539, "learning_rate": 3.6948860399866984e-06, "loss": 0.0279, "step": 3908 }, { "epoch": 1.877521613832853, "grad_norm": 0.46118466192068513, "learning_rate": 3.6921886083835447e-06, "loss": 0.0352, "step": 3909 }, { "epoch": 1.878001921229587, "grad_norm": 0.3386494722867536, "learning_rate": 3.689491585304491e-06, "loss": 0.0265, "step": 3910 }, { "epoch": 1.878482228626321, "grad_norm": 0.37038164062560897, "learning_rate": 3.6867949715920083e-06, "loss": 0.0302, "step": 3911 }, { "epoch": 1.8789625360230549, "grad_norm": 0.3289778639824961, "learning_rate": 3.6840987680884444e-06, "loss": 0.0225, "step": 3912 }, { "epoch": 1.8794428434197887, "grad_norm": 0.4000253746497561, "learning_rate": 3.681402975636023e-06, "loss": 0.0218, "step": 3913 }, { "epoch": 1.8799231508165226, "grad_norm": 0.39828674631157257, "learning_rate": 3.6787075950768337e-06, "loss": 0.0327, "step": 3914 }, { "epoch": 1.8804034582132565, "grad_norm": 0.44674741927362144, "learning_rate": 3.676012627252836e-06, "loss": 0.0394, "step": 3915 }, { "epoch": 1.8808837656099904, "grad_norm": 0.3580239025041447, "learning_rate": 3.673318073005866e-06, "loss": 0.0285, "step": 3916 }, { "epoch": 1.8813640730067243, "grad_norm": 0.41662910120411073, "learning_rate": 3.670623933177626e-06, "loss": 0.0261, "step": 3917 }, { "epoch": 1.8818443804034581, "grad_norm": 0.3436495775395609, "learning_rate": 3.6679302086096917e-06, "loss": 0.0271, "step": 3918 }, { "epoch": 1.882324687800192, "grad_norm": 0.41465582323627864, "learning_rate": 3.6652369001435082e-06, "loss": 0.0214, "step": 3919 }, { "epoch": 1.882804995196926, "grad_norm": 0.33255755010764915, "learning_rate": 3.6625440086203894e-06, "loss": 0.0293, "step": 3920 }, { "epoch": 1.8832853025936598, "grad_norm": 0.35401512212283615, "learning_rate": 3.659851534881522e-06, "loss": 0.027, "step": 3921 }, { "epoch": 1.8837656099903939, "grad_norm": 0.375116126574655, "learning_rate": 3.65715947976796e-06, "loss": 0.0253, "step": 3922 }, { "epoch": 1.8842459173871278, "grad_norm": 0.3449566060152682, "learning_rate": 3.654467844120627e-06, "loss": 0.0209, "step": 3923 }, { "epoch": 1.8847262247838616, "grad_norm": 0.37841974093081365, "learning_rate": 3.6517766287803137e-06, "loss": 0.024, "step": 3924 }, { "epoch": 1.8852065321805955, "grad_norm": 0.700967189506839, "learning_rate": 3.649085834587683e-06, "loss": 0.0278, "step": 3925 }, { "epoch": 1.8856868395773296, "grad_norm": 0.4810320029077788, "learning_rate": 3.6463954623832636e-06, "loss": 0.0279, "step": 3926 }, { "epoch": 1.8861671469740635, "grad_norm": 0.411255200427009, "learning_rate": 3.643705513007453e-06, "loss": 0.0342, "step": 3927 }, { "epoch": 1.8866474543707974, "grad_norm": 0.22126922260645185, "learning_rate": 3.6410159873005193e-06, "loss": 0.0178, "step": 3928 }, { "epoch": 1.8871277617675313, "grad_norm": 0.30746449010228016, "learning_rate": 3.6383268861025933e-06, "loss": 0.0256, "step": 3929 }, { "epoch": 1.8876080691642652, "grad_norm": 0.26052789415697986, "learning_rate": 3.6356382102536773e-06, "loss": 0.0216, "step": 3930 }, { "epoch": 1.888088376560999, "grad_norm": 0.3172037344968375, "learning_rate": 3.632949960593639e-06, "loss": 0.0239, "step": 3931 }, { "epoch": 1.888568683957733, "grad_norm": 0.33315276156856005, "learning_rate": 3.6302621379622137e-06, "loss": 0.0251, "step": 3932 }, { "epoch": 1.8890489913544668, "grad_norm": 0.3823058034913574, "learning_rate": 3.6275747431990025e-06, "loss": 0.0252, "step": 3933 }, { "epoch": 1.8895292987512007, "grad_norm": 0.36667328873787575, "learning_rate": 3.62488777714347e-06, "loss": 0.0291, "step": 3934 }, { "epoch": 1.8900096061479346, "grad_norm": 0.4883151756456679, "learning_rate": 3.622201240634955e-06, "loss": 0.0303, "step": 3935 }, { "epoch": 1.8904899135446684, "grad_norm": 0.26082014751394533, "learning_rate": 3.6195151345126556e-06, "loss": 0.0238, "step": 3936 }, { "epoch": 1.8909702209414025, "grad_norm": 0.27955914831625556, "learning_rate": 3.616829459615637e-06, "loss": 0.0212, "step": 3937 }, { "epoch": 1.8914505283381364, "grad_norm": 0.4484893094995106, "learning_rate": 3.614144216782829e-06, "loss": 0.0319, "step": 3938 }, { "epoch": 1.8919308357348703, "grad_norm": 0.3049229837764836, "learning_rate": 3.6114594068530274e-06, "loss": 0.0241, "step": 3939 }, { "epoch": 1.8924111431316042, "grad_norm": 0.33858589481284207, "learning_rate": 3.6087750306648916e-06, "loss": 0.0218, "step": 3940 }, { "epoch": 1.8928914505283383, "grad_norm": 0.39191906144435246, "learning_rate": 3.606091089056949e-06, "loss": 0.0397, "step": 3941 }, { "epoch": 1.8933717579250722, "grad_norm": 0.32376868788286933, "learning_rate": 3.603407582867585e-06, "loss": 0.0277, "step": 3942 }, { "epoch": 1.893852065321806, "grad_norm": 0.27242633409227174, "learning_rate": 3.6007245129350567e-06, "loss": 0.0194, "step": 3943 }, { "epoch": 1.89433237271854, "grad_norm": 0.26899850335898196, "learning_rate": 3.5980418800974782e-06, "loss": 0.0245, "step": 3944 }, { "epoch": 1.8948126801152738, "grad_norm": 0.3891408304510383, "learning_rate": 3.595359685192832e-06, "loss": 0.027, "step": 3945 }, { "epoch": 1.8952929875120077, "grad_norm": 0.33995008246482467, "learning_rate": 3.5926779290589596e-06, "loss": 0.029, "step": 3946 }, { "epoch": 1.8957732949087416, "grad_norm": 0.3628407606460953, "learning_rate": 3.5899966125335684e-06, "loss": 0.0309, "step": 3947 }, { "epoch": 1.8962536023054755, "grad_norm": 0.3202465039124697, "learning_rate": 3.587315736454227e-06, "loss": 0.0263, "step": 3948 }, { "epoch": 1.8967339097022093, "grad_norm": 0.3227837781541768, "learning_rate": 3.5846353016583644e-06, "loss": 0.0265, "step": 3949 }, { "epoch": 1.8972142170989432, "grad_norm": 0.32744848899831586, "learning_rate": 3.5819553089832814e-06, "loss": 0.0233, "step": 3950 }, { "epoch": 1.897694524495677, "grad_norm": 0.2973843482295719, "learning_rate": 3.5792757592661276e-06, "loss": 0.0208, "step": 3951 }, { "epoch": 1.898174831892411, "grad_norm": 0.3119779521461155, "learning_rate": 3.576596653343921e-06, "loss": 0.0305, "step": 3952 }, { "epoch": 1.898655139289145, "grad_norm": 0.302734490134068, "learning_rate": 3.5739179920535416e-06, "loss": 0.0237, "step": 3953 }, { "epoch": 1.899135446685879, "grad_norm": 0.29238504104283225, "learning_rate": 3.5712397762317284e-06, "loss": 0.0209, "step": 3954 }, { "epoch": 1.8996157540826129, "grad_norm": 0.30790172517183834, "learning_rate": 3.568562006715082e-06, "loss": 0.0207, "step": 3955 }, { "epoch": 1.9000960614793467, "grad_norm": 0.27135552562528387, "learning_rate": 3.565884684340063e-06, "loss": 0.0218, "step": 3956 }, { "epoch": 1.9005763688760808, "grad_norm": 0.25933605638600027, "learning_rate": 3.5632078099429936e-06, "loss": 0.0184, "step": 3957 }, { "epoch": 1.9010566762728147, "grad_norm": 0.5386828879952638, "learning_rate": 3.5605313843600555e-06, "loss": 0.0312, "step": 3958 }, { "epoch": 1.9015369836695486, "grad_norm": 0.2828107626245344, "learning_rate": 3.55785540842729e-06, "loss": 0.0193, "step": 3959 }, { "epoch": 1.9020172910662825, "grad_norm": 0.28587518320675603, "learning_rate": 3.555179882980597e-06, "loss": 0.0289, "step": 3960 }, { "epoch": 1.9024975984630164, "grad_norm": 0.4286365167967354, "learning_rate": 3.5525048088557366e-06, "loss": 0.0203, "step": 3961 }, { "epoch": 1.9029779058597502, "grad_norm": 0.2765241657189905, "learning_rate": 3.549830186888329e-06, "loss": 0.0191, "step": 3962 }, { "epoch": 1.9034582132564841, "grad_norm": 0.558481730355354, "learning_rate": 3.547156017913851e-06, "loss": 0.0355, "step": 3963 }, { "epoch": 1.903938520653218, "grad_norm": 0.44092998979648584, "learning_rate": 3.544482302767639e-06, "loss": 0.023, "step": 3964 }, { "epoch": 1.9044188280499519, "grad_norm": 0.37809001602597525, "learning_rate": 3.541809042284889e-06, "loss": 0.0267, "step": 3965 }, { "epoch": 1.9048991354466858, "grad_norm": 0.33514051687394775, "learning_rate": 3.539136237300653e-06, "loss": 0.0238, "step": 3966 }, { "epoch": 1.9053794428434196, "grad_norm": 0.24795118611159275, "learning_rate": 3.5364638886498405e-06, "loss": 0.0205, "step": 3967 }, { "epoch": 1.9058597502401537, "grad_norm": 0.4092894113423122, "learning_rate": 3.533791997167221e-06, "loss": 0.0326, "step": 3968 }, { "epoch": 1.9063400576368876, "grad_norm": 0.2911026261841958, "learning_rate": 3.531120563687419e-06, "loss": 0.0277, "step": 3969 }, { "epoch": 1.9068203650336215, "grad_norm": 0.4184923028330928, "learning_rate": 3.5284495890449145e-06, "loss": 0.0387, "step": 3970 }, { "epoch": 1.9073006724303554, "grad_norm": 0.5167030863056813, "learning_rate": 3.5257790740740462e-06, "loss": 0.0296, "step": 3971 }, { "epoch": 1.9077809798270895, "grad_norm": 0.3033306056423447, "learning_rate": 3.523109019609011e-06, "loss": 0.0239, "step": 3972 }, { "epoch": 1.9082612872238234, "grad_norm": 0.35608778481438325, "learning_rate": 3.5204394264838615e-06, "loss": 0.0274, "step": 3973 }, { "epoch": 1.9087415946205573, "grad_norm": 0.35338944352560575, "learning_rate": 3.5177702955325016e-06, "loss": 0.0298, "step": 3974 }, { "epoch": 1.9092219020172911, "grad_norm": 0.2951557093023686, "learning_rate": 3.515101627588695e-06, "loss": 0.022, "step": 3975 }, { "epoch": 1.909702209414025, "grad_norm": 0.29380004259716025, "learning_rate": 3.512433423486059e-06, "loss": 0.0246, "step": 3976 }, { "epoch": 1.910182516810759, "grad_norm": 0.4779275873127641, "learning_rate": 3.5097656840580686e-06, "loss": 0.0321, "step": 3977 }, { "epoch": 1.9106628242074928, "grad_norm": 0.36652572996491056, "learning_rate": 3.507098410138049e-06, "loss": 0.0301, "step": 3978 }, { "epoch": 1.9111431316042267, "grad_norm": 0.5570722383573192, "learning_rate": 3.5044316025591852e-06, "loss": 0.0315, "step": 3979 }, { "epoch": 1.9116234390009605, "grad_norm": 0.24163208586258408, "learning_rate": 3.5017652621545133e-06, "loss": 0.0176, "step": 3980 }, { "epoch": 1.9121037463976944, "grad_norm": 0.2402463291800181, "learning_rate": 3.4990993897569246e-06, "loss": 0.0212, "step": 3981 }, { "epoch": 1.9125840537944283, "grad_norm": 0.3567303728276317, "learning_rate": 3.496433986199165e-06, "loss": 0.0202, "step": 3982 }, { "epoch": 1.9130643611911622, "grad_norm": 0.3336151543561317, "learning_rate": 3.4937690523138302e-06, "loss": 0.0292, "step": 3983 }, { "epoch": 1.9135446685878963, "grad_norm": 0.32719208535927546, "learning_rate": 3.4911045889333727e-06, "loss": 0.0284, "step": 3984 }, { "epoch": 1.9140249759846302, "grad_norm": 0.38216615737457765, "learning_rate": 3.488440596890098e-06, "loss": 0.0475, "step": 3985 }, { "epoch": 1.914505283381364, "grad_norm": 0.32114817954208985, "learning_rate": 3.4857770770161613e-06, "loss": 0.0299, "step": 3986 }, { "epoch": 1.914985590778098, "grad_norm": 0.32664730708431244, "learning_rate": 3.4831140301435763e-06, "loss": 0.0268, "step": 3987 }, { "epoch": 1.915465898174832, "grad_norm": 0.3190841232877513, "learning_rate": 3.4804514571042024e-06, "loss": 0.0309, "step": 3988 }, { "epoch": 1.915946205571566, "grad_norm": 0.42848111323462085, "learning_rate": 3.4777893587297546e-06, "loss": 0.0386, "step": 3989 }, { "epoch": 1.9164265129682998, "grad_norm": 0.33000249421000766, "learning_rate": 3.4751277358517987e-06, "loss": 0.0354, "step": 3990 }, { "epoch": 1.9169068203650337, "grad_norm": 0.2827448527229141, "learning_rate": 3.4724665893017517e-06, "loss": 0.0236, "step": 3991 }, { "epoch": 1.9173871277617676, "grad_norm": 0.4478952844800793, "learning_rate": 3.4698059199108838e-06, "loss": 0.0323, "step": 3992 }, { "epoch": 1.9178674351585014, "grad_norm": 0.37356506231966413, "learning_rate": 3.46714572851031e-06, "loss": 0.03, "step": 3993 }, { "epoch": 1.9183477425552353, "grad_norm": 0.331175378607851, "learning_rate": 3.4644860159310055e-06, "loss": 0.0271, "step": 3994 }, { "epoch": 1.9188280499519692, "grad_norm": 0.2893317191376519, "learning_rate": 3.46182678300379e-06, "loss": 0.0243, "step": 3995 }, { "epoch": 1.919308357348703, "grad_norm": 0.3225560774699137, "learning_rate": 3.4591680305593333e-06, "loss": 0.0291, "step": 3996 }, { "epoch": 1.919788664745437, "grad_norm": 0.8812350960247172, "learning_rate": 3.456509759428156e-06, "loss": 0.0217, "step": 3997 }, { "epoch": 1.9202689721421708, "grad_norm": 0.33047880815438774, "learning_rate": 3.453851970440628e-06, "loss": 0.0213, "step": 3998 }, { "epoch": 1.920749279538905, "grad_norm": 0.648119907876343, "learning_rate": 3.45119466442697e-06, "loss": 0.0223, "step": 3999 }, { "epoch": 1.9212295869356388, "grad_norm": 0.26899840724961344, "learning_rate": 3.44853784221725e-06, "loss": 0.0276, "step": 4000 }, { "epoch": 1.9217098943323727, "grad_norm": 0.473109759940023, "learning_rate": 3.4458815046413875e-06, "loss": 0.022, "step": 4001 }, { "epoch": 1.9221902017291066, "grad_norm": 0.41721676774867117, "learning_rate": 3.4432256525291468e-06, "loss": 0.019, "step": 4002 }, { "epoch": 1.9226705091258407, "grad_norm": 0.372371608829545, "learning_rate": 3.440570286710144e-06, "loss": 0.023, "step": 4003 }, { "epoch": 1.9231508165225746, "grad_norm": 0.46733619570486407, "learning_rate": 3.4379154080138415e-06, "loss": 0.0262, "step": 4004 }, { "epoch": 1.9236311239193085, "grad_norm": 0.44206528105701204, "learning_rate": 3.435261017269551e-06, "loss": 0.0293, "step": 4005 }, { "epoch": 1.9241114313160423, "grad_norm": 0.32957225154923864, "learning_rate": 3.432607115306429e-06, "loss": 0.0282, "step": 4006 }, { "epoch": 1.9245917387127762, "grad_norm": 0.3273863986041303, "learning_rate": 3.4299537029534814e-06, "loss": 0.0233, "step": 4007 }, { "epoch": 1.92507204610951, "grad_norm": 0.4028759376434199, "learning_rate": 3.4273007810395585e-06, "loss": 0.0348, "step": 4008 }, { "epoch": 1.925552353506244, "grad_norm": 0.37369910621147817, "learning_rate": 3.424648350393366e-06, "loss": 0.0339, "step": 4009 }, { "epoch": 1.9260326609029779, "grad_norm": 0.3564872135614878, "learning_rate": 3.421996411843445e-06, "loss": 0.0247, "step": 4010 }, { "epoch": 1.9265129682997117, "grad_norm": 0.2856103917472692, "learning_rate": 3.4193449662181888e-06, "loss": 0.0199, "step": 4011 }, { "epoch": 1.9269932756964456, "grad_norm": 0.2804357117725187, "learning_rate": 3.416694014345836e-06, "loss": 0.0217, "step": 4012 }, { "epoch": 1.9274735830931795, "grad_norm": 0.2946671922633092, "learning_rate": 3.4140435570544708e-06, "loss": 0.0236, "step": 4013 }, { "epoch": 1.9279538904899134, "grad_norm": 0.645550113007527, "learning_rate": 3.4113935951720225e-06, "loss": 0.0191, "step": 4014 }, { "epoch": 1.9284341978866475, "grad_norm": 0.31928300283500455, "learning_rate": 3.4087441295262636e-06, "loss": 0.0262, "step": 4015 }, { "epoch": 1.9289145052833814, "grad_norm": 0.2996331720207276, "learning_rate": 3.406095160944818e-06, "loss": 0.0287, "step": 4016 }, { "epoch": 1.9293948126801153, "grad_norm": 0.28180168027216046, "learning_rate": 3.4034466902551476e-06, "loss": 0.0204, "step": 4017 }, { "epoch": 1.9298751200768491, "grad_norm": 0.44969845892214116, "learning_rate": 3.400798718284563e-06, "loss": 0.0379, "step": 4018 }, { "epoch": 1.9303554274735832, "grad_norm": 0.35287755807511056, "learning_rate": 3.3981512458602157e-06, "loss": 0.028, "step": 4019 }, { "epoch": 1.9308357348703171, "grad_norm": 0.44879188263991204, "learning_rate": 3.3955042738091033e-06, "loss": 0.0407, "step": 4020 }, { "epoch": 1.931316042267051, "grad_norm": 0.28911911587891065, "learning_rate": 3.3928578029580664e-06, "loss": 0.0251, "step": 4021 }, { "epoch": 1.9317963496637849, "grad_norm": 0.34585295867816995, "learning_rate": 3.39021183413379e-06, "loss": 0.0342, "step": 4022 }, { "epoch": 1.9322766570605188, "grad_norm": 0.3087314031090661, "learning_rate": 3.3875663681628014e-06, "loss": 0.0183, "step": 4023 }, { "epoch": 1.9327569644572526, "grad_norm": 0.2774277731742307, "learning_rate": 3.3849214058714707e-06, "loss": 0.0222, "step": 4024 }, { "epoch": 1.9332372718539865, "grad_norm": 0.5311270694719505, "learning_rate": 3.3822769480860107e-06, "loss": 0.0164, "step": 4025 }, { "epoch": 1.9337175792507204, "grad_norm": 0.33471900289746254, "learning_rate": 3.379632995632478e-06, "loss": 0.0242, "step": 4026 }, { "epoch": 1.9341978866474543, "grad_norm": 0.3809132030284768, "learning_rate": 3.3769895493367694e-06, "loss": 0.024, "step": 4027 }, { "epoch": 1.9346781940441882, "grad_norm": 0.2526395403216402, "learning_rate": 3.3743466100246257e-06, "loss": 0.0208, "step": 4028 }, { "epoch": 1.935158501440922, "grad_norm": 0.6087118434880749, "learning_rate": 3.3717041785216253e-06, "loss": 0.0309, "step": 4029 }, { "epoch": 1.9356388088376562, "grad_norm": 0.3792683109297968, "learning_rate": 3.3690622556531904e-06, "loss": 0.0229, "step": 4030 }, { "epoch": 1.93611911623439, "grad_norm": 0.5723375120246615, "learning_rate": 3.3664208422445876e-06, "loss": 0.0325, "step": 4031 }, { "epoch": 1.936599423631124, "grad_norm": 0.2692646147155132, "learning_rate": 3.3637799391209225e-06, "loss": 0.0175, "step": 4032 }, { "epoch": 1.9370797310278578, "grad_norm": 0.31878965178734764, "learning_rate": 3.361139547107136e-06, "loss": 0.0294, "step": 4033 }, { "epoch": 1.937560038424592, "grad_norm": 0.28565349883358787, "learning_rate": 3.3584996670280155e-06, "loss": 0.0213, "step": 4034 }, { "epoch": 1.9380403458213258, "grad_norm": 0.33628326485271565, "learning_rate": 3.355860299708187e-06, "loss": 0.0235, "step": 4035 }, { "epoch": 1.9385206532180597, "grad_norm": 0.3080872335858445, "learning_rate": 3.353221445972114e-06, "loss": 0.0411, "step": 4036 }, { "epoch": 1.9390009606147935, "grad_norm": 1.1380695254862925, "learning_rate": 3.3505831066441017e-06, "loss": 0.0269, "step": 4037 }, { "epoch": 1.9394812680115274, "grad_norm": 0.37469173544145695, "learning_rate": 3.347945282548297e-06, "loss": 0.0257, "step": 4038 }, { "epoch": 1.9399615754082613, "grad_norm": 0.3705707913104281, "learning_rate": 3.3453079745086813e-06, "loss": 0.0238, "step": 4039 }, { "epoch": 1.9404418828049952, "grad_norm": 0.33452812516951347, "learning_rate": 3.3426711833490766e-06, "loss": 0.0228, "step": 4040 }, { "epoch": 1.940922190201729, "grad_norm": 0.3717032465331187, "learning_rate": 3.340034909893144e-06, "loss": 0.0206, "step": 4041 }, { "epoch": 1.941402497598463, "grad_norm": 0.28737224631793307, "learning_rate": 3.3373991549643814e-06, "loss": 0.0221, "step": 4042 }, { "epoch": 1.9418828049951968, "grad_norm": 0.2531904853884708, "learning_rate": 3.334763919386127e-06, "loss": 0.0183, "step": 4043 }, { "epoch": 1.9423631123919307, "grad_norm": 0.30664628390698634, "learning_rate": 3.3321292039815524e-06, "loss": 0.0234, "step": 4044 }, { "epoch": 1.9428434197886646, "grad_norm": 0.43365631567563934, "learning_rate": 3.329495009573675e-06, "loss": 0.0355, "step": 4045 }, { "epoch": 1.9433237271853987, "grad_norm": 0.38890861270272575, "learning_rate": 3.326861336985341e-06, "loss": 0.0202, "step": 4046 }, { "epoch": 1.9438040345821326, "grad_norm": 0.3121657346493432, "learning_rate": 3.324228187039237e-06, "loss": 0.0246, "step": 4047 }, { "epoch": 1.9442843419788665, "grad_norm": 0.23529505125643185, "learning_rate": 3.3215955605578865e-06, "loss": 0.023, "step": 4048 }, { "epoch": 1.9447646493756003, "grad_norm": 0.33838803425280956, "learning_rate": 3.31896345836365e-06, "loss": 0.0192, "step": 4049 }, { "epoch": 1.9452449567723344, "grad_norm": 0.3039093886352605, "learning_rate": 3.3163318812787215e-06, "loss": 0.0209, "step": 4050 }, { "epoch": 1.9457252641690683, "grad_norm": 0.3212167238797865, "learning_rate": 3.313700830125136e-06, "loss": 0.03, "step": 4051 }, { "epoch": 1.9462055715658022, "grad_norm": 0.2799581150227413, "learning_rate": 3.311070305724756e-06, "loss": 0.0184, "step": 4052 }, { "epoch": 1.946685878962536, "grad_norm": 0.36537324695452594, "learning_rate": 3.308440308899289e-06, "loss": 0.0282, "step": 4053 }, { "epoch": 1.94716618635927, "grad_norm": 0.28279503521905, "learning_rate": 3.305810840470273e-06, "loss": 0.0266, "step": 4054 }, { "epoch": 1.9476464937560038, "grad_norm": 0.27183131638267966, "learning_rate": 3.3031819012590797e-06, "loss": 0.0255, "step": 4055 }, { "epoch": 1.9481268011527377, "grad_norm": 3.3392321300303474, "learning_rate": 3.3005534920869175e-06, "loss": 0.0294, "step": 4056 }, { "epoch": 1.9486071085494716, "grad_norm": 0.39464920082736044, "learning_rate": 3.2979256137748283e-06, "loss": 0.0287, "step": 4057 }, { "epoch": 1.9490874159462055, "grad_norm": 0.46119059129667567, "learning_rate": 3.2952982671436883e-06, "loss": 0.032, "step": 4058 }, { "epoch": 1.9495677233429394, "grad_norm": 0.27317737794771585, "learning_rate": 3.2926714530142085e-06, "loss": 0.0293, "step": 4059 }, { "epoch": 1.9500480307396733, "grad_norm": 0.2349966148201107, "learning_rate": 3.2900451722069338e-06, "loss": 0.0171, "step": 4060 }, { "epoch": 1.9505283381364071, "grad_norm": 0.2968256868103406, "learning_rate": 3.2874194255422397e-06, "loss": 0.0207, "step": 4061 }, { "epoch": 1.9510086455331412, "grad_norm": 0.30055760253467484, "learning_rate": 3.2847942138403392e-06, "loss": 0.0248, "step": 4062 }, { "epoch": 1.9514889529298751, "grad_norm": 0.32956460274726374, "learning_rate": 3.282169537921274e-06, "loss": 0.0222, "step": 4063 }, { "epoch": 1.951969260326609, "grad_norm": 0.3840535892833138, "learning_rate": 3.279545398604922e-06, "loss": 0.0363, "step": 4064 }, { "epoch": 1.952449567723343, "grad_norm": 0.28066840952018773, "learning_rate": 3.2769217967109895e-06, "loss": 0.0226, "step": 4065 }, { "epoch": 1.952929875120077, "grad_norm": 0.21585391229517653, "learning_rate": 3.274298733059016e-06, "loss": 0.0174, "step": 4066 }, { "epoch": 1.9534101825168109, "grad_norm": 0.39027680801774917, "learning_rate": 3.2716762084683783e-06, "loss": 0.0284, "step": 4067 }, { "epoch": 1.9538904899135447, "grad_norm": 0.3531315605775286, "learning_rate": 3.269054223758279e-06, "loss": 0.0214, "step": 4068 }, { "epoch": 1.9543707973102786, "grad_norm": 0.357339633621682, "learning_rate": 3.2664327797477524e-06, "loss": 0.0316, "step": 4069 }, { "epoch": 1.9548511047070125, "grad_norm": 0.498602767789916, "learning_rate": 3.263811877255666e-06, "loss": 0.0288, "step": 4070 }, { "epoch": 1.9553314121037464, "grad_norm": 0.4858665784161083, "learning_rate": 3.261191517100716e-06, "loss": 0.0321, "step": 4071 }, { "epoch": 1.9558117195004803, "grad_norm": 0.4734215773051098, "learning_rate": 3.2585717001014316e-06, "loss": 0.0295, "step": 4072 }, { "epoch": 1.9562920268972142, "grad_norm": 0.3266916928117787, "learning_rate": 3.2559524270761713e-06, "loss": 0.0274, "step": 4073 }, { "epoch": 1.956772334293948, "grad_norm": 0.26769915321031007, "learning_rate": 3.253333698843122e-06, "loss": 0.0257, "step": 4074 }, { "epoch": 1.957252641690682, "grad_norm": 0.38381219078225093, "learning_rate": 3.250715516220304e-06, "loss": 0.0296, "step": 4075 }, { "epoch": 1.9577329490874158, "grad_norm": 0.3087162866963168, "learning_rate": 3.248097880025564e-06, "loss": 0.0284, "step": 4076 }, { "epoch": 1.95821325648415, "grad_norm": 0.26241760062434705, "learning_rate": 3.2454807910765807e-06, "loss": 0.0249, "step": 4077 }, { "epoch": 1.9586935638808838, "grad_norm": 0.4287358655357203, "learning_rate": 3.2428642501908573e-06, "loss": 0.0289, "step": 4078 }, { "epoch": 1.9591738712776177, "grad_norm": 0.308637882272711, "learning_rate": 3.240248258185731e-06, "loss": 0.019, "step": 4079 }, { "epoch": 1.9596541786743515, "grad_norm": 0.3540909360906945, "learning_rate": 3.237632815878364e-06, "loss": 0.0306, "step": 4080 }, { "epoch": 1.9601344860710856, "grad_norm": 0.3228158176181618, "learning_rate": 3.2350179240857487e-06, "loss": 0.0328, "step": 4081 }, { "epoch": 1.9606147934678195, "grad_norm": 0.31700440635447713, "learning_rate": 3.232403583624706e-06, "loss": 0.0242, "step": 4082 }, { "epoch": 1.9610951008645534, "grad_norm": 0.31370132243750154, "learning_rate": 3.2297897953118817e-06, "loss": 0.0244, "step": 4083 }, { "epoch": 1.9615754082612873, "grad_norm": 0.30818874656906536, "learning_rate": 3.227176559963753e-06, "loss": 0.0206, "step": 4084 }, { "epoch": 1.9620557156580212, "grad_norm": 0.3405517514875438, "learning_rate": 3.2245638783966203e-06, "loss": 0.0225, "step": 4085 }, { "epoch": 1.962536023054755, "grad_norm": 0.3531588316121387, "learning_rate": 3.221951751426614e-06, "loss": 0.0365, "step": 4086 }, { "epoch": 1.963016330451489, "grad_norm": 0.2756386680187405, "learning_rate": 3.2193401798696915e-06, "loss": 0.0214, "step": 4087 }, { "epoch": 1.9634966378482228, "grad_norm": 0.6021017739611394, "learning_rate": 3.2167291645416314e-06, "loss": 0.023, "step": 4088 }, { "epoch": 1.9639769452449567, "grad_norm": 0.21862441614449274, "learning_rate": 3.214118706258047e-06, "loss": 0.0181, "step": 4089 }, { "epoch": 1.9644572526416906, "grad_norm": 0.2511186686077169, "learning_rate": 3.2115088058343725e-06, "loss": 0.026, "step": 4090 }, { "epoch": 1.9649375600384245, "grad_norm": 0.33812704818892003, "learning_rate": 3.2088994640858685e-06, "loss": 0.0243, "step": 4091 }, { "epoch": 1.9654178674351583, "grad_norm": 0.33939823895922366, "learning_rate": 3.2062906818276195e-06, "loss": 0.0261, "step": 4092 }, { "epoch": 1.9658981748318924, "grad_norm": 0.35584948266540317, "learning_rate": 3.2036824598745377e-06, "loss": 0.0224, "step": 4093 }, { "epoch": 1.9663784822286263, "grad_norm": 0.42826967264207494, "learning_rate": 3.2010747990413597e-06, "loss": 0.0354, "step": 4094 }, { "epoch": 1.9668587896253602, "grad_norm": 0.40724852789947147, "learning_rate": 3.198467700142647e-06, "loss": 0.0259, "step": 4095 }, { "epoch": 1.967339097022094, "grad_norm": 0.27825580510068587, "learning_rate": 3.195861163992783e-06, "loss": 0.0252, "step": 4096 }, { "epoch": 1.9678194044188282, "grad_norm": 0.24630043008779678, "learning_rate": 3.1932551914059814e-06, "loss": 0.0269, "step": 4097 }, { "epoch": 1.968299711815562, "grad_norm": 0.3150934291251204, "learning_rate": 3.190649783196273e-06, "loss": 0.0258, "step": 4098 }, { "epoch": 1.968780019212296, "grad_norm": 0.2793702991046677, "learning_rate": 3.188044940177516e-06, "loss": 0.0178, "step": 4099 }, { "epoch": 1.9692603266090298, "grad_norm": 0.32004541664393443, "learning_rate": 3.185440663163393e-06, "loss": 0.0236, "step": 4100 }, { "epoch": 1.9697406340057637, "grad_norm": 0.7207555148491791, "learning_rate": 3.182836952967405e-06, "loss": 0.0371, "step": 4101 }, { "epoch": 1.9702209414024976, "grad_norm": 0.381619302767597, "learning_rate": 3.1802338104028803e-06, "loss": 0.0224, "step": 4102 }, { "epoch": 1.9707012487992315, "grad_norm": 0.3367055682940999, "learning_rate": 3.1776312362829663e-06, "loss": 0.0199, "step": 4103 }, { "epoch": 1.9711815561959654, "grad_norm": 0.3069683597624844, "learning_rate": 3.1750292314206416e-06, "loss": 0.0226, "step": 4104 }, { "epoch": 1.9716618635926992, "grad_norm": 0.2736276484483542, "learning_rate": 3.1724277966286953e-06, "loss": 0.0206, "step": 4105 }, { "epoch": 1.9721421709894331, "grad_norm": 0.40678160862169377, "learning_rate": 3.169826932719745e-06, "loss": 0.0191, "step": 4106 }, { "epoch": 1.972622478386167, "grad_norm": 0.4048893123332616, "learning_rate": 3.167226640506228e-06, "loss": 0.0263, "step": 4107 }, { "epoch": 1.973102785782901, "grad_norm": 0.2998796656850621, "learning_rate": 3.164626920800404e-06, "loss": 0.0184, "step": 4108 }, { "epoch": 1.973583093179635, "grad_norm": 0.375484922548667, "learning_rate": 3.1620277744143547e-06, "loss": 0.0372, "step": 4109 }, { "epoch": 1.9740634005763689, "grad_norm": 0.4788026496180691, "learning_rate": 3.159429202159979e-06, "loss": 0.0292, "step": 4110 }, { "epoch": 1.9745437079731027, "grad_norm": 0.36965745308442666, "learning_rate": 3.156831204849002e-06, "loss": 0.0276, "step": 4111 }, { "epoch": 1.9750240153698368, "grad_norm": 0.33293927811431073, "learning_rate": 3.154233783292964e-06, "loss": 0.022, "step": 4112 }, { "epoch": 1.9755043227665707, "grad_norm": 0.2710489656386871, "learning_rate": 3.1516369383032285e-06, "loss": 0.0201, "step": 4113 }, { "epoch": 1.9759846301633046, "grad_norm": 0.31306909150615947, "learning_rate": 3.149040670690979e-06, "loss": 0.0231, "step": 4114 }, { "epoch": 1.9764649375600385, "grad_norm": 0.4665912407881344, "learning_rate": 3.146444981267216e-06, "loss": 0.0312, "step": 4115 }, { "epoch": 1.9769452449567724, "grad_norm": 0.3273102385827459, "learning_rate": 3.143849870842761e-06, "loss": 0.0305, "step": 4116 }, { "epoch": 1.9774255523535063, "grad_norm": 0.27448139817279515, "learning_rate": 3.1412553402282564e-06, "loss": 0.0199, "step": 4117 }, { "epoch": 1.9779058597502401, "grad_norm": 0.32139249654646346, "learning_rate": 3.1386613902341585e-06, "loss": 0.0296, "step": 4118 }, { "epoch": 1.978386167146974, "grad_norm": 0.3561843526260465, "learning_rate": 3.136068021670749e-06, "loss": 0.0267, "step": 4119 }, { "epoch": 1.978866474543708, "grad_norm": 0.3524621684986661, "learning_rate": 3.1334752353481236e-06, "loss": 0.0221, "step": 4120 }, { "epoch": 1.9793467819404418, "grad_norm": 0.32113510294441716, "learning_rate": 3.1308830320761964e-06, "loss": 0.0249, "step": 4121 }, { "epoch": 1.9798270893371757, "grad_norm": 0.3257338594535337, "learning_rate": 3.1282914126647e-06, "loss": 0.0342, "step": 4122 }, { "epoch": 1.9803073967339095, "grad_norm": 0.3797930959844205, "learning_rate": 3.125700377923186e-06, "loss": 0.0379, "step": 4123 }, { "epoch": 1.9807877041306436, "grad_norm": 0.6666343530348542, "learning_rate": 3.1231099286610197e-06, "loss": 0.0312, "step": 4124 }, { "epoch": 1.9812680115273775, "grad_norm": 0.29475559044382116, "learning_rate": 3.1205200656873845e-06, "loss": 0.0155, "step": 4125 }, { "epoch": 1.9817483189241114, "grad_norm": 0.3275071561491951, "learning_rate": 3.117930789811287e-06, "loss": 0.0351, "step": 4126 }, { "epoch": 1.9822286263208453, "grad_norm": 0.4960471128078699, "learning_rate": 3.1153421018415435e-06, "loss": 0.0257, "step": 4127 }, { "epoch": 1.9827089337175794, "grad_norm": 0.22831041493897666, "learning_rate": 3.112754002586786e-06, "loss": 0.0157, "step": 4128 }, { "epoch": 1.9831892411143133, "grad_norm": 0.2795440497212657, "learning_rate": 3.110166492855468e-06, "loss": 0.0166, "step": 4129 }, { "epoch": 1.9836695485110472, "grad_norm": 0.31795088713055114, "learning_rate": 3.1075795734558545e-06, "loss": 0.0214, "step": 4130 }, { "epoch": 1.984149855907781, "grad_norm": 0.4038132765793031, "learning_rate": 3.1049932451960274e-06, "loss": 0.0302, "step": 4131 }, { "epoch": 1.984630163304515, "grad_norm": 0.4164069748058171, "learning_rate": 3.1024075088838834e-06, "loss": 0.0263, "step": 4132 }, { "epoch": 1.9851104707012488, "grad_norm": 0.3433599284222044, "learning_rate": 3.099822365327137e-06, "loss": 0.0203, "step": 4133 }, { "epoch": 1.9855907780979827, "grad_norm": 0.3347372046012402, "learning_rate": 3.097237815333315e-06, "loss": 0.0236, "step": 4134 }, { "epoch": 1.9860710854947166, "grad_norm": 0.31241701311935466, "learning_rate": 3.0946538597097588e-06, "loss": 0.0273, "step": 4135 }, { "epoch": 1.9865513928914504, "grad_norm": 0.3126710335001475, "learning_rate": 3.092070499263625e-06, "loss": 0.0216, "step": 4136 }, { "epoch": 1.9870317002881843, "grad_norm": 0.28115442016998365, "learning_rate": 3.0894877348018835e-06, "loss": 0.0237, "step": 4137 }, { "epoch": 1.9875120076849182, "grad_norm": 0.30796739312985805, "learning_rate": 3.0869055671313193e-06, "loss": 0.022, "step": 4138 }, { "epoch": 1.9879923150816523, "grad_norm": 0.33368853239422414, "learning_rate": 3.0843239970585287e-06, "loss": 0.0208, "step": 4139 }, { "epoch": 1.9884726224783862, "grad_norm": 0.3080866608911141, "learning_rate": 3.081743025389923e-06, "loss": 0.0254, "step": 4140 }, { "epoch": 1.98895292987512, "grad_norm": 0.46796479863372126, "learning_rate": 3.0791626529317275e-06, "loss": 0.0294, "step": 4141 }, { "epoch": 1.989433237271854, "grad_norm": 0.3365510939558686, "learning_rate": 3.076582880489979e-06, "loss": 0.0342, "step": 4142 }, { "epoch": 1.989913544668588, "grad_norm": 0.26594173046286107, "learning_rate": 3.0740037088705254e-06, "loss": 0.0246, "step": 4143 }, { "epoch": 1.990393852065322, "grad_norm": 0.26166289204606125, "learning_rate": 3.071425138879031e-06, "loss": 0.0236, "step": 4144 }, { "epoch": 1.9908741594620558, "grad_norm": 0.29590791594285043, "learning_rate": 3.068847171320969e-06, "loss": 0.0227, "step": 4145 }, { "epoch": 1.9913544668587897, "grad_norm": 0.36631530607947344, "learning_rate": 3.0662698070016246e-06, "loss": 0.0276, "step": 4146 }, { "epoch": 1.9918347742555236, "grad_norm": 0.2557551584460393, "learning_rate": 3.0636930467260927e-06, "loss": 0.0254, "step": 4147 }, { "epoch": 1.9923150816522575, "grad_norm": 0.4569429110912114, "learning_rate": 3.061116891299286e-06, "loss": 0.0295, "step": 4148 }, { "epoch": 1.9927953890489913, "grad_norm": 0.2977994932770977, "learning_rate": 3.058541341525923e-06, "loss": 0.0238, "step": 4149 }, { "epoch": 1.9932756964457252, "grad_norm": 0.35545260769659914, "learning_rate": 3.0559663982105346e-06, "loss": 0.0287, "step": 4150 }, { "epoch": 1.993756003842459, "grad_norm": 0.2961122891009182, "learning_rate": 3.0533920621574597e-06, "loss": 0.0191, "step": 4151 }, { "epoch": 1.994236311239193, "grad_norm": 0.2516480006948285, "learning_rate": 3.050818334170852e-06, "loss": 0.0158, "step": 4152 }, { "epoch": 1.9947166186359269, "grad_norm": 0.37339756376024197, "learning_rate": 3.0482452150546714e-06, "loss": 0.0231, "step": 4153 }, { "epoch": 1.9951969260326607, "grad_norm": 0.3646807358109965, "learning_rate": 3.0456727056126885e-06, "loss": 0.0307, "step": 4154 }, { "epoch": 1.9956772334293948, "grad_norm": 0.33677997696622125, "learning_rate": 3.0431008066484858e-06, "loss": 0.0224, "step": 4155 }, { "epoch": 1.9961575408261287, "grad_norm": 0.37372343715108974, "learning_rate": 3.0405295189654537e-06, "loss": 0.0268, "step": 4156 }, { "epoch": 1.9966378482228626, "grad_norm": 0.2947509765904424, "learning_rate": 3.03795884336679e-06, "loss": 0.0268, "step": 4157 }, { "epoch": 1.9971181556195965, "grad_norm": 0.2888218715483949, "learning_rate": 3.0353887806555033e-06, "loss": 0.0255, "step": 4158 }, { "epoch": 1.9975984630163306, "grad_norm": 0.4784285114757389, "learning_rate": 3.0328193316344107e-06, "loss": 0.0233, "step": 4159 }, { "epoch": 1.9980787704130645, "grad_norm": 0.29220426288311824, "learning_rate": 3.0302504971061353e-06, "loss": 0.0207, "step": 4160 }, { "epoch": 1.9985590778097984, "grad_norm": 0.35432001839232613, "learning_rate": 3.0276822778731108e-06, "loss": 0.0286, "step": 4161 }, { "epoch": 1.9990393852065322, "grad_norm": 0.7302457489856938, "learning_rate": 3.025114674737576e-06, "loss": 0.038, "step": 4162 }, { "epoch": 1.9995196926032661, "grad_norm": 0.695796699374891, "learning_rate": 3.0225476885015837e-06, "loss": 0.0425, "step": 4163 }, { "epoch": 2.0, "grad_norm": 0.31847332598133876, "learning_rate": 3.019981319966985e-06, "loss": 0.03, "step": 4164 }, { "epoch": 2.0, "eval_loss": 0.026030490174889565, "eval_runtime": 506.8783, "eval_samples_per_second": 33.004, "eval_steps_per_second": 1.032, "step": 4164 } ], "logging_steps": 1, "max_steps": 6246, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4177220461498204e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }