{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2082, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004803073967339097, "grad_norm": 12.295453677898383, "learning_rate": 1.6e-08, "loss": 0.3404, "step": 1 }, { "epoch": 0.0009606147934678194, "grad_norm": 11.115325503679177, "learning_rate": 3.2e-08, "loss": 0.2956, "step": 2 }, { "epoch": 0.001440922190201729, "grad_norm": 12.134299471519068, "learning_rate": 4.8e-08, "loss": 0.3497, "step": 3 }, { "epoch": 0.0019212295869356388, "grad_norm": 11.889325768094226, "learning_rate": 6.4e-08, "loss": 0.3239, "step": 4 }, { "epoch": 0.0024015369836695487, "grad_norm": 11.017686747111874, "learning_rate": 8e-08, "loss": 0.2895, "step": 5 }, { "epoch": 0.002881844380403458, "grad_norm": 10.764449822783085, "learning_rate": 9.6e-08, "loss": 0.3062, "step": 6 }, { "epoch": 0.0033621517771373678, "grad_norm": 9.810863684605177, "learning_rate": 1.1200000000000001e-07, "loss": 0.3056, "step": 7 }, { "epoch": 0.0038424591738712775, "grad_norm": 11.438972361023298, "learning_rate": 1.28e-07, "loss": 0.294, "step": 8 }, { "epoch": 0.004322766570605188, "grad_norm": 11.696011568247082, "learning_rate": 1.4400000000000002e-07, "loss": 0.3285, "step": 9 }, { "epoch": 0.004803073967339097, "grad_norm": 11.507512344037782, "learning_rate": 1.6e-07, "loss": 0.314, "step": 10 }, { "epoch": 0.005283381364073006, "grad_norm": 10.684581565163377, "learning_rate": 1.7600000000000001e-07, "loss": 0.2883, "step": 11 }, { "epoch": 0.005763688760806916, "grad_norm": 11.24558403809558, "learning_rate": 1.92e-07, "loss": 0.3116, "step": 12 }, { "epoch": 0.006243996157540826, "grad_norm": 10.790135146175551, "learning_rate": 2.08e-07, "loss": 0.3244, "step": 13 }, { "epoch": 0.0067243035542747355, "grad_norm": 10.608527245093223, "learning_rate": 2.2400000000000002e-07, "loss": 0.2747, "step": 14 }, { "epoch": 0.007204610951008645, "grad_norm": 10.265000231723027, "learning_rate": 2.4000000000000003e-07, "loss": 0.2674, "step": 15 }, { "epoch": 0.007684918347742555, "grad_norm": 11.298317377682416, "learning_rate": 2.56e-07, "loss": 0.2679, "step": 16 }, { "epoch": 0.008165225744476465, "grad_norm": 9.86248788668308, "learning_rate": 2.72e-07, "loss": 0.3099, "step": 17 }, { "epoch": 0.008645533141210375, "grad_norm": 9.0577072692478, "learning_rate": 2.8800000000000004e-07, "loss": 0.2674, "step": 18 }, { "epoch": 0.009125840537944284, "grad_norm": 7.238412780645614, "learning_rate": 3.04e-07, "loss": 0.2082, "step": 19 }, { "epoch": 0.009606147934678195, "grad_norm": 9.490442029212941, "learning_rate": 3.2e-07, "loss": 0.2548, "step": 20 }, { "epoch": 0.010086455331412104, "grad_norm": 7.936036846955943, "learning_rate": 3.36e-07, "loss": 0.2487, "step": 21 }, { "epoch": 0.010566762728146013, "grad_norm": 8.384911819682703, "learning_rate": 3.5200000000000003e-07, "loss": 0.2578, "step": 22 }, { "epoch": 0.011047070124879923, "grad_norm": 6.667251467184058, "learning_rate": 3.68e-07, "loss": 0.2315, "step": 23 }, { "epoch": 0.011527377521613832, "grad_norm": 5.388532747106631, "learning_rate": 3.84e-07, "loss": 0.2153, "step": 24 }, { "epoch": 0.012007684918347743, "grad_norm": 5.076276759500522, "learning_rate": 4.0000000000000003e-07, "loss": 0.1773, "step": 25 }, { "epoch": 0.012487992315081652, "grad_norm": 5.044901367454851, "learning_rate": 4.16e-07, "loss": 0.1981, "step": 26 }, { "epoch": 0.012968299711815562, "grad_norm": 4.6484570213541145, "learning_rate": 4.3200000000000006e-07, "loss": 0.1962, "step": 27 }, { "epoch": 0.013448607108549471, "grad_norm": 4.820579032408368, "learning_rate": 4.4800000000000004e-07, "loss": 0.1923, "step": 28 }, { "epoch": 0.013928914505283382, "grad_norm": 3.6954293876191535, "learning_rate": 4.64e-07, "loss": 0.1827, "step": 29 }, { "epoch": 0.01440922190201729, "grad_norm": 3.8040876250915083, "learning_rate": 4.800000000000001e-07, "loss": 0.1959, "step": 30 }, { "epoch": 0.014889529298751201, "grad_norm": 3.0166885970679043, "learning_rate": 4.96e-07, "loss": 0.1652, "step": 31 }, { "epoch": 0.01536983669548511, "grad_norm": 2.2910049629070497, "learning_rate": 5.12e-07, "loss": 0.1723, "step": 32 }, { "epoch": 0.01585014409221902, "grad_norm": 2.7933136537228744, "learning_rate": 5.280000000000001e-07, "loss": 0.1663, "step": 33 }, { "epoch": 0.01633045148895293, "grad_norm": 2.5494412148634487, "learning_rate": 5.44e-07, "loss": 0.1593, "step": 34 }, { "epoch": 0.01681075888568684, "grad_norm": 2.7365853076508504, "learning_rate": 5.6e-07, "loss": 0.1845, "step": 35 }, { "epoch": 0.01729106628242075, "grad_norm": 2.6326300548480956, "learning_rate": 5.760000000000001e-07, "loss": 0.1734, "step": 36 }, { "epoch": 0.01777137367915466, "grad_norm": 1.9178697505579982, "learning_rate": 5.920000000000001e-07, "loss": 0.1249, "step": 37 }, { "epoch": 0.01825168107588857, "grad_norm": 2.5724433031106853, "learning_rate": 6.08e-07, "loss": 0.1308, "step": 38 }, { "epoch": 0.018731988472622477, "grad_norm": 2.6912073654452016, "learning_rate": 6.24e-07, "loss": 0.1527, "step": 39 }, { "epoch": 0.01921229586935639, "grad_norm": 2.167890593402019, "learning_rate": 6.4e-07, "loss": 0.1551, "step": 40 }, { "epoch": 0.0196926032660903, "grad_norm": 1.9023821581489182, "learning_rate": 6.560000000000002e-07, "loss": 0.1264, "step": 41 }, { "epoch": 0.020172910662824207, "grad_norm": 3.3946961363245363, "learning_rate": 6.72e-07, "loss": 0.183, "step": 42 }, { "epoch": 0.020653218059558116, "grad_norm": 2.2864808587177783, "learning_rate": 6.88e-07, "loss": 0.1473, "step": 43 }, { "epoch": 0.021133525456292025, "grad_norm": 2.0274847089097765, "learning_rate": 7.040000000000001e-07, "loss": 0.125, "step": 44 }, { "epoch": 0.021613832853025938, "grad_norm": 2.315770203936131, "learning_rate": 7.2e-07, "loss": 0.1362, "step": 45 }, { "epoch": 0.022094140249759846, "grad_norm": 2.305110403527537, "learning_rate": 7.36e-07, "loss": 0.1515, "step": 46 }, { "epoch": 0.022574447646493755, "grad_norm": 2.3692793672422985, "learning_rate": 7.520000000000001e-07, "loss": 0.1427, "step": 47 }, { "epoch": 0.023054755043227664, "grad_norm": 1.853538426035806, "learning_rate": 7.68e-07, "loss": 0.13, "step": 48 }, { "epoch": 0.023535062439961577, "grad_norm": 1.7993480609748818, "learning_rate": 7.84e-07, "loss": 0.1384, "step": 49 }, { "epoch": 0.024015369836695485, "grad_norm": 1.2624888010807611, "learning_rate": 8.000000000000001e-07, "loss": 0.0812, "step": 50 }, { "epoch": 0.024495677233429394, "grad_norm": 1.654775529490419, "learning_rate": 8.160000000000001e-07, "loss": 0.1289, "step": 51 }, { "epoch": 0.024975984630163303, "grad_norm": 1.9053266889486429, "learning_rate": 8.32e-07, "loss": 0.128, "step": 52 }, { "epoch": 0.025456292026897216, "grad_norm": 1.3131685994032491, "learning_rate": 8.480000000000001e-07, "loss": 0.1099, "step": 53 }, { "epoch": 0.025936599423631124, "grad_norm": 1.8073567315806054, "learning_rate": 8.640000000000001e-07, "loss": 0.1209, "step": 54 }, { "epoch": 0.026416906820365033, "grad_norm": 1.5555744040575845, "learning_rate": 8.8e-07, "loss": 0.1268, "step": 55 }, { "epoch": 0.026897214217098942, "grad_norm": 1.1507622935499833, "learning_rate": 8.960000000000001e-07, "loss": 0.0939, "step": 56 }, { "epoch": 0.027377521613832854, "grad_norm": 1.8426592777636386, "learning_rate": 9.120000000000001e-07, "loss": 0.1267, "step": 57 }, { "epoch": 0.027857829010566763, "grad_norm": 1.8172160960989603, "learning_rate": 9.28e-07, "loss": 0.1419, "step": 58 }, { "epoch": 0.028338136407300672, "grad_norm": 1.734138032657134, "learning_rate": 9.440000000000001e-07, "loss": 0.1063, "step": 59 }, { "epoch": 0.02881844380403458, "grad_norm": 1.9904099384917207, "learning_rate": 9.600000000000001e-07, "loss": 0.1547, "step": 60 }, { "epoch": 0.029298751200768493, "grad_norm": 1.431824839460655, "learning_rate": 9.76e-07, "loss": 0.1297, "step": 61 }, { "epoch": 0.029779058597502402, "grad_norm": 1.3047846711226216, "learning_rate": 9.92e-07, "loss": 0.0861, "step": 62 }, { "epoch": 0.03025936599423631, "grad_norm": 1.9570883092765483, "learning_rate": 1.0080000000000001e-06, "loss": 0.1552, "step": 63 }, { "epoch": 0.03073967339097022, "grad_norm": 1.2651593114993294, "learning_rate": 1.024e-06, "loss": 0.0973, "step": 64 }, { "epoch": 0.03121998078770413, "grad_norm": 1.576842206277053, "learning_rate": 1.04e-06, "loss": 0.1015, "step": 65 }, { "epoch": 0.03170028818443804, "grad_norm": 1.7388488234803576, "learning_rate": 1.0560000000000001e-06, "loss": 0.1098, "step": 66 }, { "epoch": 0.03218059558117195, "grad_norm": 1.5418141689606337, "learning_rate": 1.072e-06, "loss": 0.0982, "step": 67 }, { "epoch": 0.03266090297790586, "grad_norm": 1.7260585222683362, "learning_rate": 1.088e-06, "loss": 0.1348, "step": 68 }, { "epoch": 0.03314121037463977, "grad_norm": 1.4436799175309636, "learning_rate": 1.1040000000000001e-06, "loss": 0.1192, "step": 69 }, { "epoch": 0.03362151777137368, "grad_norm": 3.222503996824014, "learning_rate": 1.12e-06, "loss": 0.1234, "step": 70 }, { "epoch": 0.034101825168107586, "grad_norm": 1.6939823082128094, "learning_rate": 1.1360000000000002e-06, "loss": 0.1165, "step": 71 }, { "epoch": 0.0345821325648415, "grad_norm": 1.8379294842379774, "learning_rate": 1.1520000000000002e-06, "loss": 0.1185, "step": 72 }, { "epoch": 0.03506243996157541, "grad_norm": 1.5569252017032666, "learning_rate": 1.168e-06, "loss": 0.1312, "step": 73 }, { "epoch": 0.03554274735830932, "grad_norm": 1.714684617371849, "learning_rate": 1.1840000000000002e-06, "loss": 0.0876, "step": 74 }, { "epoch": 0.03602305475504323, "grad_norm": 1.3225072536309823, "learning_rate": 1.2000000000000002e-06, "loss": 0.0957, "step": 75 }, { "epoch": 0.03650336215177714, "grad_norm": 1.8975516746309764, "learning_rate": 1.216e-06, "loss": 0.1211, "step": 76 }, { "epoch": 0.036983669548511046, "grad_norm": 1.5164480525596415, "learning_rate": 1.2320000000000002e-06, "loss": 0.1117, "step": 77 }, { "epoch": 0.037463976945244955, "grad_norm": 1.8598013368901916, "learning_rate": 1.248e-06, "loss": 0.0974, "step": 78 }, { "epoch": 0.037944284341978864, "grad_norm": 1.6096443930527105, "learning_rate": 1.2640000000000003e-06, "loss": 0.1066, "step": 79 }, { "epoch": 0.03842459173871278, "grad_norm": 1.1438512252152493, "learning_rate": 1.28e-06, "loss": 0.096, "step": 80 }, { "epoch": 0.03890489913544669, "grad_norm": 1.464888807639377, "learning_rate": 1.296e-06, "loss": 0.0982, "step": 81 }, { "epoch": 0.0393852065321806, "grad_norm": 1.81847293552993, "learning_rate": 1.3120000000000003e-06, "loss": 0.1419, "step": 82 }, { "epoch": 0.039865513928914506, "grad_norm": 1.7194540925734543, "learning_rate": 1.328e-06, "loss": 0.1054, "step": 83 }, { "epoch": 0.040345821325648415, "grad_norm": 1.46339302106847, "learning_rate": 1.344e-06, "loss": 0.1246, "step": 84 }, { "epoch": 0.040826128722382324, "grad_norm": 1.6380410081431438, "learning_rate": 1.3600000000000001e-06, "loss": 0.0806, "step": 85 }, { "epoch": 0.04130643611911623, "grad_norm": 1.8756328408381668, "learning_rate": 1.376e-06, "loss": 0.1188, "step": 86 }, { "epoch": 0.04178674351585014, "grad_norm": 1.4464326251601627, "learning_rate": 1.392e-06, "loss": 0.113, "step": 87 }, { "epoch": 0.04226705091258405, "grad_norm": 1.240058007444482, "learning_rate": 1.4080000000000001e-06, "loss": 0.0934, "step": 88 }, { "epoch": 0.042747358309317966, "grad_norm": 2.3178241893688645, "learning_rate": 1.424e-06, "loss": 0.1126, "step": 89 }, { "epoch": 0.043227665706051875, "grad_norm": 1.7177471908214919, "learning_rate": 1.44e-06, "loss": 0.1345, "step": 90 }, { "epoch": 0.043707973102785784, "grad_norm": 2.7314130913960812, "learning_rate": 1.4560000000000001e-06, "loss": 0.1069, "step": 91 }, { "epoch": 0.04418828049951969, "grad_norm": 1.0979463668294076, "learning_rate": 1.472e-06, "loss": 0.0966, "step": 92 }, { "epoch": 0.0446685878962536, "grad_norm": 1.4821760121201706, "learning_rate": 1.488e-06, "loss": 0.1048, "step": 93 }, { "epoch": 0.04514889529298751, "grad_norm": 1.6721548051611312, "learning_rate": 1.5040000000000001e-06, "loss": 0.1065, "step": 94 }, { "epoch": 0.04562920268972142, "grad_norm": 1.4046486532313425, "learning_rate": 1.52e-06, "loss": 0.1121, "step": 95 }, { "epoch": 0.04610951008645533, "grad_norm": 1.8920282885751272, "learning_rate": 1.536e-06, "loss": 0.093, "step": 96 }, { "epoch": 0.046589817483189244, "grad_norm": 1.3117971937414172, "learning_rate": 1.5520000000000001e-06, "loss": 0.0763, "step": 97 }, { "epoch": 0.04707012487992315, "grad_norm": 1.2226066484010771, "learning_rate": 1.568e-06, "loss": 0.0925, "step": 98 }, { "epoch": 0.04755043227665706, "grad_norm": 1.2523216029904312, "learning_rate": 1.5840000000000002e-06, "loss": 0.0886, "step": 99 }, { "epoch": 0.04803073967339097, "grad_norm": 1.2711895556124746, "learning_rate": 1.6000000000000001e-06, "loss": 0.0871, "step": 100 }, { "epoch": 0.04851104707012488, "grad_norm": 3.040299010535842, "learning_rate": 1.616e-06, "loss": 0.0745, "step": 101 }, { "epoch": 0.04899135446685879, "grad_norm": 1.6389584028051778, "learning_rate": 1.6320000000000002e-06, "loss": 0.1278, "step": 102 }, { "epoch": 0.0494716618635927, "grad_norm": 1.5552388728614976, "learning_rate": 1.6480000000000001e-06, "loss": 0.0993, "step": 103 }, { "epoch": 0.049951969260326606, "grad_norm": 1.3887590517997233, "learning_rate": 1.664e-06, "loss": 0.0846, "step": 104 }, { "epoch": 0.05043227665706052, "grad_norm": 1.5458592978622285, "learning_rate": 1.6800000000000002e-06, "loss": 0.1046, "step": 105 }, { "epoch": 0.05091258405379443, "grad_norm": 2.9179450722754434, "learning_rate": 1.6960000000000002e-06, "loss": 0.1242, "step": 106 }, { "epoch": 0.05139289145052834, "grad_norm": 2.7394754636725693, "learning_rate": 1.712e-06, "loss": 0.0819, "step": 107 }, { "epoch": 0.05187319884726225, "grad_norm": 2.077429221247708, "learning_rate": 1.7280000000000002e-06, "loss": 0.1007, "step": 108 }, { "epoch": 0.05235350624399616, "grad_norm": 1.2362929358102204, "learning_rate": 1.7440000000000002e-06, "loss": 0.0975, "step": 109 }, { "epoch": 0.052833813640730067, "grad_norm": 1.5896189455542367, "learning_rate": 1.76e-06, "loss": 0.1118, "step": 110 }, { "epoch": 0.053314121037463975, "grad_norm": 1.8111208272150052, "learning_rate": 1.7760000000000002e-06, "loss": 0.1173, "step": 111 }, { "epoch": 0.053794428434197884, "grad_norm": 1.1388547833141291, "learning_rate": 1.7920000000000002e-06, "loss": 0.0715, "step": 112 }, { "epoch": 0.05427473583093179, "grad_norm": 1.637903840464677, "learning_rate": 1.808e-06, "loss": 0.1148, "step": 113 }, { "epoch": 0.05475504322766571, "grad_norm": 1.6505805474404067, "learning_rate": 1.8240000000000002e-06, "loss": 0.1086, "step": 114 }, { "epoch": 0.05523535062439962, "grad_norm": 2.027298880252341, "learning_rate": 1.8400000000000002e-06, "loss": 0.1137, "step": 115 }, { "epoch": 0.05571565802113353, "grad_norm": 1.9867450378269047, "learning_rate": 1.856e-06, "loss": 0.0949, "step": 116 }, { "epoch": 0.056195965417867436, "grad_norm": 1.0287961975246656, "learning_rate": 1.8720000000000002e-06, "loss": 0.0993, "step": 117 }, { "epoch": 0.056676272814601344, "grad_norm": 3.6886267076116726, "learning_rate": 1.8880000000000002e-06, "loss": 0.0734, "step": 118 }, { "epoch": 0.05715658021133525, "grad_norm": 1.4209403055185024, "learning_rate": 1.9040000000000003e-06, "loss": 0.088, "step": 119 }, { "epoch": 0.05763688760806916, "grad_norm": 1.5252469546994676, "learning_rate": 1.9200000000000003e-06, "loss": 0.1195, "step": 120 }, { "epoch": 0.05811719500480307, "grad_norm": 1.2043288229655742, "learning_rate": 1.936e-06, "loss": 0.0929, "step": 121 }, { "epoch": 0.05859750240153699, "grad_norm": 4.538691137978197, "learning_rate": 1.952e-06, "loss": 0.0773, "step": 122 }, { "epoch": 0.059077809798270896, "grad_norm": 1.6211192048515701, "learning_rate": 1.968e-06, "loss": 0.1006, "step": 123 }, { "epoch": 0.059558117195004805, "grad_norm": 1.1742642998496757, "learning_rate": 1.984e-06, "loss": 0.0682, "step": 124 }, { "epoch": 0.060038424591738714, "grad_norm": 1.3758629673989613, "learning_rate": 2.0000000000000003e-06, "loss": 0.0732, "step": 125 }, { "epoch": 0.06051873198847262, "grad_norm": 1.4595665452207671, "learning_rate": 2.0160000000000003e-06, "loss": 0.0885, "step": 126 }, { "epoch": 0.06099903938520653, "grad_norm": 1.2343001806608365, "learning_rate": 2.032e-06, "loss": 0.0871, "step": 127 }, { "epoch": 0.06147934678194044, "grad_norm": 1.4944430814568856, "learning_rate": 2.048e-06, "loss": 0.0987, "step": 128 }, { "epoch": 0.06195965417867435, "grad_norm": 1.1669826735751045, "learning_rate": 2.064e-06, "loss": 0.0828, "step": 129 }, { "epoch": 0.06243996157540826, "grad_norm": 2.050815392469096, "learning_rate": 2.08e-06, "loss": 0.1436, "step": 130 }, { "epoch": 0.06292026897214217, "grad_norm": 2.6189886490073837, "learning_rate": 2.0960000000000003e-06, "loss": 0.1121, "step": 131 }, { "epoch": 0.06340057636887608, "grad_norm": 1.465151382295218, "learning_rate": 2.1120000000000003e-06, "loss": 0.1569, "step": 132 }, { "epoch": 0.06388088376560999, "grad_norm": 1.6059405590984388, "learning_rate": 2.128e-06, "loss": 0.0891, "step": 133 }, { "epoch": 0.0643611911623439, "grad_norm": 1.3606090814821632, "learning_rate": 2.144e-06, "loss": 0.104, "step": 134 }, { "epoch": 0.06484149855907781, "grad_norm": 1.1881719471401748, "learning_rate": 2.16e-06, "loss": 0.0775, "step": 135 }, { "epoch": 0.06532180595581172, "grad_norm": 1.205747071113525, "learning_rate": 2.176e-06, "loss": 0.084, "step": 136 }, { "epoch": 0.06580211335254563, "grad_norm": 1.2301202707311236, "learning_rate": 2.1920000000000004e-06, "loss": 0.0647, "step": 137 }, { "epoch": 0.06628242074927954, "grad_norm": 1.154403895526943, "learning_rate": 2.2080000000000003e-06, "loss": 0.0884, "step": 138 }, { "epoch": 0.06676272814601344, "grad_norm": 1.1013298512945302, "learning_rate": 2.2240000000000002e-06, "loss": 0.0643, "step": 139 }, { "epoch": 0.06724303554274735, "grad_norm": 1.0932013932642193, "learning_rate": 2.24e-06, "loss": 0.092, "step": 140 }, { "epoch": 0.06772334293948126, "grad_norm": 1.0454452002652364, "learning_rate": 2.256e-06, "loss": 0.0799, "step": 141 }, { "epoch": 0.06820365033621517, "grad_norm": 1.4743566119826792, "learning_rate": 2.2720000000000004e-06, "loss": 0.1136, "step": 142 }, { "epoch": 0.0686839577329491, "grad_norm": 1.4039096246861968, "learning_rate": 2.2880000000000004e-06, "loss": 0.0909, "step": 143 }, { "epoch": 0.069164265129683, "grad_norm": 1.2657755183130568, "learning_rate": 2.3040000000000003e-06, "loss": 0.1117, "step": 144 }, { "epoch": 0.06964457252641691, "grad_norm": 1.5851847672258597, "learning_rate": 2.3200000000000002e-06, "loss": 0.0951, "step": 145 }, { "epoch": 0.07012487992315082, "grad_norm": 1.2581561491117204, "learning_rate": 2.336e-06, "loss": 0.0878, "step": 146 }, { "epoch": 0.07060518731988473, "grad_norm": 1.420979749511269, "learning_rate": 2.352e-06, "loss": 0.0936, "step": 147 }, { "epoch": 0.07108549471661864, "grad_norm": 1.3031371111764456, "learning_rate": 2.3680000000000005e-06, "loss": 0.0909, "step": 148 }, { "epoch": 0.07156580211335255, "grad_norm": 1.4573325610550967, "learning_rate": 2.3840000000000004e-06, "loss": 0.0827, "step": 149 }, { "epoch": 0.07204610951008646, "grad_norm": 1.3047911271077164, "learning_rate": 2.4000000000000003e-06, "loss": 0.0851, "step": 150 }, { "epoch": 0.07252641690682037, "grad_norm": 1.4176849309242068, "learning_rate": 2.4160000000000002e-06, "loss": 0.0831, "step": 151 }, { "epoch": 0.07300672430355427, "grad_norm": 2.5041223053166903, "learning_rate": 2.432e-06, "loss": 0.0963, "step": 152 }, { "epoch": 0.07348703170028818, "grad_norm": 1.2243042228153846, "learning_rate": 2.448e-06, "loss": 0.092, "step": 153 }, { "epoch": 0.07396733909702209, "grad_norm": 1.3595133894472542, "learning_rate": 2.4640000000000005e-06, "loss": 0.1088, "step": 154 }, { "epoch": 0.074447646493756, "grad_norm": 1.8046494765641843, "learning_rate": 2.4800000000000004e-06, "loss": 0.1067, "step": 155 }, { "epoch": 0.07492795389048991, "grad_norm": 1.3961366152512769, "learning_rate": 2.496e-06, "loss": 0.1066, "step": 156 }, { "epoch": 0.07540826128722382, "grad_norm": 1.569757837719253, "learning_rate": 2.512e-06, "loss": 0.116, "step": 157 }, { "epoch": 0.07588856868395773, "grad_norm": 1.2274453659822118, "learning_rate": 2.5280000000000006e-06, "loss": 0.0888, "step": 158 }, { "epoch": 0.07636887608069164, "grad_norm": 1.3105009460198556, "learning_rate": 2.5440000000000005e-06, "loss": 0.109, "step": 159 }, { "epoch": 0.07684918347742556, "grad_norm": 1.1721145729988776, "learning_rate": 2.56e-06, "loss": 0.0994, "step": 160 }, { "epoch": 0.07732949087415947, "grad_norm": 1.2096805266515924, "learning_rate": 2.576e-06, "loss": 0.0887, "step": 161 }, { "epoch": 0.07780979827089338, "grad_norm": 2.25466141210399, "learning_rate": 2.592e-06, "loss": 0.0977, "step": 162 }, { "epoch": 0.07829010566762729, "grad_norm": 1.2284935839699218, "learning_rate": 2.608e-06, "loss": 0.0991, "step": 163 }, { "epoch": 0.0787704130643612, "grad_norm": 1.1932140220390712, "learning_rate": 2.6240000000000006e-06, "loss": 0.0866, "step": 164 }, { "epoch": 0.0792507204610951, "grad_norm": 1.0789509823335703, "learning_rate": 2.64e-06, "loss": 0.0996, "step": 165 }, { "epoch": 0.07973102785782901, "grad_norm": 0.9371763943071141, "learning_rate": 2.656e-06, "loss": 0.0582, "step": 166 }, { "epoch": 0.08021133525456292, "grad_norm": 1.7368430972984232, "learning_rate": 2.672e-06, "loss": 0.1086, "step": 167 }, { "epoch": 0.08069164265129683, "grad_norm": 1.6755808286564804, "learning_rate": 2.688e-06, "loss": 0.0873, "step": 168 }, { "epoch": 0.08117195004803074, "grad_norm": 1.3121940443924218, "learning_rate": 2.704e-06, "loss": 0.0805, "step": 169 }, { "epoch": 0.08165225744476465, "grad_norm": 1.3271980163235704, "learning_rate": 2.7200000000000002e-06, "loss": 0.0813, "step": 170 }, { "epoch": 0.08213256484149856, "grad_norm": 1.1932639416175121, "learning_rate": 2.736e-06, "loss": 0.0823, "step": 171 }, { "epoch": 0.08261287223823247, "grad_norm": 1.1588476048524006, "learning_rate": 2.752e-06, "loss": 0.0987, "step": 172 }, { "epoch": 0.08309317963496637, "grad_norm": 0.9072215194037728, "learning_rate": 2.768e-06, "loss": 0.067, "step": 173 }, { "epoch": 0.08357348703170028, "grad_norm": 1.356010402076568, "learning_rate": 2.784e-06, "loss": 0.0868, "step": 174 }, { "epoch": 0.08405379442843419, "grad_norm": 1.2650989878632668, "learning_rate": 2.8000000000000003e-06, "loss": 0.0812, "step": 175 }, { "epoch": 0.0845341018251681, "grad_norm": 1.4802205189991515, "learning_rate": 2.8160000000000002e-06, "loss": 0.099, "step": 176 }, { "epoch": 0.08501440922190202, "grad_norm": 1.1417060742721667, "learning_rate": 2.832e-06, "loss": 0.0815, "step": 177 }, { "epoch": 0.08549471661863593, "grad_norm": 1.3705511538151176, "learning_rate": 2.848e-06, "loss": 0.1102, "step": 178 }, { "epoch": 0.08597502401536984, "grad_norm": 1.3163532047784325, "learning_rate": 2.864e-06, "loss": 0.0755, "step": 179 }, { "epoch": 0.08645533141210375, "grad_norm": 1.47238236129405, "learning_rate": 2.88e-06, "loss": 0.0949, "step": 180 }, { "epoch": 0.08693563880883766, "grad_norm": 0.9260065978895016, "learning_rate": 2.8960000000000003e-06, "loss": 0.0586, "step": 181 }, { "epoch": 0.08741594620557157, "grad_norm": 1.311717165904964, "learning_rate": 2.9120000000000002e-06, "loss": 0.0816, "step": 182 }, { "epoch": 0.08789625360230548, "grad_norm": 1.3185427170913515, "learning_rate": 2.928e-06, "loss": 0.1061, "step": 183 }, { "epoch": 0.08837656099903939, "grad_norm": 1.0037072726769718, "learning_rate": 2.944e-06, "loss": 0.062, "step": 184 }, { "epoch": 0.0888568683957733, "grad_norm": 1.0985735941649346, "learning_rate": 2.96e-06, "loss": 0.0805, "step": 185 }, { "epoch": 0.0893371757925072, "grad_norm": 1.3125670658691977, "learning_rate": 2.976e-06, "loss": 0.0931, "step": 186 }, { "epoch": 0.08981748318924111, "grad_norm": 1.0838144485116263, "learning_rate": 2.9920000000000003e-06, "loss": 0.0834, "step": 187 }, { "epoch": 0.09029779058597502, "grad_norm": 1.799900406869807, "learning_rate": 3.0080000000000003e-06, "loss": 0.0919, "step": 188 }, { "epoch": 0.09077809798270893, "grad_norm": 1.173117411096857, "learning_rate": 3.024e-06, "loss": 0.0721, "step": 189 }, { "epoch": 0.09125840537944284, "grad_norm": 1.373335051247715, "learning_rate": 3.04e-06, "loss": 0.0846, "step": 190 }, { "epoch": 0.09173871277617675, "grad_norm": 1.3981230024483795, "learning_rate": 3.056e-06, "loss": 0.1008, "step": 191 }, { "epoch": 0.09221902017291066, "grad_norm": 1.4513236326035954, "learning_rate": 3.072e-06, "loss": 0.1059, "step": 192 }, { "epoch": 0.09269932756964457, "grad_norm": 1.2186053621072894, "learning_rate": 3.0880000000000003e-06, "loss": 0.09, "step": 193 }, { "epoch": 0.09317963496637849, "grad_norm": 1.1068806010368781, "learning_rate": 3.1040000000000003e-06, "loss": 0.0718, "step": 194 }, { "epoch": 0.0936599423631124, "grad_norm": 1.237536933577925, "learning_rate": 3.12e-06, "loss": 0.0768, "step": 195 }, { "epoch": 0.0941402497598463, "grad_norm": 1.2585614650979184, "learning_rate": 3.136e-06, "loss": 0.0757, "step": 196 }, { "epoch": 0.09462055715658022, "grad_norm": 0.9595246063304191, "learning_rate": 3.152e-06, "loss": 0.0696, "step": 197 }, { "epoch": 0.09510086455331412, "grad_norm": 1.1929913387812179, "learning_rate": 3.1680000000000004e-06, "loss": 0.0867, "step": 198 }, { "epoch": 0.09558117195004803, "grad_norm": 1.0572272850541955, "learning_rate": 3.1840000000000003e-06, "loss": 0.0816, "step": 199 }, { "epoch": 0.09606147934678194, "grad_norm": 1.160822305103066, "learning_rate": 3.2000000000000003e-06, "loss": 0.085, "step": 200 }, { "epoch": 0.09654178674351585, "grad_norm": 1.3295668411317592, "learning_rate": 3.216e-06, "loss": 0.1003, "step": 201 }, { "epoch": 0.09702209414024976, "grad_norm": 1.3117059163085465, "learning_rate": 3.232e-06, "loss": 0.0814, "step": 202 }, { "epoch": 0.09750240153698367, "grad_norm": 1.4613138653883198, "learning_rate": 3.248e-06, "loss": 0.1041, "step": 203 }, { "epoch": 0.09798270893371758, "grad_norm": 1.3385478082564994, "learning_rate": 3.2640000000000004e-06, "loss": 0.0846, "step": 204 }, { "epoch": 0.09846301633045149, "grad_norm": 1.271829712055238, "learning_rate": 3.2800000000000004e-06, "loss": 0.0986, "step": 205 }, { "epoch": 0.0989433237271854, "grad_norm": 1.1382933856150044, "learning_rate": 3.2960000000000003e-06, "loss": 0.0662, "step": 206 }, { "epoch": 0.0994236311239193, "grad_norm": 1.2777612246188284, "learning_rate": 3.3120000000000002e-06, "loss": 0.0944, "step": 207 }, { "epoch": 0.09990393852065321, "grad_norm": 1.2569659513450397, "learning_rate": 3.328e-06, "loss": 0.0688, "step": 208 }, { "epoch": 0.10038424591738712, "grad_norm": 1.2019771035760043, "learning_rate": 3.344e-06, "loss": 0.092, "step": 209 }, { "epoch": 0.10086455331412104, "grad_norm": 1.0705863361886445, "learning_rate": 3.3600000000000004e-06, "loss": 0.085, "step": 210 }, { "epoch": 0.10134486071085495, "grad_norm": 1.1235244108380247, "learning_rate": 3.3760000000000004e-06, "loss": 0.0798, "step": 211 }, { "epoch": 0.10182516810758886, "grad_norm": 1.0830560530059452, "learning_rate": 3.3920000000000003e-06, "loss": 0.0745, "step": 212 }, { "epoch": 0.10230547550432277, "grad_norm": 1.2281697980226327, "learning_rate": 3.4080000000000002e-06, "loss": 0.0822, "step": 213 }, { "epoch": 0.10278578290105668, "grad_norm": 1.0293384591155774, "learning_rate": 3.424e-06, "loss": 0.0699, "step": 214 }, { "epoch": 0.10326609029779059, "grad_norm": 1.1168765812283847, "learning_rate": 3.44e-06, "loss": 0.0903, "step": 215 }, { "epoch": 0.1037463976945245, "grad_norm": 1.2232503580307106, "learning_rate": 3.4560000000000005e-06, "loss": 0.073, "step": 216 }, { "epoch": 0.1042267050912584, "grad_norm": 1.1853245402764723, "learning_rate": 3.4720000000000004e-06, "loss": 0.0804, "step": 217 }, { "epoch": 0.10470701248799232, "grad_norm": 1.0349173303797323, "learning_rate": 3.4880000000000003e-06, "loss": 0.0791, "step": 218 }, { "epoch": 0.10518731988472622, "grad_norm": 0.9123748906672083, "learning_rate": 3.5040000000000002e-06, "loss": 0.0846, "step": 219 }, { "epoch": 0.10566762728146013, "grad_norm": 1.1862034175014786, "learning_rate": 3.52e-06, "loss": 0.0968, "step": 220 }, { "epoch": 0.10614793467819404, "grad_norm": 1.2703272165951818, "learning_rate": 3.5360000000000005e-06, "loss": 0.106, "step": 221 }, { "epoch": 0.10662824207492795, "grad_norm": 0.825924476735835, "learning_rate": 3.5520000000000005e-06, "loss": 0.0722, "step": 222 }, { "epoch": 0.10710854947166186, "grad_norm": 1.0687515686341, "learning_rate": 3.5680000000000004e-06, "loss": 0.0716, "step": 223 }, { "epoch": 0.10758885686839577, "grad_norm": 1.0805753001019858, "learning_rate": 3.5840000000000003e-06, "loss": 0.0691, "step": 224 }, { "epoch": 0.10806916426512968, "grad_norm": 1.6287239686502346, "learning_rate": 3.6000000000000003e-06, "loss": 0.067, "step": 225 }, { "epoch": 0.10854947166186359, "grad_norm": 1.9814573884606366, "learning_rate": 3.616e-06, "loss": 0.0752, "step": 226 }, { "epoch": 0.10902977905859751, "grad_norm": 1.3374463525520308, "learning_rate": 3.6320000000000005e-06, "loss": 0.0876, "step": 227 }, { "epoch": 0.10951008645533142, "grad_norm": 1.452473041749981, "learning_rate": 3.6480000000000005e-06, "loss": 0.0788, "step": 228 }, { "epoch": 0.10999039385206533, "grad_norm": 0.9767761821599609, "learning_rate": 3.6640000000000004e-06, "loss": 0.059, "step": 229 }, { "epoch": 0.11047070124879924, "grad_norm": 2.0951330953014837, "learning_rate": 3.6800000000000003e-06, "loss": 0.1257, "step": 230 }, { "epoch": 0.11095100864553314, "grad_norm": 1.4760857161454637, "learning_rate": 3.6960000000000003e-06, "loss": 0.0694, "step": 231 }, { "epoch": 0.11143131604226705, "grad_norm": 1.426490577687766, "learning_rate": 3.712e-06, "loss": 0.078, "step": 232 }, { "epoch": 0.11191162343900096, "grad_norm": 1.049787450994003, "learning_rate": 3.7280000000000006e-06, "loss": 0.0679, "step": 233 }, { "epoch": 0.11239193083573487, "grad_norm": 0.8796776572576731, "learning_rate": 3.7440000000000005e-06, "loss": 0.0607, "step": 234 }, { "epoch": 0.11287223823246878, "grad_norm": 1.3931219386969078, "learning_rate": 3.7600000000000004e-06, "loss": 0.102, "step": 235 }, { "epoch": 0.11335254562920269, "grad_norm": 1.393636110259964, "learning_rate": 3.7760000000000004e-06, "loss": 0.111, "step": 236 }, { "epoch": 0.1138328530259366, "grad_norm": 0.9780090176809716, "learning_rate": 3.7920000000000003e-06, "loss": 0.0947, "step": 237 }, { "epoch": 0.1143131604226705, "grad_norm": 1.3598074761781376, "learning_rate": 3.8080000000000006e-06, "loss": 0.1047, "step": 238 }, { "epoch": 0.11479346781940442, "grad_norm": 1.413503721034128, "learning_rate": 3.824e-06, "loss": 0.0731, "step": 239 }, { "epoch": 0.11527377521613832, "grad_norm": 1.0636156845849574, "learning_rate": 3.8400000000000005e-06, "loss": 0.0947, "step": 240 }, { "epoch": 0.11575408261287223, "grad_norm": 1.214860626930102, "learning_rate": 3.856e-06, "loss": 0.0652, "step": 241 }, { "epoch": 0.11623439000960614, "grad_norm": 1.203838132160498, "learning_rate": 3.872e-06, "loss": 0.0772, "step": 242 }, { "epoch": 0.11671469740634005, "grad_norm": 1.4337623413545681, "learning_rate": 3.888e-06, "loss": 0.1038, "step": 243 }, { "epoch": 0.11719500480307397, "grad_norm": 1.8506896303135723, "learning_rate": 3.904e-06, "loss": 0.0974, "step": 244 }, { "epoch": 0.11767531219980788, "grad_norm": 1.0209624715713772, "learning_rate": 3.920000000000001e-06, "loss": 0.0668, "step": 245 }, { "epoch": 0.11815561959654179, "grad_norm": 1.0430831232311266, "learning_rate": 3.936e-06, "loss": 0.0693, "step": 246 }, { "epoch": 0.1186359269932757, "grad_norm": 0.9781386213920611, "learning_rate": 3.9520000000000004e-06, "loss": 0.0922, "step": 247 }, { "epoch": 0.11911623439000961, "grad_norm": 1.3090670400806688, "learning_rate": 3.968e-06, "loss": 0.0697, "step": 248 }, { "epoch": 0.11959654178674352, "grad_norm": 1.6565002477397985, "learning_rate": 3.984e-06, "loss": 0.1017, "step": 249 }, { "epoch": 0.12007684918347743, "grad_norm": 1.013045916484816, "learning_rate": 4.000000000000001e-06, "loss": 0.0601, "step": 250 }, { "epoch": 0.12055715658021134, "grad_norm": 0.9922526880061665, "learning_rate": 4.016e-06, "loss": 0.0567, "step": 251 }, { "epoch": 0.12103746397694524, "grad_norm": 1.1410675081208677, "learning_rate": 4.0320000000000005e-06, "loss": 0.0766, "step": 252 }, { "epoch": 0.12151777137367915, "grad_norm": 1.3175828458287309, "learning_rate": 4.048e-06, "loss": 0.1039, "step": 253 }, { "epoch": 0.12199807877041306, "grad_norm": 1.3696186687199499, "learning_rate": 4.064e-06, "loss": 0.068, "step": 254 }, { "epoch": 0.12247838616714697, "grad_norm": 1.5050717359950978, "learning_rate": 4.08e-06, "loss": 0.0797, "step": 255 }, { "epoch": 0.12295869356388088, "grad_norm": 1.3400957977202135, "learning_rate": 4.096e-06, "loss": 0.0999, "step": 256 }, { "epoch": 0.12343900096061479, "grad_norm": 1.2617471427133966, "learning_rate": 4.112000000000001e-06, "loss": 0.0822, "step": 257 }, { "epoch": 0.1239193083573487, "grad_norm": 1.267747053897931, "learning_rate": 4.128e-06, "loss": 0.0773, "step": 258 }, { "epoch": 0.12439961575408261, "grad_norm": 1.2676946090954653, "learning_rate": 4.1440000000000005e-06, "loss": 0.0754, "step": 259 }, { "epoch": 0.12487992315081652, "grad_norm": 1.2292713163126525, "learning_rate": 4.16e-06, "loss": 0.09, "step": 260 }, { "epoch": 0.12536023054755044, "grad_norm": 1.0012628385996274, "learning_rate": 4.176e-06, "loss": 0.0697, "step": 261 }, { "epoch": 0.12584053794428435, "grad_norm": 1.3874865197803345, "learning_rate": 4.192000000000001e-06, "loss": 0.0932, "step": 262 }, { "epoch": 0.12632084534101826, "grad_norm": 1.177205882683919, "learning_rate": 4.208e-06, "loss": 0.0664, "step": 263 }, { "epoch": 0.12680115273775217, "grad_norm": 1.2350489439494747, "learning_rate": 4.2240000000000006e-06, "loss": 0.0898, "step": 264 }, { "epoch": 0.12728146013448607, "grad_norm": 1.1559020122519823, "learning_rate": 4.24e-06, "loss": 0.0786, "step": 265 }, { "epoch": 0.12776176753121998, "grad_norm": 1.3539373299195447, "learning_rate": 4.256e-06, "loss": 0.0728, "step": 266 }, { "epoch": 0.1282420749279539, "grad_norm": 1.2165821416393527, "learning_rate": 4.272000000000001e-06, "loss": 0.0665, "step": 267 }, { "epoch": 0.1287223823246878, "grad_norm": 0.8998352503566164, "learning_rate": 4.288e-06, "loss": 0.0572, "step": 268 }, { "epoch": 0.1292026897214217, "grad_norm": 1.332170024000591, "learning_rate": 4.304000000000001e-06, "loss": 0.0896, "step": 269 }, { "epoch": 0.12968299711815562, "grad_norm": 0.8702695667096485, "learning_rate": 4.32e-06, "loss": 0.0579, "step": 270 }, { "epoch": 0.13016330451488953, "grad_norm": 1.1346194333159956, "learning_rate": 4.3360000000000005e-06, "loss": 0.0809, "step": 271 }, { "epoch": 0.13064361191162344, "grad_norm": 1.3242195848254044, "learning_rate": 4.352e-06, "loss": 0.0953, "step": 272 }, { "epoch": 0.13112391930835735, "grad_norm": 0.947922780884019, "learning_rate": 4.368e-06, "loss": 0.0726, "step": 273 }, { "epoch": 0.13160422670509125, "grad_norm": 0.8590229026058185, "learning_rate": 4.384000000000001e-06, "loss": 0.0859, "step": 274 }, { "epoch": 0.13208453410182516, "grad_norm": 2.0364307950864897, "learning_rate": 4.4e-06, "loss": 0.1065, "step": 275 }, { "epoch": 0.13256484149855907, "grad_norm": 0.9495154556473616, "learning_rate": 4.416000000000001e-06, "loss": 0.0539, "step": 276 }, { "epoch": 0.13304514889529298, "grad_norm": 1.274982611643209, "learning_rate": 4.432e-06, "loss": 0.0847, "step": 277 }, { "epoch": 0.1335254562920269, "grad_norm": 1.0179278688325417, "learning_rate": 4.4480000000000004e-06, "loss": 0.0781, "step": 278 }, { "epoch": 0.1340057636887608, "grad_norm": 1.0777338379659434, "learning_rate": 4.464000000000001e-06, "loss": 0.0823, "step": 279 }, { "epoch": 0.1344860710854947, "grad_norm": 1.7837833704627275, "learning_rate": 4.48e-06, "loss": 0.0809, "step": 280 }, { "epoch": 0.13496637848222862, "grad_norm": 1.1491936699966703, "learning_rate": 4.496000000000001e-06, "loss": 0.0726, "step": 281 }, { "epoch": 0.13544668587896252, "grad_norm": 1.1891782145869991, "learning_rate": 4.512e-06, "loss": 0.0872, "step": 282 }, { "epoch": 0.13592699327569643, "grad_norm": 0.8682236854456676, "learning_rate": 4.5280000000000005e-06, "loss": 0.0739, "step": 283 }, { "epoch": 0.13640730067243034, "grad_norm": 2.941844018714875, "learning_rate": 4.544000000000001e-06, "loss": 0.0862, "step": 284 }, { "epoch": 0.13688760806916425, "grad_norm": 1.1758035570138663, "learning_rate": 4.56e-06, "loss": 0.094, "step": 285 }, { "epoch": 0.1373679154658982, "grad_norm": 1.213798829069968, "learning_rate": 4.576000000000001e-06, "loss": 0.0582, "step": 286 }, { "epoch": 0.1378482228626321, "grad_norm": 1.1521708031630107, "learning_rate": 4.592e-06, "loss": 0.0961, "step": 287 }, { "epoch": 0.138328530259366, "grad_norm": 1.5167984093098619, "learning_rate": 4.608000000000001e-06, "loss": 0.0886, "step": 288 }, { "epoch": 0.13880883765609991, "grad_norm": 1.334195306364946, "learning_rate": 4.624e-06, "loss": 0.0926, "step": 289 }, { "epoch": 0.13928914505283382, "grad_norm": 1.201794873892803, "learning_rate": 4.6400000000000005e-06, "loss": 0.0714, "step": 290 }, { "epoch": 0.13976945244956773, "grad_norm": 1.1161034977945354, "learning_rate": 4.656000000000001e-06, "loss": 0.0835, "step": 291 }, { "epoch": 0.14024975984630164, "grad_norm": 1.1187497922543908, "learning_rate": 4.672e-06, "loss": 0.0768, "step": 292 }, { "epoch": 0.14073006724303555, "grad_norm": 1.135592941087255, "learning_rate": 4.688000000000001e-06, "loss": 0.0987, "step": 293 }, { "epoch": 0.14121037463976946, "grad_norm": 1.0116456228936113, "learning_rate": 4.704e-06, "loss": 0.0868, "step": 294 }, { "epoch": 0.14169068203650337, "grad_norm": 0.8988345290651037, "learning_rate": 4.7200000000000005e-06, "loss": 0.0729, "step": 295 }, { "epoch": 0.14217098943323728, "grad_norm": 1.4920548715651283, "learning_rate": 4.736000000000001e-06, "loss": 0.0689, "step": 296 }, { "epoch": 0.14265129682997119, "grad_norm": 1.1491177326584943, "learning_rate": 4.752e-06, "loss": 0.0822, "step": 297 }, { "epoch": 0.1431316042267051, "grad_norm": 1.3710614031545738, "learning_rate": 4.768000000000001e-06, "loss": 0.1054, "step": 298 }, { "epoch": 0.143611911623439, "grad_norm": 0.798935770934357, "learning_rate": 4.784e-06, "loss": 0.0609, "step": 299 }, { "epoch": 0.1440922190201729, "grad_norm": 1.0553443465463967, "learning_rate": 4.800000000000001e-06, "loss": 0.0588, "step": 300 }, { "epoch": 0.14457252641690682, "grad_norm": 0.862660611904342, "learning_rate": 4.816e-06, "loss": 0.0607, "step": 301 }, { "epoch": 0.14505283381364073, "grad_norm": 1.0888178319055062, "learning_rate": 4.8320000000000005e-06, "loss": 0.0888, "step": 302 }, { "epoch": 0.14553314121037464, "grad_norm": 0.8383673824701234, "learning_rate": 4.848000000000001e-06, "loss": 0.0474, "step": 303 }, { "epoch": 0.14601344860710855, "grad_norm": 0.946622673839637, "learning_rate": 4.864e-06, "loss": 0.0644, "step": 304 }, { "epoch": 0.14649375600384246, "grad_norm": 0.9082218680745726, "learning_rate": 4.880000000000001e-06, "loss": 0.0639, "step": 305 }, { "epoch": 0.14697406340057637, "grad_norm": 1.3359634218439083, "learning_rate": 4.896e-06, "loss": 0.0808, "step": 306 }, { "epoch": 0.14745437079731027, "grad_norm": 2.4941362749992577, "learning_rate": 4.9120000000000006e-06, "loss": 0.1029, "step": 307 }, { "epoch": 0.14793467819404418, "grad_norm": 0.8768474072222495, "learning_rate": 4.928000000000001e-06, "loss": 0.0603, "step": 308 }, { "epoch": 0.1484149855907781, "grad_norm": 1.0191777482177848, "learning_rate": 4.9440000000000004e-06, "loss": 0.1195, "step": 309 }, { "epoch": 0.148895292987512, "grad_norm": 1.3776289338364824, "learning_rate": 4.960000000000001e-06, "loss": 0.0929, "step": 310 }, { "epoch": 0.1493756003842459, "grad_norm": 1.118324563424117, "learning_rate": 4.976e-06, "loss": 0.0994, "step": 311 }, { "epoch": 0.14985590778097982, "grad_norm": 0.94374520838527, "learning_rate": 4.992e-06, "loss": 0.0641, "step": 312 }, { "epoch": 0.15033621517771373, "grad_norm": 1.2295166440217464, "learning_rate": 5.008000000000001e-06, "loss": 0.0764, "step": 313 }, { "epoch": 0.15081652257444764, "grad_norm": 1.0038813920972378, "learning_rate": 5.024e-06, "loss": 0.0894, "step": 314 }, { "epoch": 0.15129682997118155, "grad_norm": 0.9564388991346412, "learning_rate": 5.04e-06, "loss": 0.0646, "step": 315 }, { "epoch": 0.15177713736791545, "grad_norm": 1.001140276754152, "learning_rate": 5.056000000000001e-06, "loss": 0.0665, "step": 316 }, { "epoch": 0.15225744476464936, "grad_norm": 1.6640587209098594, "learning_rate": 5.072e-06, "loss": 0.0996, "step": 317 }, { "epoch": 0.15273775216138327, "grad_norm": 0.9225089106833559, "learning_rate": 5.088000000000001e-06, "loss": 0.0636, "step": 318 }, { "epoch": 0.15321805955811718, "grad_norm": 0.8945005100522845, "learning_rate": 5.104e-06, "loss": 0.0769, "step": 319 }, { "epoch": 0.15369836695485112, "grad_norm": 1.7987667845062436, "learning_rate": 5.12e-06, "loss": 0.1081, "step": 320 }, { "epoch": 0.15417867435158503, "grad_norm": 1.255685447274647, "learning_rate": 5.136e-06, "loss": 0.0614, "step": 321 }, { "epoch": 0.15465898174831894, "grad_norm": 0.9074627196357872, "learning_rate": 5.152e-06, "loss": 0.0713, "step": 322 }, { "epoch": 0.15513928914505284, "grad_norm": 0.8841836941406669, "learning_rate": 5.168000000000001e-06, "loss": 0.0644, "step": 323 }, { "epoch": 0.15561959654178675, "grad_norm": 1.119142780438519, "learning_rate": 5.184e-06, "loss": 0.1067, "step": 324 }, { "epoch": 0.15609990393852066, "grad_norm": 0.7376552219638878, "learning_rate": 5.2e-06, "loss": 0.0584, "step": 325 }, { "epoch": 0.15658021133525457, "grad_norm": 0.7093197189610518, "learning_rate": 5.216e-06, "loss": 0.067, "step": 326 }, { "epoch": 0.15706051873198848, "grad_norm": 0.9146977097269845, "learning_rate": 5.232e-06, "loss": 0.0878, "step": 327 }, { "epoch": 0.1575408261287224, "grad_norm": 0.893881736147892, "learning_rate": 5.248000000000001e-06, "loss": 0.0749, "step": 328 }, { "epoch": 0.1580211335254563, "grad_norm": 1.0133364596640484, "learning_rate": 5.264e-06, "loss": 0.0669, "step": 329 }, { "epoch": 0.1585014409221902, "grad_norm": 1.0188513118607696, "learning_rate": 5.28e-06, "loss": 0.0744, "step": 330 }, { "epoch": 0.15898174831892412, "grad_norm": 1.3769922031431308, "learning_rate": 5.296e-06, "loss": 0.0804, "step": 331 }, { "epoch": 0.15946205571565802, "grad_norm": 1.0924350131956813, "learning_rate": 5.312e-06, "loss": 0.073, "step": 332 }, { "epoch": 0.15994236311239193, "grad_norm": 1.1474380032242861, "learning_rate": 5.328000000000001e-06, "loss": 0.0748, "step": 333 }, { "epoch": 0.16042267050912584, "grad_norm": 1.2007135376136377, "learning_rate": 5.344e-06, "loss": 0.0633, "step": 334 }, { "epoch": 0.16090297790585975, "grad_norm": 0.8615599372907458, "learning_rate": 5.36e-06, "loss": 0.0565, "step": 335 }, { "epoch": 0.16138328530259366, "grad_norm": 1.239088822730045, "learning_rate": 5.376e-06, "loss": 0.0774, "step": 336 }, { "epoch": 0.16186359269932757, "grad_norm": 1.2276765585158387, "learning_rate": 5.392e-06, "loss": 0.0798, "step": 337 }, { "epoch": 0.16234390009606148, "grad_norm": 1.4151473611051535, "learning_rate": 5.408e-06, "loss": 0.0946, "step": 338 }, { "epoch": 0.1628242074927954, "grad_norm": 1.55213536433148, "learning_rate": 5.424e-06, "loss": 0.081, "step": 339 }, { "epoch": 0.1633045148895293, "grad_norm": 1.2190236388499174, "learning_rate": 5.4400000000000004e-06, "loss": 0.1009, "step": 340 }, { "epoch": 0.1637848222862632, "grad_norm": 0.9398509713441388, "learning_rate": 5.456e-06, "loss": 0.0597, "step": 341 }, { "epoch": 0.1642651296829971, "grad_norm": 2.110512888312876, "learning_rate": 5.472e-06, "loss": 0.1204, "step": 342 }, { "epoch": 0.16474543707973102, "grad_norm": 1.05455486733766, "learning_rate": 5.488e-06, "loss": 0.0927, "step": 343 }, { "epoch": 0.16522574447646493, "grad_norm": 0.6714424760276676, "learning_rate": 5.504e-06, "loss": 0.0605, "step": 344 }, { "epoch": 0.16570605187319884, "grad_norm": 0.8868751693597045, "learning_rate": 5.5200000000000005e-06, "loss": 0.0592, "step": 345 }, { "epoch": 0.16618635926993275, "grad_norm": 1.3695330733027493, "learning_rate": 5.536e-06, "loss": 0.0836, "step": 346 }, { "epoch": 0.16666666666666666, "grad_norm": 0.6064272072499131, "learning_rate": 5.552e-06, "loss": 0.051, "step": 347 }, { "epoch": 0.16714697406340057, "grad_norm": 1.1286927064113377, "learning_rate": 5.568e-06, "loss": 0.0813, "step": 348 }, { "epoch": 0.16762728146013448, "grad_norm": 2.9709430641912284, "learning_rate": 5.584e-06, "loss": 0.0718, "step": 349 }, { "epoch": 0.16810758885686838, "grad_norm": 1.1789706938875752, "learning_rate": 5.600000000000001e-06, "loss": 0.0912, "step": 350 }, { "epoch": 0.1685878962536023, "grad_norm": 1.0699393776208648, "learning_rate": 5.616e-06, "loss": 0.0832, "step": 351 }, { "epoch": 0.1690682036503362, "grad_norm": 1.4723575935028883, "learning_rate": 5.6320000000000005e-06, "loss": 0.0807, "step": 352 }, { "epoch": 0.16954851104707014, "grad_norm": 0.9417323601320572, "learning_rate": 5.648e-06, "loss": 0.0745, "step": 353 }, { "epoch": 0.17002881844380405, "grad_norm": 1.0015388420248077, "learning_rate": 5.664e-06, "loss": 0.0742, "step": 354 }, { "epoch": 0.17050912584053796, "grad_norm": 0.846437738871243, "learning_rate": 5.68e-06, "loss": 0.0738, "step": 355 }, { "epoch": 0.17098943323727187, "grad_norm": 1.0340220342143194, "learning_rate": 5.696e-06, "loss": 0.1003, "step": 356 }, { "epoch": 0.17146974063400577, "grad_norm": 1.050210067042245, "learning_rate": 5.7120000000000005e-06, "loss": 0.0944, "step": 357 }, { "epoch": 0.17195004803073968, "grad_norm": 1.1441788701822333, "learning_rate": 5.728e-06, "loss": 0.0935, "step": 358 }, { "epoch": 0.1724303554274736, "grad_norm": 0.8578595357010815, "learning_rate": 5.744e-06, "loss": 0.065, "step": 359 }, { "epoch": 0.1729106628242075, "grad_norm": 1.0827485015540093, "learning_rate": 5.76e-06, "loss": 0.061, "step": 360 }, { "epoch": 0.1733909702209414, "grad_norm": 0.9247919383729161, "learning_rate": 5.776e-06, "loss": 0.0566, "step": 361 }, { "epoch": 0.17387127761767532, "grad_norm": 0.9359918484086721, "learning_rate": 5.792000000000001e-06, "loss": 0.0918, "step": 362 }, { "epoch": 0.17435158501440923, "grad_norm": 0.6461989395520733, "learning_rate": 5.808e-06, "loss": 0.0596, "step": 363 }, { "epoch": 0.17483189241114314, "grad_norm": 1.2658049954240356, "learning_rate": 5.8240000000000005e-06, "loss": 0.0921, "step": 364 }, { "epoch": 0.17531219980787704, "grad_norm": 1.3185639772784372, "learning_rate": 5.84e-06, "loss": 0.0949, "step": 365 }, { "epoch": 0.17579250720461095, "grad_norm": 1.147542350202481, "learning_rate": 5.856e-06, "loss": 0.0644, "step": 366 }, { "epoch": 0.17627281460134486, "grad_norm": 1.2428899445451909, "learning_rate": 5.872000000000001e-06, "loss": 0.0923, "step": 367 }, { "epoch": 0.17675312199807877, "grad_norm": 1.0123606541011576, "learning_rate": 5.888e-06, "loss": 0.062, "step": 368 }, { "epoch": 0.17723342939481268, "grad_norm": 1.2676040664006363, "learning_rate": 5.9040000000000006e-06, "loss": 0.0993, "step": 369 }, { "epoch": 0.1777137367915466, "grad_norm": 1.7674197429016132, "learning_rate": 5.92e-06, "loss": 0.1028, "step": 370 }, { "epoch": 0.1781940441882805, "grad_norm": 1.3206360765520475, "learning_rate": 5.9360000000000004e-06, "loss": 0.0909, "step": 371 }, { "epoch": 0.1786743515850144, "grad_norm": 1.397633543771018, "learning_rate": 5.952e-06, "loss": 0.0922, "step": 372 }, { "epoch": 0.17915465898174832, "grad_norm": 1.4356961352059268, "learning_rate": 5.968e-06, "loss": 0.0789, "step": 373 }, { "epoch": 0.17963496637848222, "grad_norm": 1.0525284488409845, "learning_rate": 5.984000000000001e-06, "loss": 0.0883, "step": 374 }, { "epoch": 0.18011527377521613, "grad_norm": 1.6818878970224407, "learning_rate": 6e-06, "loss": 0.0957, "step": 375 }, { "epoch": 0.18059558117195004, "grad_norm": 1.0922291599097236, "learning_rate": 6.0160000000000005e-06, "loss": 0.0648, "step": 376 }, { "epoch": 0.18107588856868395, "grad_norm": 0.9542123025634616, "learning_rate": 6.032e-06, "loss": 0.0799, "step": 377 }, { "epoch": 0.18155619596541786, "grad_norm": 1.173846664966484, "learning_rate": 6.048e-06, "loss": 0.0663, "step": 378 }, { "epoch": 0.18203650336215177, "grad_norm": 0.8431515012151173, "learning_rate": 6.064000000000001e-06, "loss": 0.0746, "step": 379 }, { "epoch": 0.18251681075888568, "grad_norm": 0.8101682237804423, "learning_rate": 6.08e-06, "loss": 0.0692, "step": 380 }, { "epoch": 0.1829971181556196, "grad_norm": 0.8310188133668261, "learning_rate": 6.096000000000001e-06, "loss": 0.072, "step": 381 }, { "epoch": 0.1834774255523535, "grad_norm": 1.275506422840803, "learning_rate": 6.112e-06, "loss": 0.1017, "step": 382 }, { "epoch": 0.1839577329490874, "grad_norm": 1.2320868444882946, "learning_rate": 6.1280000000000005e-06, "loss": 0.0851, "step": 383 }, { "epoch": 0.1844380403458213, "grad_norm": 0.9892285232343796, "learning_rate": 6.144e-06, "loss": 0.0902, "step": 384 }, { "epoch": 0.18491834774255522, "grad_norm": 1.2195514742095586, "learning_rate": 6.16e-06, "loss": 0.0784, "step": 385 }, { "epoch": 0.18539865513928913, "grad_norm": 0.7206845127032119, "learning_rate": 6.176000000000001e-06, "loss": 0.0486, "step": 386 }, { "epoch": 0.18587896253602307, "grad_norm": 0.9671084909417331, "learning_rate": 6.192e-06, "loss": 0.0772, "step": 387 }, { "epoch": 0.18635926993275698, "grad_norm": 1.0054776144188855, "learning_rate": 6.2080000000000005e-06, "loss": 0.0693, "step": 388 }, { "epoch": 0.18683957732949089, "grad_norm": 0.9245309107680109, "learning_rate": 6.224e-06, "loss": 0.0739, "step": 389 }, { "epoch": 0.1873198847262248, "grad_norm": 1.0103439605117195, "learning_rate": 6.24e-06, "loss": 0.0853, "step": 390 }, { "epoch": 0.1878001921229587, "grad_norm": 0.7676096813994673, "learning_rate": 6.256000000000001e-06, "loss": 0.0765, "step": 391 }, { "epoch": 0.1882804995196926, "grad_norm": 0.9670272222035949, "learning_rate": 6.272e-06, "loss": 0.073, "step": 392 }, { "epoch": 0.18876080691642652, "grad_norm": 1.5221890168980567, "learning_rate": 6.288000000000001e-06, "loss": 0.0796, "step": 393 }, { "epoch": 0.18924111431316043, "grad_norm": 0.7006607860700146, "learning_rate": 6.304e-06, "loss": 0.0626, "step": 394 }, { "epoch": 0.18972142170989434, "grad_norm": 1.5246564977183437, "learning_rate": 6.3200000000000005e-06, "loss": 0.0722, "step": 395 }, { "epoch": 0.19020172910662825, "grad_norm": 1.9379386031947312, "learning_rate": 6.336000000000001e-06, "loss": 0.0807, "step": 396 }, { "epoch": 0.19068203650336216, "grad_norm": 1.1381971297694942, "learning_rate": 6.352e-06, "loss": 0.0882, "step": 397 }, { "epoch": 0.19116234390009607, "grad_norm": 1.051013291042444, "learning_rate": 6.368000000000001e-06, "loss": 0.0742, "step": 398 }, { "epoch": 0.19164265129682997, "grad_norm": 0.8171391283247772, "learning_rate": 6.384e-06, "loss": 0.055, "step": 399 }, { "epoch": 0.19212295869356388, "grad_norm": 0.9833424794189063, "learning_rate": 6.4000000000000006e-06, "loss": 0.0681, "step": 400 }, { "epoch": 0.1926032660902978, "grad_norm": 1.6364622173487873, "learning_rate": 6.416e-06, "loss": 0.0785, "step": 401 }, { "epoch": 0.1930835734870317, "grad_norm": 1.0615403955397404, "learning_rate": 6.432e-06, "loss": 0.0991, "step": 402 }, { "epoch": 0.1935638808837656, "grad_norm": 0.7691979544920202, "learning_rate": 6.448000000000001e-06, "loss": 0.0748, "step": 403 }, { "epoch": 0.19404418828049952, "grad_norm": 1.1926569318031923, "learning_rate": 6.464e-06, "loss": 0.0644, "step": 404 }, { "epoch": 0.19452449567723343, "grad_norm": 0.5496991194443854, "learning_rate": 6.480000000000001e-06, "loss": 0.0531, "step": 405 }, { "epoch": 0.19500480307396734, "grad_norm": 0.8326919307353674, "learning_rate": 6.496e-06, "loss": 0.0725, "step": 406 }, { "epoch": 0.19548511047070125, "grad_norm": 1.0612929395385298, "learning_rate": 6.5120000000000005e-06, "loss": 0.0786, "step": 407 }, { "epoch": 0.19596541786743515, "grad_norm": 0.718075701725119, "learning_rate": 6.528000000000001e-06, "loss": 0.0585, "step": 408 }, { "epoch": 0.19644572526416906, "grad_norm": 1.2693919034433871, "learning_rate": 6.544e-06, "loss": 0.0737, "step": 409 }, { "epoch": 0.19692603266090297, "grad_norm": 0.9667230149007086, "learning_rate": 6.560000000000001e-06, "loss": 0.0939, "step": 410 }, { "epoch": 0.19740634005763688, "grad_norm": 0.7618960818418478, "learning_rate": 6.576e-06, "loss": 0.0451, "step": 411 }, { "epoch": 0.1978866474543708, "grad_norm": 0.9832215410415592, "learning_rate": 6.592000000000001e-06, "loss": 0.0671, "step": 412 }, { "epoch": 0.1983669548511047, "grad_norm": 0.757140075019698, "learning_rate": 6.608000000000001e-06, "loss": 0.072, "step": 413 }, { "epoch": 0.1988472622478386, "grad_norm": 0.8925384905186106, "learning_rate": 6.6240000000000004e-06, "loss": 0.0603, "step": 414 }, { "epoch": 0.19932756964457252, "grad_norm": 1.1216976394709526, "learning_rate": 6.640000000000001e-06, "loss": 0.073, "step": 415 }, { "epoch": 0.19980787704130643, "grad_norm": 0.635084003555822, "learning_rate": 6.656e-06, "loss": 0.05, "step": 416 }, { "epoch": 0.20028818443804033, "grad_norm": 0.7700738336691185, "learning_rate": 6.672000000000001e-06, "loss": 0.0723, "step": 417 }, { "epoch": 0.20076849183477424, "grad_norm": 1.2073339284961622, "learning_rate": 6.688e-06, "loss": 0.0789, "step": 418 }, { "epoch": 0.20124879923150815, "grad_norm": 1.0303414284152617, "learning_rate": 6.7040000000000005e-06, "loss": 0.0942, "step": 419 }, { "epoch": 0.2017291066282421, "grad_norm": 1.4496720069236764, "learning_rate": 6.720000000000001e-06, "loss": 0.0934, "step": 420 }, { "epoch": 0.202209414024976, "grad_norm": 1.39926810164135, "learning_rate": 6.736e-06, "loss": 0.0943, "step": 421 }, { "epoch": 0.2026897214217099, "grad_norm": 0.9605472208895933, "learning_rate": 6.752000000000001e-06, "loss": 0.07, "step": 422 }, { "epoch": 0.20317002881844382, "grad_norm": 1.1676905060892429, "learning_rate": 6.768e-06, "loss": 0.0957, "step": 423 }, { "epoch": 0.20365033621517772, "grad_norm": 1.0226460242714408, "learning_rate": 6.784000000000001e-06, "loss": 0.0924, "step": 424 }, { "epoch": 0.20413064361191163, "grad_norm": 0.7672962042725741, "learning_rate": 6.800000000000001e-06, "loss": 0.0731, "step": 425 }, { "epoch": 0.20461095100864554, "grad_norm": 1.1567284889374434, "learning_rate": 6.8160000000000005e-06, "loss": 0.0783, "step": 426 }, { "epoch": 0.20509125840537945, "grad_norm": 1.2828166133488417, "learning_rate": 6.832000000000001e-06, "loss": 0.085, "step": 427 }, { "epoch": 0.20557156580211336, "grad_norm": 0.9948993350398787, "learning_rate": 6.848e-06, "loss": 0.0712, "step": 428 }, { "epoch": 0.20605187319884727, "grad_norm": 1.0013503870526177, "learning_rate": 6.864000000000001e-06, "loss": 0.0725, "step": 429 }, { "epoch": 0.20653218059558118, "grad_norm": 0.828738359422558, "learning_rate": 6.88e-06, "loss": 0.0793, "step": 430 }, { "epoch": 0.2070124879923151, "grad_norm": 1.2375082026111115, "learning_rate": 6.8960000000000006e-06, "loss": 0.0868, "step": 431 }, { "epoch": 0.207492795389049, "grad_norm": 0.9272139309771383, "learning_rate": 6.912000000000001e-06, "loss": 0.0498, "step": 432 }, { "epoch": 0.2079731027857829, "grad_norm": 1.1885661004953512, "learning_rate": 6.928e-06, "loss": 0.0644, "step": 433 }, { "epoch": 0.2084534101825168, "grad_norm": 0.9603791498315314, "learning_rate": 6.944000000000001e-06, "loss": 0.0652, "step": 434 }, { "epoch": 0.20893371757925072, "grad_norm": 1.9606166639543452, "learning_rate": 6.96e-06, "loss": 0.0883, "step": 435 }, { "epoch": 0.20941402497598463, "grad_norm": 1.042036621430657, "learning_rate": 6.976000000000001e-06, "loss": 0.0773, "step": 436 }, { "epoch": 0.20989433237271854, "grad_norm": 0.807705104762043, "learning_rate": 6.992000000000001e-06, "loss": 0.0663, "step": 437 }, { "epoch": 0.21037463976945245, "grad_norm": 0.644829498274957, "learning_rate": 7.0080000000000005e-06, "loss": 0.0532, "step": 438 }, { "epoch": 0.21085494716618636, "grad_norm": 1.3697814725304436, "learning_rate": 7.024000000000001e-06, "loss": 0.0971, "step": 439 }, { "epoch": 0.21133525456292027, "grad_norm": 0.8499179196327713, "learning_rate": 7.04e-06, "loss": 0.06, "step": 440 }, { "epoch": 0.21181556195965417, "grad_norm": 1.1863374964467173, "learning_rate": 7.056000000000001e-06, "loss": 0.0835, "step": 441 }, { "epoch": 0.21229586935638808, "grad_norm": 1.025816763411157, "learning_rate": 7.072000000000001e-06, "loss": 0.0663, "step": 442 }, { "epoch": 0.212776176753122, "grad_norm": 0.7405994529015164, "learning_rate": 7.088000000000001e-06, "loss": 0.0598, "step": 443 }, { "epoch": 0.2132564841498559, "grad_norm": 0.6945613328918526, "learning_rate": 7.104000000000001e-06, "loss": 0.0455, "step": 444 }, { "epoch": 0.2137367915465898, "grad_norm": 1.2263272059064698, "learning_rate": 7.1200000000000004e-06, "loss": 0.0892, "step": 445 }, { "epoch": 0.21421709894332372, "grad_norm": 0.8174840860440725, "learning_rate": 7.136000000000001e-06, "loss": 0.1028, "step": 446 }, { "epoch": 0.21469740634005763, "grad_norm": 0.8623153464251558, "learning_rate": 7.152e-06, "loss": 0.0851, "step": 447 }, { "epoch": 0.21517771373679154, "grad_norm": 0.8772215697627417, "learning_rate": 7.168000000000001e-06, "loss": 0.0631, "step": 448 }, { "epoch": 0.21565802113352545, "grad_norm": 1.1867765756821822, "learning_rate": 7.184000000000001e-06, "loss": 0.071, "step": 449 }, { "epoch": 0.21613832853025935, "grad_norm": 1.100610239534095, "learning_rate": 7.2000000000000005e-06, "loss": 0.0819, "step": 450 }, { "epoch": 0.21661863592699326, "grad_norm": 1.0692001780240707, "learning_rate": 7.216000000000001e-06, "loss": 0.0752, "step": 451 }, { "epoch": 0.21709894332372717, "grad_norm": 1.8257869421442983, "learning_rate": 7.232e-06, "loss": 0.0961, "step": 452 }, { "epoch": 0.21757925072046108, "grad_norm": 0.8717548117040241, "learning_rate": 7.248000000000001e-06, "loss": 0.0705, "step": 453 }, { "epoch": 0.21805955811719502, "grad_norm": 0.679515897488406, "learning_rate": 7.264000000000001e-06, "loss": 0.0595, "step": 454 }, { "epoch": 0.21853986551392893, "grad_norm": 0.895857495998295, "learning_rate": 7.280000000000001e-06, "loss": 0.0897, "step": 455 }, { "epoch": 0.21902017291066284, "grad_norm": 0.8679817923298461, "learning_rate": 7.296000000000001e-06, "loss": 0.0767, "step": 456 }, { "epoch": 0.21950048030739674, "grad_norm": 0.9327609488192921, "learning_rate": 7.3120000000000005e-06, "loss": 0.0806, "step": 457 }, { "epoch": 0.21998078770413065, "grad_norm": 0.5822697098791232, "learning_rate": 7.328000000000001e-06, "loss": 0.0568, "step": 458 }, { "epoch": 0.22046109510086456, "grad_norm": 0.8136477286314635, "learning_rate": 7.344000000000001e-06, "loss": 0.0794, "step": 459 }, { "epoch": 0.22094140249759847, "grad_norm": 0.6332136039248887, "learning_rate": 7.360000000000001e-06, "loss": 0.0576, "step": 460 }, { "epoch": 0.22142170989433238, "grad_norm": 0.9815300986796249, "learning_rate": 7.376000000000001e-06, "loss": 0.0643, "step": 461 }, { "epoch": 0.2219020172910663, "grad_norm": 0.724954981826669, "learning_rate": 7.3920000000000005e-06, "loss": 0.0788, "step": 462 }, { "epoch": 0.2223823246878002, "grad_norm": 0.9480652345961487, "learning_rate": 7.408000000000001e-06, "loss": 0.0818, "step": 463 }, { "epoch": 0.2228626320845341, "grad_norm": 1.3054685818951983, "learning_rate": 7.424e-06, "loss": 0.0919, "step": 464 }, { "epoch": 0.22334293948126802, "grad_norm": 1.0576795770417882, "learning_rate": 7.440000000000001e-06, "loss": 0.0724, "step": 465 }, { "epoch": 0.22382324687800192, "grad_norm": 0.7169105130347482, "learning_rate": 7.456000000000001e-06, "loss": 0.0516, "step": 466 }, { "epoch": 0.22430355427473583, "grad_norm": 0.9174559479694644, "learning_rate": 7.472000000000001e-06, "loss": 0.0465, "step": 467 }, { "epoch": 0.22478386167146974, "grad_norm": 1.2190080584481207, "learning_rate": 7.488000000000001e-06, "loss": 0.0813, "step": 468 }, { "epoch": 0.22526416906820365, "grad_norm": 0.8257736438539969, "learning_rate": 7.5040000000000005e-06, "loss": 0.0665, "step": 469 }, { "epoch": 0.22574447646493756, "grad_norm": 1.005516446834955, "learning_rate": 7.520000000000001e-06, "loss": 0.0591, "step": 470 }, { "epoch": 0.22622478386167147, "grad_norm": 0.9928080490901438, "learning_rate": 7.536000000000001e-06, "loss": 0.0864, "step": 471 }, { "epoch": 0.22670509125840538, "grad_norm": 1.4816159445977666, "learning_rate": 7.552000000000001e-06, "loss": 0.0794, "step": 472 }, { "epoch": 0.2271853986551393, "grad_norm": 1.758118832520925, "learning_rate": 7.568000000000001e-06, "loss": 0.0671, "step": 473 }, { "epoch": 0.2276657060518732, "grad_norm": 1.2359986745074605, "learning_rate": 7.5840000000000006e-06, "loss": 0.0862, "step": 474 }, { "epoch": 0.2281460134486071, "grad_norm": 1.065041005575724, "learning_rate": 7.600000000000001e-06, "loss": 0.0867, "step": 475 }, { "epoch": 0.228626320845341, "grad_norm": 1.1386518653125688, "learning_rate": 7.616000000000001e-06, "loss": 0.0601, "step": 476 }, { "epoch": 0.22910662824207492, "grad_norm": 0.9844587250229394, "learning_rate": 7.632e-06, "loss": 0.0646, "step": 477 }, { "epoch": 0.22958693563880883, "grad_norm": 1.2058332109509622, "learning_rate": 7.648e-06, "loss": 0.0708, "step": 478 }, { "epoch": 0.23006724303554274, "grad_norm": 1.081013183286517, "learning_rate": 7.664e-06, "loss": 0.0701, "step": 479 }, { "epoch": 0.23054755043227665, "grad_norm": 0.9951838717376329, "learning_rate": 7.680000000000001e-06, "loss": 0.0769, "step": 480 }, { "epoch": 0.23102785782901056, "grad_norm": 0.6593691281860705, "learning_rate": 7.696e-06, "loss": 0.0624, "step": 481 }, { "epoch": 0.23150816522574447, "grad_norm": 1.1041552206575123, "learning_rate": 7.712e-06, "loss": 0.0786, "step": 482 }, { "epoch": 0.23198847262247838, "grad_norm": 1.1024525564632526, "learning_rate": 7.728000000000001e-06, "loss": 0.0825, "step": 483 }, { "epoch": 0.23246878001921228, "grad_norm": 0.7562432101013394, "learning_rate": 7.744e-06, "loss": 0.0626, "step": 484 }, { "epoch": 0.2329490874159462, "grad_norm": 0.7159892284435388, "learning_rate": 7.76e-06, "loss": 0.0666, "step": 485 }, { "epoch": 0.2334293948126801, "grad_norm": 0.7637031784203081, "learning_rate": 7.776e-06, "loss": 0.0535, "step": 486 }, { "epoch": 0.23390970220941404, "grad_norm": 1.341218149155002, "learning_rate": 7.792000000000001e-06, "loss": 0.0841, "step": 487 }, { "epoch": 0.23439000960614795, "grad_norm": 1.0381162643143098, "learning_rate": 7.808e-06, "loss": 0.0522, "step": 488 }, { "epoch": 0.23487031700288186, "grad_norm": 1.1102483912786174, "learning_rate": 7.824e-06, "loss": 0.0839, "step": 489 }, { "epoch": 0.23535062439961577, "grad_norm": 1.1434606793180935, "learning_rate": 7.840000000000001e-06, "loss": 0.0691, "step": 490 }, { "epoch": 0.23583093179634967, "grad_norm": 2.1962165048575533, "learning_rate": 7.856e-06, "loss": 0.0763, "step": 491 }, { "epoch": 0.23631123919308358, "grad_norm": 0.890647664698546, "learning_rate": 7.872e-06, "loss": 0.0718, "step": 492 }, { "epoch": 0.2367915465898175, "grad_norm": 0.8203501888126637, "learning_rate": 7.888e-06, "loss": 0.0726, "step": 493 }, { "epoch": 0.2372718539865514, "grad_norm": 1.2395973157128906, "learning_rate": 7.904000000000001e-06, "loss": 0.0898, "step": 494 }, { "epoch": 0.2377521613832853, "grad_norm": 0.9214838558224117, "learning_rate": 7.92e-06, "loss": 0.0657, "step": 495 }, { "epoch": 0.23823246878001922, "grad_norm": 1.0269834431751539, "learning_rate": 7.936e-06, "loss": 0.0807, "step": 496 }, { "epoch": 0.23871277617675313, "grad_norm": 1.020546331564402, "learning_rate": 7.952000000000001e-06, "loss": 0.0756, "step": 497 }, { "epoch": 0.23919308357348704, "grad_norm": 0.8476975296985142, "learning_rate": 7.968e-06, "loss": 0.0643, "step": 498 }, { "epoch": 0.23967339097022095, "grad_norm": 1.1388832811626284, "learning_rate": 7.984e-06, "loss": 0.0849, "step": 499 }, { "epoch": 0.24015369836695485, "grad_norm": 1.0929897122086174, "learning_rate": 8.000000000000001e-06, "loss": 0.0807, "step": 500 }, { "epoch": 0.24063400576368876, "grad_norm": 1.1481077522909218, "learning_rate": 8.016e-06, "loss": 0.0809, "step": 501 }, { "epoch": 0.24111431316042267, "grad_norm": 0.65682707360465, "learning_rate": 8.032e-06, "loss": 0.0654, "step": 502 }, { "epoch": 0.24159462055715658, "grad_norm": 0.8221527194893445, "learning_rate": 8.048e-06, "loss": 0.0723, "step": 503 }, { "epoch": 0.2420749279538905, "grad_norm": 1.0897467045485394, "learning_rate": 8.064000000000001e-06, "loss": 0.0678, "step": 504 }, { "epoch": 0.2425552353506244, "grad_norm": 0.7475058575087895, "learning_rate": 8.08e-06, "loss": 0.0516, "step": 505 }, { "epoch": 0.2430355427473583, "grad_norm": 0.7036037103827971, "learning_rate": 8.096e-06, "loss": 0.0538, "step": 506 }, { "epoch": 0.24351585014409222, "grad_norm": 0.8406021606293904, "learning_rate": 8.112000000000001e-06, "loss": 0.0588, "step": 507 }, { "epoch": 0.24399615754082613, "grad_norm": 0.9956392385750449, "learning_rate": 8.128e-06, "loss": 0.0529, "step": 508 }, { "epoch": 0.24447646493756003, "grad_norm": 1.1472735203009934, "learning_rate": 8.144e-06, "loss": 0.0935, "step": 509 }, { "epoch": 0.24495677233429394, "grad_norm": 1.024997413494147, "learning_rate": 8.16e-06, "loss": 0.0776, "step": 510 }, { "epoch": 0.24543707973102785, "grad_norm": 0.6571287147481466, "learning_rate": 8.176000000000001e-06, "loss": 0.0628, "step": 511 }, { "epoch": 0.24591738712776176, "grad_norm": 1.3063911540302608, "learning_rate": 8.192e-06, "loss": 0.0717, "step": 512 }, { "epoch": 0.24639769452449567, "grad_norm": 1.4230401674267745, "learning_rate": 8.208e-06, "loss": 0.0764, "step": 513 }, { "epoch": 0.24687800192122958, "grad_norm": 0.8048952076569829, "learning_rate": 8.224000000000001e-06, "loss": 0.0571, "step": 514 }, { "epoch": 0.2473583093179635, "grad_norm": 0.6650682219974673, "learning_rate": 8.24e-06, "loss": 0.0565, "step": 515 }, { "epoch": 0.2478386167146974, "grad_norm": 1.2997870431175331, "learning_rate": 8.256e-06, "loss": 0.0837, "step": 516 }, { "epoch": 0.2483189241114313, "grad_norm": 4.337586159697626, "learning_rate": 8.272000000000001e-06, "loss": 0.0467, "step": 517 }, { "epoch": 0.24879923150816521, "grad_norm": 0.8239655704265273, "learning_rate": 8.288000000000001e-06, "loss": 0.0647, "step": 518 }, { "epoch": 0.24927953890489912, "grad_norm": 0.8494250080431497, "learning_rate": 8.304e-06, "loss": 0.0526, "step": 519 }, { "epoch": 0.24975984630163303, "grad_norm": 0.7032539553571172, "learning_rate": 8.32e-06, "loss": 0.0504, "step": 520 }, { "epoch": 0.25024015369836694, "grad_norm": 0.7697835483160133, "learning_rate": 8.336000000000001e-06, "loss": 0.0508, "step": 521 }, { "epoch": 0.2507204610951009, "grad_norm": 0.8341875541080136, "learning_rate": 8.352e-06, "loss": 0.0807, "step": 522 }, { "epoch": 0.25120076849183476, "grad_norm": 0.8726308431077691, "learning_rate": 8.368e-06, "loss": 0.0636, "step": 523 }, { "epoch": 0.2516810758885687, "grad_norm": 0.9286195806386074, "learning_rate": 8.384000000000001e-06, "loss": 0.1007, "step": 524 }, { "epoch": 0.2521613832853026, "grad_norm": 1.0477568055323367, "learning_rate": 8.400000000000001e-06, "loss": 0.0653, "step": 525 }, { "epoch": 0.2526416906820365, "grad_norm": 0.907504117294368, "learning_rate": 8.416e-06, "loss": 0.0687, "step": 526 }, { "epoch": 0.2531219980787704, "grad_norm": 0.6672546812595275, "learning_rate": 8.432e-06, "loss": 0.0833, "step": 527 }, { "epoch": 0.25360230547550433, "grad_norm": 0.8481460299512423, "learning_rate": 8.448000000000001e-06, "loss": 0.069, "step": 528 }, { "epoch": 0.2540826128722382, "grad_norm": 0.7153223957448147, "learning_rate": 8.464e-06, "loss": 0.0685, "step": 529 }, { "epoch": 0.25456292026897215, "grad_norm": 0.6977196547557283, "learning_rate": 8.48e-06, "loss": 0.0642, "step": 530 }, { "epoch": 0.25504322766570603, "grad_norm": 0.8261907792245994, "learning_rate": 8.496000000000001e-06, "loss": 0.0675, "step": 531 }, { "epoch": 0.25552353506243997, "grad_norm": 0.7198555590025666, "learning_rate": 8.512e-06, "loss": 0.0851, "step": 532 }, { "epoch": 0.25600384245917385, "grad_norm": 0.6637599827890456, "learning_rate": 8.528e-06, "loss": 0.0899, "step": 533 }, { "epoch": 0.2564841498559078, "grad_norm": 0.7512929164073144, "learning_rate": 8.544000000000002e-06, "loss": 0.0865, "step": 534 }, { "epoch": 0.25696445725264166, "grad_norm": 1.8750413633075549, "learning_rate": 8.560000000000001e-06, "loss": 0.0611, "step": 535 }, { "epoch": 0.2574447646493756, "grad_norm": 0.8162254344386675, "learning_rate": 8.576e-06, "loss": 0.082, "step": 536 }, { "epoch": 0.2579250720461095, "grad_norm": 0.7407860981655993, "learning_rate": 8.592e-06, "loss": 0.0659, "step": 537 }, { "epoch": 0.2584053794428434, "grad_norm": 0.8614219120120522, "learning_rate": 8.608000000000001e-06, "loss": 0.1007, "step": 538 }, { "epoch": 0.25888568683957736, "grad_norm": 1.2916018437852648, "learning_rate": 8.624e-06, "loss": 0.0711, "step": 539 }, { "epoch": 0.25936599423631124, "grad_norm": 1.0968982870807809, "learning_rate": 8.64e-06, "loss": 0.0494, "step": 540 }, { "epoch": 0.2598463016330452, "grad_norm": 0.8333925089939908, "learning_rate": 8.656000000000001e-06, "loss": 0.0718, "step": 541 }, { "epoch": 0.26032660902977905, "grad_norm": 1.5580346095928794, "learning_rate": 8.672000000000001e-06, "loss": 0.0728, "step": 542 }, { "epoch": 0.260806916426513, "grad_norm": 1.3177727190392534, "learning_rate": 8.688e-06, "loss": 0.068, "step": 543 }, { "epoch": 0.2612872238232469, "grad_norm": 1.1607948509472454, "learning_rate": 8.704e-06, "loss": 0.0691, "step": 544 }, { "epoch": 0.2617675312199808, "grad_norm": 0.9590351041286705, "learning_rate": 8.720000000000001e-06, "loss": 0.0613, "step": 545 }, { "epoch": 0.2622478386167147, "grad_norm": 0.830353272020343, "learning_rate": 8.736e-06, "loss": 0.0661, "step": 546 }, { "epoch": 0.2627281460134486, "grad_norm": 0.8307084029819353, "learning_rate": 8.752e-06, "loss": 0.0557, "step": 547 }, { "epoch": 0.2632084534101825, "grad_norm": 0.8137293430136122, "learning_rate": 8.768000000000001e-06, "loss": 0.0625, "step": 548 }, { "epoch": 0.26368876080691644, "grad_norm": 1.6678230574606077, "learning_rate": 8.784000000000001e-06, "loss": 0.0663, "step": 549 }, { "epoch": 0.2641690682036503, "grad_norm": 0.8131640866998433, "learning_rate": 8.8e-06, "loss": 0.0758, "step": 550 }, { "epoch": 0.26464937560038426, "grad_norm": 0.746830009216251, "learning_rate": 8.816000000000002e-06, "loss": 0.0594, "step": 551 }, { "epoch": 0.26512968299711814, "grad_norm": 1.0612071437391581, "learning_rate": 8.832000000000001e-06, "loss": 0.1148, "step": 552 }, { "epoch": 0.2656099903938521, "grad_norm": 1.0736052555532878, "learning_rate": 8.848e-06, "loss": 0.0795, "step": 553 }, { "epoch": 0.26609029779058596, "grad_norm": 1.7811521568482815, "learning_rate": 8.864e-06, "loss": 0.0708, "step": 554 }, { "epoch": 0.2665706051873199, "grad_norm": 1.1601873615680562, "learning_rate": 8.880000000000001e-06, "loss": 0.0632, "step": 555 }, { "epoch": 0.2670509125840538, "grad_norm": 0.9443323090818715, "learning_rate": 8.896000000000001e-06, "loss": 0.0726, "step": 556 }, { "epoch": 0.2675312199807877, "grad_norm": 0.751021625854935, "learning_rate": 8.912e-06, "loss": 0.0496, "step": 557 }, { "epoch": 0.2680115273775216, "grad_norm": 0.93106805297644, "learning_rate": 8.928000000000002e-06, "loss": 0.0777, "step": 558 }, { "epoch": 0.26849183477425553, "grad_norm": 0.635298715287655, "learning_rate": 8.944000000000001e-06, "loss": 0.059, "step": 559 }, { "epoch": 0.2689721421709894, "grad_norm": 0.8466300533119754, "learning_rate": 8.96e-06, "loss": 0.0788, "step": 560 }, { "epoch": 0.26945244956772335, "grad_norm": 0.7748167130306874, "learning_rate": 8.976e-06, "loss": 0.0599, "step": 561 }, { "epoch": 0.26993275696445723, "grad_norm": 0.7427736105339172, "learning_rate": 8.992000000000001e-06, "loss": 0.0614, "step": 562 }, { "epoch": 0.27041306436119117, "grad_norm": 0.8320001022304129, "learning_rate": 9.008e-06, "loss": 0.0674, "step": 563 }, { "epoch": 0.27089337175792505, "grad_norm": 0.7704318557146501, "learning_rate": 9.024e-06, "loss": 0.0702, "step": 564 }, { "epoch": 0.271373679154659, "grad_norm": 0.7407765317427476, "learning_rate": 9.040000000000002e-06, "loss": 0.0744, "step": 565 }, { "epoch": 0.27185398655139287, "grad_norm": 0.7206956721264949, "learning_rate": 9.056000000000001e-06, "loss": 0.0543, "step": 566 }, { "epoch": 0.2723342939481268, "grad_norm": 0.9451040246693897, "learning_rate": 9.072e-06, "loss": 0.0807, "step": 567 }, { "epoch": 0.2728146013448607, "grad_norm": 0.9009649199487116, "learning_rate": 9.088000000000002e-06, "loss": 0.062, "step": 568 }, { "epoch": 0.2732949087415946, "grad_norm": 1.0113937709014764, "learning_rate": 9.104000000000001e-06, "loss": 0.0648, "step": 569 }, { "epoch": 0.2737752161383285, "grad_norm": 1.0501699409202674, "learning_rate": 9.12e-06, "loss": 0.0537, "step": 570 }, { "epoch": 0.27425552353506244, "grad_norm": 0.5693860840353209, "learning_rate": 9.136e-06, "loss": 0.0448, "step": 571 }, { "epoch": 0.2747358309317964, "grad_norm": 1.0055693738637874, "learning_rate": 9.152000000000001e-06, "loss": 0.0747, "step": 572 }, { "epoch": 0.27521613832853026, "grad_norm": 1.2257150323649562, "learning_rate": 9.168000000000001e-06, "loss": 0.0971, "step": 573 }, { "epoch": 0.2756964457252642, "grad_norm": 0.8489461117348059, "learning_rate": 9.184e-06, "loss": 0.067, "step": 574 }, { "epoch": 0.2761767531219981, "grad_norm": 0.8112579026025699, "learning_rate": 9.200000000000002e-06, "loss": 0.0682, "step": 575 }, { "epoch": 0.276657060518732, "grad_norm": 0.8857859004688043, "learning_rate": 9.216000000000001e-06, "loss": 0.0799, "step": 576 }, { "epoch": 0.2771373679154659, "grad_norm": 0.7523380576301403, "learning_rate": 9.232e-06, "loss": 0.0682, "step": 577 }, { "epoch": 0.27761767531219983, "grad_norm": 0.8161110604162131, "learning_rate": 9.248e-06, "loss": 0.0713, "step": 578 }, { "epoch": 0.2780979827089337, "grad_norm": 1.2335737488024285, "learning_rate": 9.264000000000001e-06, "loss": 0.0976, "step": 579 }, { "epoch": 0.27857829010566765, "grad_norm": 0.8454243600862699, "learning_rate": 9.280000000000001e-06, "loss": 0.0606, "step": 580 }, { "epoch": 0.27905859750240153, "grad_norm": 1.2912157131289221, "learning_rate": 9.296e-06, "loss": 0.0864, "step": 581 }, { "epoch": 0.27953890489913547, "grad_norm": 1.0229081590491909, "learning_rate": 9.312000000000002e-06, "loss": 0.0914, "step": 582 }, { "epoch": 0.28001921229586935, "grad_norm": 1.206253943899943, "learning_rate": 9.328000000000001e-06, "loss": 0.0862, "step": 583 }, { "epoch": 0.2804995196926033, "grad_norm": 0.9742697094872371, "learning_rate": 9.344e-06, "loss": 0.0534, "step": 584 }, { "epoch": 0.28097982708933716, "grad_norm": 1.3994880281576707, "learning_rate": 9.360000000000002e-06, "loss": 0.0942, "step": 585 }, { "epoch": 0.2814601344860711, "grad_norm": 0.7863832094391792, "learning_rate": 9.376000000000001e-06, "loss": 0.0651, "step": 586 }, { "epoch": 0.281940441882805, "grad_norm": 1.790391699352207, "learning_rate": 9.392000000000001e-06, "loss": 0.0837, "step": 587 }, { "epoch": 0.2824207492795389, "grad_norm": 0.7167809240107046, "learning_rate": 9.408e-06, "loss": 0.0641, "step": 588 }, { "epoch": 0.2829010566762728, "grad_norm": 1.3108752667323915, "learning_rate": 9.424000000000002e-06, "loss": 0.0711, "step": 589 }, { "epoch": 0.28338136407300674, "grad_norm": 0.752914709706173, "learning_rate": 9.440000000000001e-06, "loss": 0.0584, "step": 590 }, { "epoch": 0.2838616714697406, "grad_norm": 0.8764860102695597, "learning_rate": 9.456e-06, "loss": 0.0477, "step": 591 }, { "epoch": 0.28434197886647455, "grad_norm": 0.6356131868385272, "learning_rate": 9.472000000000002e-06, "loss": 0.0522, "step": 592 }, { "epoch": 0.28482228626320844, "grad_norm": 1.263517402410837, "learning_rate": 9.488000000000001e-06, "loss": 0.0873, "step": 593 }, { "epoch": 0.28530259365994237, "grad_norm": 0.6481285314183274, "learning_rate": 9.504e-06, "loss": 0.0594, "step": 594 }, { "epoch": 0.28578290105667625, "grad_norm": 0.6520090713036953, "learning_rate": 9.52e-06, "loss": 0.0574, "step": 595 }, { "epoch": 0.2862632084534102, "grad_norm": 1.2576547381797791, "learning_rate": 9.536000000000002e-06, "loss": 0.0782, "step": 596 }, { "epoch": 0.28674351585014407, "grad_norm": 1.0281958680840946, "learning_rate": 9.552000000000001e-06, "loss": 0.0884, "step": 597 }, { "epoch": 0.287223823246878, "grad_norm": 2.0780498256230904, "learning_rate": 9.568e-06, "loss": 0.0686, "step": 598 }, { "epoch": 0.2877041306436119, "grad_norm": 0.7251320029199013, "learning_rate": 9.584000000000002e-06, "loss": 0.0667, "step": 599 }, { "epoch": 0.2881844380403458, "grad_norm": 0.9843387438340672, "learning_rate": 9.600000000000001e-06, "loss": 0.0696, "step": 600 }, { "epoch": 0.2886647454370797, "grad_norm": 1.5091693125190424, "learning_rate": 9.616e-06, "loss": 0.11, "step": 601 }, { "epoch": 0.28914505283381364, "grad_norm": 0.7606667303666816, "learning_rate": 9.632e-06, "loss": 0.0645, "step": 602 }, { "epoch": 0.2896253602305475, "grad_norm": 0.8702218966486639, "learning_rate": 9.648000000000001e-06, "loss": 0.0695, "step": 603 }, { "epoch": 0.29010566762728146, "grad_norm": 0.9692656471462159, "learning_rate": 9.664000000000001e-06, "loss": 0.071, "step": 604 }, { "epoch": 0.2905859750240154, "grad_norm": 1.152104986956867, "learning_rate": 9.68e-06, "loss": 0.0621, "step": 605 }, { "epoch": 0.2910662824207493, "grad_norm": 0.7344517356138976, "learning_rate": 9.696000000000002e-06, "loss": 0.0485, "step": 606 }, { "epoch": 0.2915465898174832, "grad_norm": 0.9928918365561824, "learning_rate": 9.712e-06, "loss": 0.0719, "step": 607 }, { "epoch": 0.2920268972142171, "grad_norm": 0.7667129001690043, "learning_rate": 9.728e-06, "loss": 0.0679, "step": 608 }, { "epoch": 0.29250720461095103, "grad_norm": 0.9950000199696363, "learning_rate": 9.744000000000002e-06, "loss": 0.0723, "step": 609 }, { "epoch": 0.2929875120076849, "grad_norm": 1.4085105451496738, "learning_rate": 9.760000000000001e-06, "loss": 0.0798, "step": 610 }, { "epoch": 0.29346781940441885, "grad_norm": 0.849429025717193, "learning_rate": 9.776000000000001e-06, "loss": 0.0573, "step": 611 }, { "epoch": 0.29394812680115273, "grad_norm": 0.8950057496274839, "learning_rate": 9.792e-06, "loss": 0.0718, "step": 612 }, { "epoch": 0.29442843419788667, "grad_norm": 0.728848993249414, "learning_rate": 9.808000000000002e-06, "loss": 0.0397, "step": 613 }, { "epoch": 0.29490874159462055, "grad_norm": 0.9156226984501676, "learning_rate": 9.824000000000001e-06, "loss": 0.0722, "step": 614 }, { "epoch": 0.2953890489913545, "grad_norm": 1.3199838058869244, "learning_rate": 9.84e-06, "loss": 0.0663, "step": 615 }, { "epoch": 0.29586935638808837, "grad_norm": 0.6241326568020441, "learning_rate": 9.856000000000002e-06, "loss": 0.0477, "step": 616 }, { "epoch": 0.2963496637848223, "grad_norm": 0.8937661400674575, "learning_rate": 9.872e-06, "loss": 0.0655, "step": 617 }, { "epoch": 0.2968299711815562, "grad_norm": 0.7018639448793483, "learning_rate": 9.888000000000001e-06, "loss": 0.0666, "step": 618 }, { "epoch": 0.2973102785782901, "grad_norm": 0.8792158375314773, "learning_rate": 9.904e-06, "loss": 0.0806, "step": 619 }, { "epoch": 0.297790585975024, "grad_norm": 0.8101543945686708, "learning_rate": 9.920000000000002e-06, "loss": 0.0628, "step": 620 }, { "epoch": 0.29827089337175794, "grad_norm": 0.6958139936512852, "learning_rate": 9.936000000000001e-06, "loss": 0.0586, "step": 621 }, { "epoch": 0.2987512007684918, "grad_norm": 0.8118776422093879, "learning_rate": 9.952e-06, "loss": 0.0696, "step": 622 }, { "epoch": 0.29923150816522576, "grad_norm": 0.801044484279383, "learning_rate": 9.968000000000002e-06, "loss": 0.0587, "step": 623 }, { "epoch": 0.29971181556195964, "grad_norm": 0.732515461824863, "learning_rate": 9.984e-06, "loss": 0.0564, "step": 624 }, { "epoch": 0.3001921229586936, "grad_norm": 0.8322604839391531, "learning_rate": 1e-05, "loss": 0.0812, "step": 625 }, { "epoch": 0.30067243035542746, "grad_norm": 0.7657461534156353, "learning_rate": 9.999999219069164e-06, "loss": 0.0553, "step": 626 }, { "epoch": 0.3011527377521614, "grad_norm": 0.96087209630369, "learning_rate": 9.999996876276899e-06, "loss": 0.0722, "step": 627 }, { "epoch": 0.3016330451488953, "grad_norm": 0.8664306047674389, "learning_rate": 9.999992971623935e-06, "loss": 0.0699, "step": 628 }, { "epoch": 0.3021133525456292, "grad_norm": 0.7586378661950982, "learning_rate": 9.999987505111493e-06, "loss": 0.0547, "step": 629 }, { "epoch": 0.3025936599423631, "grad_norm": 0.8770352678016571, "learning_rate": 9.999980476741282e-06, "loss": 0.0544, "step": 630 }, { "epoch": 0.30307396733909703, "grad_norm": 2.875494497735063, "learning_rate": 9.999971886515496e-06, "loss": 0.0706, "step": 631 }, { "epoch": 0.3035542747358309, "grad_norm": 1.2573657771022355, "learning_rate": 9.999961734436818e-06, "loss": 0.0687, "step": 632 }, { "epoch": 0.30403458213256485, "grad_norm": 0.683322107768531, "learning_rate": 9.99995002050842e-06, "loss": 0.0598, "step": 633 }, { "epoch": 0.3045148895292987, "grad_norm": 0.8277715966275165, "learning_rate": 9.99993674473396e-06, "loss": 0.0723, "step": 634 }, { "epoch": 0.30499519692603266, "grad_norm": 2.8948604349281584, "learning_rate": 9.999921907117588e-06, "loss": 0.1244, "step": 635 }, { "epoch": 0.30547550432276654, "grad_norm": 0.8638445515605259, "learning_rate": 9.999905507663936e-06, "loss": 0.0752, "step": 636 }, { "epoch": 0.3059558117195005, "grad_norm": 0.6727302040122436, "learning_rate": 9.999887546378127e-06, "loss": 0.0621, "step": 637 }, { "epoch": 0.30643611911623436, "grad_norm": 1.0952285566093234, "learning_rate": 9.999868023265772e-06, "loss": 0.095, "step": 638 }, { "epoch": 0.3069164265129683, "grad_norm": 0.957687462950951, "learning_rate": 9.99984693833297e-06, "loss": 0.0802, "step": 639 }, { "epoch": 0.30739673390970224, "grad_norm": 1.0874976971451398, "learning_rate": 9.999824291586306e-06, "loss": 0.0964, "step": 640 }, { "epoch": 0.3078770413064361, "grad_norm": 0.826343746073476, "learning_rate": 9.999800083032856e-06, "loss": 0.0553, "step": 641 }, { "epoch": 0.30835734870317005, "grad_norm": 0.8615917787952223, "learning_rate": 9.999774312680182e-06, "loss": 0.0674, "step": 642 }, { "epoch": 0.30883765609990393, "grad_norm": 0.9536454074667529, "learning_rate": 9.999746980536332e-06, "loss": 0.0754, "step": 643 }, { "epoch": 0.30931796349663787, "grad_norm": 0.8678398758854321, "learning_rate": 9.999718086609845e-06, "loss": 0.0536, "step": 644 }, { "epoch": 0.30979827089337175, "grad_norm": 0.8195920103808025, "learning_rate": 9.999687630909748e-06, "loss": 0.0676, "step": 645 }, { "epoch": 0.3102785782901057, "grad_norm": 0.833916110606186, "learning_rate": 9.999655613445552e-06, "loss": 0.0662, "step": 646 }, { "epoch": 0.31075888568683957, "grad_norm": 0.8552019334840019, "learning_rate": 9.99962203422726e-06, "loss": 0.0601, "step": 647 }, { "epoch": 0.3112391930835735, "grad_norm": 1.1244393216373334, "learning_rate": 9.99958689326536e-06, "loss": 0.0835, "step": 648 }, { "epoch": 0.3117195004803074, "grad_norm": 1.0921918261230938, "learning_rate": 9.99955019057083e-06, "loss": 0.0735, "step": 649 }, { "epoch": 0.3121998078770413, "grad_norm": 0.9476055236594563, "learning_rate": 9.999511926155135e-06, "loss": 0.0568, "step": 650 }, { "epoch": 0.3126801152737752, "grad_norm": 0.955427263476322, "learning_rate": 9.999472100030227e-06, "loss": 0.0682, "step": 651 }, { "epoch": 0.31316042267050914, "grad_norm": 0.6227107477762361, "learning_rate": 9.999430712208548e-06, "loss": 0.0515, "step": 652 }, { "epoch": 0.313640730067243, "grad_norm": 0.8819662452007495, "learning_rate": 9.999387762703025e-06, "loss": 0.0708, "step": 653 }, { "epoch": 0.31412103746397696, "grad_norm": 0.7140416262289825, "learning_rate": 9.999343251527076e-06, "loss": 0.0546, "step": 654 }, { "epoch": 0.31460134486071084, "grad_norm": 0.7721756355018631, "learning_rate": 9.999297178694603e-06, "loss": 0.0819, "step": 655 }, { "epoch": 0.3150816522574448, "grad_norm": 0.6792124284969571, "learning_rate": 9.999249544219998e-06, "loss": 0.0626, "step": 656 }, { "epoch": 0.31556195965417866, "grad_norm": 0.8965139505972938, "learning_rate": 9.999200348118142e-06, "loss": 0.0793, "step": 657 }, { "epoch": 0.3160422670509126, "grad_norm": 0.6875270287873301, "learning_rate": 9.9991495904044e-06, "loss": 0.0841, "step": 658 }, { "epoch": 0.3165225744476465, "grad_norm": 0.9499599493614728, "learning_rate": 9.999097271094632e-06, "loss": 0.0746, "step": 659 }, { "epoch": 0.3170028818443804, "grad_norm": 0.9142388374355066, "learning_rate": 9.999043390205176e-06, "loss": 0.0789, "step": 660 }, { "epoch": 0.3174831892411143, "grad_norm": 2.344335044128447, "learning_rate": 9.998987947752866e-06, "loss": 0.0992, "step": 661 }, { "epoch": 0.31796349663784823, "grad_norm": 1.318190298500077, "learning_rate": 9.99893094375502e-06, "loss": 0.0673, "step": 662 }, { "epoch": 0.3184438040345821, "grad_norm": 0.7219015180259919, "learning_rate": 9.998872378229444e-06, "loss": 0.0557, "step": 663 }, { "epoch": 0.31892411143131605, "grad_norm": 0.9108960437724677, "learning_rate": 9.998812251194432e-06, "loss": 0.0701, "step": 664 }, { "epoch": 0.31940441882804993, "grad_norm": 0.8017801457442163, "learning_rate": 9.998750562668767e-06, "loss": 0.0607, "step": 665 }, { "epoch": 0.31988472622478387, "grad_norm": 0.9355161886203067, "learning_rate": 9.99868731267172e-06, "loss": 0.0838, "step": 666 }, { "epoch": 0.32036503362151775, "grad_norm": 1.1545057767987545, "learning_rate": 9.998622501223045e-06, "loss": 0.0935, "step": 667 }, { "epoch": 0.3208453410182517, "grad_norm": 0.6776024404099317, "learning_rate": 9.998556128342989e-06, "loss": 0.0527, "step": 668 }, { "epoch": 0.32132564841498557, "grad_norm": 0.9245439985854373, "learning_rate": 9.998488194052287e-06, "loss": 0.0815, "step": 669 }, { "epoch": 0.3218059558117195, "grad_norm": 0.9108820872007813, "learning_rate": 9.998418698372156e-06, "loss": 0.0806, "step": 670 }, { "epoch": 0.3222862632084534, "grad_norm": 0.6413003064759893, "learning_rate": 9.998347641324309e-06, "loss": 0.0567, "step": 671 }, { "epoch": 0.3227665706051873, "grad_norm": 0.7124154469417349, "learning_rate": 9.998275022930937e-06, "loss": 0.0622, "step": 672 }, { "epoch": 0.32324687800192126, "grad_norm": 0.729795364251075, "learning_rate": 9.99820084321473e-06, "loss": 0.0549, "step": 673 }, { "epoch": 0.32372718539865514, "grad_norm": 0.7072881688429468, "learning_rate": 9.998125102198855e-06, "loss": 0.057, "step": 674 }, { "epoch": 0.3242074927953891, "grad_norm": 0.9404196385002046, "learning_rate": 9.998047799906972e-06, "loss": 0.0731, "step": 675 }, { "epoch": 0.32468780019212296, "grad_norm": 2.2283889506644803, "learning_rate": 9.99796893636323e-06, "loss": 0.0611, "step": 676 }, { "epoch": 0.3251681075888569, "grad_norm": 0.9898738372811073, "learning_rate": 9.997888511592262e-06, "loss": 0.075, "step": 677 }, { "epoch": 0.3256484149855908, "grad_norm": 0.5669213992706328, "learning_rate": 9.997806525619191e-06, "loss": 0.0462, "step": 678 }, { "epoch": 0.3261287223823247, "grad_norm": 0.7769207156534598, "learning_rate": 9.997722978469629e-06, "loss": 0.0598, "step": 679 }, { "epoch": 0.3266090297790586, "grad_norm": 1.14913919174941, "learning_rate": 9.997637870169673e-06, "loss": 0.0802, "step": 680 }, { "epoch": 0.3270893371757925, "grad_norm": 0.5579221498365345, "learning_rate": 9.997551200745905e-06, "loss": 0.0513, "step": 681 }, { "epoch": 0.3275696445725264, "grad_norm": 1.1622979766569141, "learning_rate": 9.997462970225402e-06, "loss": 0.0831, "step": 682 }, { "epoch": 0.32804995196926034, "grad_norm": 0.5843385711909048, "learning_rate": 9.997373178635723e-06, "loss": 0.0463, "step": 683 }, { "epoch": 0.3285302593659942, "grad_norm": 0.6003237420367146, "learning_rate": 9.997281826004919e-06, "loss": 0.0463, "step": 684 }, { "epoch": 0.32901056676272816, "grad_norm": 0.707379073392421, "learning_rate": 9.997188912361522e-06, "loss": 0.0743, "step": 685 }, { "epoch": 0.32949087415946204, "grad_norm": 0.6053436647816546, "learning_rate": 9.997094437734558e-06, "loss": 0.0586, "step": 686 }, { "epoch": 0.329971181556196, "grad_norm": 0.6165862274607972, "learning_rate": 9.99699840215354e-06, "loss": 0.0615, "step": 687 }, { "epoch": 0.33045148895292986, "grad_norm": 0.7162733421968076, "learning_rate": 9.996900805648462e-06, "loss": 0.0664, "step": 688 }, { "epoch": 0.3309317963496638, "grad_norm": 0.9782090778876608, "learning_rate": 9.996801648249815e-06, "loss": 0.0694, "step": 689 }, { "epoch": 0.3314121037463977, "grad_norm": 0.9345924479490764, "learning_rate": 9.996700929988571e-06, "loss": 0.0703, "step": 690 }, { "epoch": 0.3318924111431316, "grad_norm": 0.4641992659191584, "learning_rate": 9.996598650896191e-06, "loss": 0.0581, "step": 691 }, { "epoch": 0.3323727185398655, "grad_norm": 0.730369918229153, "learning_rate": 9.996494811004626e-06, "loss": 0.0671, "step": 692 }, { "epoch": 0.33285302593659943, "grad_norm": 0.7429226769702214, "learning_rate": 9.996389410346312e-06, "loss": 0.0603, "step": 693 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8374173727738586, "learning_rate": 9.996282448954173e-06, "loss": 0.0686, "step": 694 }, { "epoch": 0.33381364073006725, "grad_norm": 0.7712716340055917, "learning_rate": 9.99617392686162e-06, "loss": 0.0606, "step": 695 }, { "epoch": 0.33429394812680113, "grad_norm": 0.5623336642825997, "learning_rate": 9.996063844102555e-06, "loss": 0.069, "step": 696 }, { "epoch": 0.33477425552353507, "grad_norm": 0.6995521178496045, "learning_rate": 9.995952200711361e-06, "loss": 0.0621, "step": 697 }, { "epoch": 0.33525456292026895, "grad_norm": 0.9184035660471785, "learning_rate": 9.995838996722916e-06, "loss": 0.0693, "step": 698 }, { "epoch": 0.3357348703170029, "grad_norm": 0.9083685686418375, "learning_rate": 9.995724232172578e-06, "loss": 0.0619, "step": 699 }, { "epoch": 0.33621517771373677, "grad_norm": 0.7795460332437362, "learning_rate": 9.995607907096198e-06, "loss": 0.0579, "step": 700 }, { "epoch": 0.3366954851104707, "grad_norm": 0.9597351909663773, "learning_rate": 9.995490021530116e-06, "loss": 0.0635, "step": 701 }, { "epoch": 0.3371757925072046, "grad_norm": 0.777710780224125, "learning_rate": 9.995370575511151e-06, "loss": 0.0644, "step": 702 }, { "epoch": 0.3376560999039385, "grad_norm": 0.8524311441817263, "learning_rate": 9.995249569076617e-06, "loss": 0.0649, "step": 703 }, { "epoch": 0.3381364073006724, "grad_norm": 0.9788665595689388, "learning_rate": 9.995127002264313e-06, "loss": 0.0483, "step": 704 }, { "epoch": 0.33861671469740634, "grad_norm": 0.9043257189131158, "learning_rate": 9.995002875112525e-06, "loss": 0.0802, "step": 705 }, { "epoch": 0.3390970220941403, "grad_norm": 1.0447948509065825, "learning_rate": 9.994877187660028e-06, "loss": 0.1116, "step": 706 }, { "epoch": 0.33957732949087416, "grad_norm": 0.6843801841988756, "learning_rate": 9.994749939946082e-06, "loss": 0.0445, "step": 707 }, { "epoch": 0.3400576368876081, "grad_norm": 0.8386099307338503, "learning_rate": 9.994621132010439e-06, "loss": 0.0761, "step": 708 }, { "epoch": 0.340537944284342, "grad_norm": 0.8431393706231558, "learning_rate": 9.994490763893328e-06, "loss": 0.0719, "step": 709 }, { "epoch": 0.3410182516810759, "grad_norm": 0.6771024567973309, "learning_rate": 9.994358835635477e-06, "loss": 0.0568, "step": 710 }, { "epoch": 0.3414985590778098, "grad_norm": 0.8961034176669026, "learning_rate": 9.9942253472781e-06, "loss": 0.0688, "step": 711 }, { "epoch": 0.34197886647454373, "grad_norm": 1.0453002901140704, "learning_rate": 9.99409029886289e-06, "loss": 0.1101, "step": 712 }, { "epoch": 0.3424591738712776, "grad_norm": 0.786975549589651, "learning_rate": 9.993953690432032e-06, "loss": 0.0554, "step": 713 }, { "epoch": 0.34293948126801155, "grad_norm": 0.7042757673435847, "learning_rate": 9.993815522028203e-06, "loss": 0.0617, "step": 714 }, { "epoch": 0.34341978866474543, "grad_norm": 0.9449988070860224, "learning_rate": 9.993675793694558e-06, "loss": 0.0615, "step": 715 }, { "epoch": 0.34390009606147937, "grad_norm": 1.2419899707910662, "learning_rate": 9.99353450547475e-06, "loss": 0.0724, "step": 716 }, { "epoch": 0.34438040345821325, "grad_norm": 1.1846845186388448, "learning_rate": 9.993391657412908e-06, "loss": 0.0892, "step": 717 }, { "epoch": 0.3448607108549472, "grad_norm": 0.9397604288615065, "learning_rate": 9.993247249553656e-06, "loss": 0.0561, "step": 718 }, { "epoch": 0.34534101825168106, "grad_norm": 0.6412293473440506, "learning_rate": 9.993101281942103e-06, "loss": 0.0506, "step": 719 }, { "epoch": 0.345821325648415, "grad_norm": 0.6410026176064426, "learning_rate": 9.992953754623847e-06, "loss": 0.0556, "step": 720 }, { "epoch": 0.3463016330451489, "grad_norm": 0.9400115622092615, "learning_rate": 9.992804667644969e-06, "loss": 0.0713, "step": 721 }, { "epoch": 0.3467819404418828, "grad_norm": 0.8307344024344647, "learning_rate": 9.99265402105204e-06, "loss": 0.0749, "step": 722 }, { "epoch": 0.3472622478386167, "grad_norm": 0.6264072104068256, "learning_rate": 9.992501814892118e-06, "loss": 0.0572, "step": 723 }, { "epoch": 0.34774255523535064, "grad_norm": 0.8873301693627772, "learning_rate": 9.99234804921275e-06, "loss": 0.0636, "step": 724 }, { "epoch": 0.3482228626320845, "grad_norm": 0.5551682372717668, "learning_rate": 9.992192724061965e-06, "loss": 0.0614, "step": 725 }, { "epoch": 0.34870317002881845, "grad_norm": 0.6741323301483185, "learning_rate": 9.992035839488283e-06, "loss": 0.0598, "step": 726 }, { "epoch": 0.34918347742555234, "grad_norm": 0.7478055633816404, "learning_rate": 9.991877395540714e-06, "loss": 0.0698, "step": 727 }, { "epoch": 0.34966378482228627, "grad_norm": 0.7204195910532974, "learning_rate": 9.991717392268747e-06, "loss": 0.0684, "step": 728 }, { "epoch": 0.35014409221902015, "grad_norm": 0.680949486584263, "learning_rate": 9.991555829722363e-06, "loss": 0.0723, "step": 729 }, { "epoch": 0.3506243996157541, "grad_norm": 0.6473589893605443, "learning_rate": 9.991392707952032e-06, "loss": 0.0595, "step": 730 }, { "epoch": 0.35110470701248797, "grad_norm": 0.9261456172897032, "learning_rate": 9.991228027008708e-06, "loss": 0.0842, "step": 731 }, { "epoch": 0.3515850144092219, "grad_norm": 1.0662113215148616, "learning_rate": 9.991061786943832e-06, "loss": 0.0724, "step": 732 }, { "epoch": 0.3520653218059558, "grad_norm": 0.9090716830219397, "learning_rate": 9.990893987809334e-06, "loss": 0.0574, "step": 733 }, { "epoch": 0.3525456292026897, "grad_norm": 0.5908937589147102, "learning_rate": 9.990724629657628e-06, "loss": 0.0514, "step": 734 }, { "epoch": 0.3530259365994236, "grad_norm": 1.0249840050472094, "learning_rate": 9.990553712541617e-06, "loss": 0.0838, "step": 735 }, { "epoch": 0.35350624399615754, "grad_norm": 0.7254263684691535, "learning_rate": 9.990381236514694e-06, "loss": 0.0568, "step": 736 }, { "epoch": 0.3539865513928914, "grad_norm": 0.7557297240778922, "learning_rate": 9.99020720163073e-06, "loss": 0.0613, "step": 737 }, { "epoch": 0.35446685878962536, "grad_norm": 0.7212710346277188, "learning_rate": 9.990031607944095e-06, "loss": 0.0615, "step": 738 }, { "epoch": 0.3549471661863593, "grad_norm": 0.5429849169379788, "learning_rate": 9.989854455509636e-06, "loss": 0.0508, "step": 739 }, { "epoch": 0.3554274735830932, "grad_norm": 0.790313789150935, "learning_rate": 9.98967574438269e-06, "loss": 0.0607, "step": 740 }, { "epoch": 0.3559077809798271, "grad_norm": 0.6588326074450415, "learning_rate": 9.989495474619084e-06, "loss": 0.0501, "step": 741 }, { "epoch": 0.356388088376561, "grad_norm": 1.0216588961310469, "learning_rate": 9.989313646275127e-06, "loss": 0.0817, "step": 742 }, { "epoch": 0.35686839577329493, "grad_norm": 1.2144514747608683, "learning_rate": 9.989130259407617e-06, "loss": 0.095, "step": 743 }, { "epoch": 0.3573487031700288, "grad_norm": 0.8230958178941028, "learning_rate": 9.988945314073842e-06, "loss": 0.0675, "step": 744 }, { "epoch": 0.35782901056676275, "grad_norm": 0.5442331145018258, "learning_rate": 9.988758810331572e-06, "loss": 0.0472, "step": 745 }, { "epoch": 0.35830931796349663, "grad_norm": 0.9420681725327574, "learning_rate": 9.988570748239062e-06, "loss": 0.0609, "step": 746 }, { "epoch": 0.35878962536023057, "grad_norm": 0.7063636013049791, "learning_rate": 9.988381127855063e-06, "loss": 0.0612, "step": 747 }, { "epoch": 0.35926993275696445, "grad_norm": 0.8868618317047221, "learning_rate": 9.988189949238804e-06, "loss": 0.0604, "step": 748 }, { "epoch": 0.3597502401536984, "grad_norm": 0.7054240383568051, "learning_rate": 9.987997212450007e-06, "loss": 0.0641, "step": 749 }, { "epoch": 0.36023054755043227, "grad_norm": 0.7475702380405793, "learning_rate": 9.987802917548874e-06, "loss": 0.0554, "step": 750 }, { "epoch": 0.3607108549471662, "grad_norm": 0.8420433761309497, "learning_rate": 9.9876070645961e-06, "loss": 0.0646, "step": 751 }, { "epoch": 0.3611911623439001, "grad_norm": 1.151439713731852, "learning_rate": 9.98740965365286e-06, "loss": 0.0894, "step": 752 }, { "epoch": 0.361671469740634, "grad_norm": 0.9861736223093036, "learning_rate": 9.987210684780826e-06, "loss": 0.0653, "step": 753 }, { "epoch": 0.3621517771373679, "grad_norm": 1.023603091918796, "learning_rate": 9.987010158042145e-06, "loss": 0.0716, "step": 754 }, { "epoch": 0.36263208453410184, "grad_norm": 0.7427517829264801, "learning_rate": 9.986808073499459e-06, "loss": 0.0588, "step": 755 }, { "epoch": 0.3631123919308357, "grad_norm": 1.0002779227618477, "learning_rate": 9.98660443121589e-06, "loss": 0.0601, "step": 756 }, { "epoch": 0.36359269932756966, "grad_norm": 0.7758536271834264, "learning_rate": 9.986399231255057e-06, "loss": 0.0629, "step": 757 }, { "epoch": 0.36407300672430354, "grad_norm": 0.5275682172040984, "learning_rate": 9.98619247368105e-06, "loss": 0.0591, "step": 758 }, { "epoch": 0.3645533141210375, "grad_norm": 0.8592188357318262, "learning_rate": 9.985984158558462e-06, "loss": 0.0686, "step": 759 }, { "epoch": 0.36503362151777136, "grad_norm": 0.6429739999465831, "learning_rate": 9.985774285952362e-06, "loss": 0.0875, "step": 760 }, { "epoch": 0.3655139289145053, "grad_norm": 0.9068989203363063, "learning_rate": 9.985562855928309e-06, "loss": 0.0728, "step": 761 }, { "epoch": 0.3659942363112392, "grad_norm": 0.8958890702630246, "learning_rate": 9.985349868552343e-06, "loss": 0.0762, "step": 762 }, { "epoch": 0.3664745437079731, "grad_norm": 0.9260397876251805, "learning_rate": 9.985135323891002e-06, "loss": 0.0639, "step": 763 }, { "epoch": 0.366954851104707, "grad_norm": 0.46145582448125044, "learning_rate": 9.984919222011301e-06, "loss": 0.0392, "step": 764 }, { "epoch": 0.36743515850144093, "grad_norm": 0.7368863607093822, "learning_rate": 9.984701562980745e-06, "loss": 0.0675, "step": 765 }, { "epoch": 0.3679154658981748, "grad_norm": 0.8374165642976674, "learning_rate": 9.984482346867325e-06, "loss": 0.0724, "step": 766 }, { "epoch": 0.36839577329490875, "grad_norm": 0.5004955641611937, "learning_rate": 9.984261573739515e-06, "loss": 0.0559, "step": 767 }, { "epoch": 0.3688760806916426, "grad_norm": 0.6896996348098048, "learning_rate": 9.984039243666284e-06, "loss": 0.0683, "step": 768 }, { "epoch": 0.36935638808837656, "grad_norm": 0.4917654383061967, "learning_rate": 9.983815356717075e-06, "loss": 0.0509, "step": 769 }, { "epoch": 0.36983669548511044, "grad_norm": 0.6841919810633339, "learning_rate": 9.983589912961828e-06, "loss": 0.0693, "step": 770 }, { "epoch": 0.3703170028818444, "grad_norm": 0.6557223578797912, "learning_rate": 9.983362912470967e-06, "loss": 0.064, "step": 771 }, { "epoch": 0.37079731027857826, "grad_norm": 0.6242477698953485, "learning_rate": 9.983134355315397e-06, "loss": 0.0587, "step": 772 }, { "epoch": 0.3712776176753122, "grad_norm": 0.5988595426851467, "learning_rate": 9.982904241566515e-06, "loss": 0.0634, "step": 773 }, { "epoch": 0.37175792507204614, "grad_norm": 0.6754592703701289, "learning_rate": 9.982672571296201e-06, "loss": 0.0629, "step": 774 }, { "epoch": 0.37223823246878, "grad_norm": 0.6078844744556268, "learning_rate": 9.982439344576824e-06, "loss": 0.0508, "step": 775 }, { "epoch": 0.37271853986551395, "grad_norm": 0.6857139355602675, "learning_rate": 9.982204561481237e-06, "loss": 0.0466, "step": 776 }, { "epoch": 0.37319884726224783, "grad_norm": 0.6033713609538075, "learning_rate": 9.981968222082778e-06, "loss": 0.0537, "step": 777 }, { "epoch": 0.37367915465898177, "grad_norm": 0.5844207725067214, "learning_rate": 9.981730326455275e-06, "loss": 0.0434, "step": 778 }, { "epoch": 0.37415946205571565, "grad_norm": 0.9540307943992555, "learning_rate": 9.98149087467304e-06, "loss": 0.0525, "step": 779 }, { "epoch": 0.3746397694524496, "grad_norm": 0.7214661073713216, "learning_rate": 9.98124986681087e-06, "loss": 0.0743, "step": 780 }, { "epoch": 0.37512007684918347, "grad_norm": 0.6813082935520749, "learning_rate": 9.981007302944048e-06, "loss": 0.0826, "step": 781 }, { "epoch": 0.3756003842459174, "grad_norm": 0.5582899213242017, "learning_rate": 9.980763183148347e-06, "loss": 0.0603, "step": 782 }, { "epoch": 0.3760806916426513, "grad_norm": 0.5713120406122318, "learning_rate": 9.980517507500023e-06, "loss": 0.0418, "step": 783 }, { "epoch": 0.3765609990393852, "grad_norm": 0.5631392163416006, "learning_rate": 9.980270276075816e-06, "loss": 0.076, "step": 784 }, { "epoch": 0.3770413064361191, "grad_norm": 0.6842884389848988, "learning_rate": 9.980021488952957e-06, "loss": 0.0555, "step": 785 }, { "epoch": 0.37752161383285304, "grad_norm": 0.6012427638288268, "learning_rate": 9.979771146209159e-06, "loss": 0.0581, "step": 786 }, { "epoch": 0.3780019212295869, "grad_norm": 0.6380907947348216, "learning_rate": 9.97951924792262e-06, "loss": 0.0558, "step": 787 }, { "epoch": 0.37848222862632086, "grad_norm": 0.4499455933552752, "learning_rate": 9.979265794172029e-06, "loss": 0.045, "step": 788 }, { "epoch": 0.37896253602305474, "grad_norm": 0.9207107334074435, "learning_rate": 9.979010785036557e-06, "loss": 0.0746, "step": 789 }, { "epoch": 0.3794428434197887, "grad_norm": 0.6001083473477673, "learning_rate": 9.978754220595861e-06, "loss": 0.0578, "step": 790 }, { "epoch": 0.37992315081652256, "grad_norm": 0.5596962433628107, "learning_rate": 9.978496100930086e-06, "loss": 0.0584, "step": 791 }, { "epoch": 0.3804034582132565, "grad_norm": 0.5021012713607019, "learning_rate": 9.978236426119862e-06, "loss": 0.0538, "step": 792 }, { "epoch": 0.3808837656099904, "grad_norm": 0.5102988447573241, "learning_rate": 9.977975196246302e-06, "loss": 0.0498, "step": 793 }, { "epoch": 0.3813640730067243, "grad_norm": 0.6861671569949817, "learning_rate": 9.97771241139101e-06, "loss": 0.0644, "step": 794 }, { "epoch": 0.3818443804034582, "grad_norm": 0.6357301541893875, "learning_rate": 9.977448071636068e-06, "loss": 0.0601, "step": 795 }, { "epoch": 0.38232468780019213, "grad_norm": 0.4024807592618827, "learning_rate": 9.977182177064053e-06, "loss": 0.0379, "step": 796 }, { "epoch": 0.382804995196926, "grad_norm": 1.1288610789705666, "learning_rate": 9.97691472775802e-06, "loss": 0.071, "step": 797 }, { "epoch": 0.38328530259365995, "grad_norm": 0.6469760760858952, "learning_rate": 9.976645723801515e-06, "loss": 0.0654, "step": 798 }, { "epoch": 0.38376560999039383, "grad_norm": 0.5560215918537325, "learning_rate": 9.976375165278567e-06, "loss": 0.0512, "step": 799 }, { "epoch": 0.38424591738712777, "grad_norm": 0.7975603385140326, "learning_rate": 9.976103052273689e-06, "loss": 0.0883, "step": 800 }, { "epoch": 0.38472622478386165, "grad_norm": 0.7728034217886376, "learning_rate": 9.975829384871884e-06, "loss": 0.072, "step": 801 }, { "epoch": 0.3852065321805956, "grad_norm": 0.7767874847251652, "learning_rate": 9.975554163158636e-06, "loss": 0.0464, "step": 802 }, { "epoch": 0.38568683957732947, "grad_norm": 0.6266192997429625, "learning_rate": 9.975277387219919e-06, "loss": 0.0528, "step": 803 }, { "epoch": 0.3861671469740634, "grad_norm": 0.7250311644638058, "learning_rate": 9.97499905714219e-06, "loss": 0.0686, "step": 804 }, { "epoch": 0.3866474543707973, "grad_norm": 1.1253149275559908, "learning_rate": 9.974719173012388e-06, "loss": 0.049, "step": 805 }, { "epoch": 0.3871277617675312, "grad_norm": 0.6222049434177256, "learning_rate": 9.974437734917945e-06, "loss": 0.0653, "step": 806 }, { "epoch": 0.38760806916426516, "grad_norm": 0.6636209039456584, "learning_rate": 9.974154742946775e-06, "loss": 0.0619, "step": 807 }, { "epoch": 0.38808837656099904, "grad_norm": 1.347074867858616, "learning_rate": 9.973870197187272e-06, "loss": 0.0659, "step": 808 }, { "epoch": 0.388568683957733, "grad_norm": 0.685156022786215, "learning_rate": 9.973584097728325e-06, "loss": 0.0545, "step": 809 }, { "epoch": 0.38904899135446686, "grad_norm": 0.8480519439599661, "learning_rate": 9.973296444659301e-06, "loss": 0.0659, "step": 810 }, { "epoch": 0.3895292987512008, "grad_norm": 0.4931591852346543, "learning_rate": 9.973007238070057e-06, "loss": 0.0543, "step": 811 }, { "epoch": 0.3900096061479347, "grad_norm": 0.410538917518561, "learning_rate": 9.97271647805093e-06, "loss": 0.0356, "step": 812 }, { "epoch": 0.3904899135446686, "grad_norm": 0.6183835853418933, "learning_rate": 9.972424164692748e-06, "loss": 0.0548, "step": 813 }, { "epoch": 0.3909702209414025, "grad_norm": 0.5518914943760818, "learning_rate": 9.972130298086821e-06, "loss": 0.0615, "step": 814 }, { "epoch": 0.3914505283381364, "grad_norm": 0.48864613714652044, "learning_rate": 9.971834878324944e-06, "loss": 0.0505, "step": 815 }, { "epoch": 0.3919308357348703, "grad_norm": 0.5320681967319949, "learning_rate": 9.971537905499397e-06, "loss": 0.0501, "step": 816 }, { "epoch": 0.39241114313160425, "grad_norm": 0.45876250806739416, "learning_rate": 9.971239379702951e-06, "loss": 0.0385, "step": 817 }, { "epoch": 0.3928914505283381, "grad_norm": 0.8069704061567374, "learning_rate": 9.970939301028853e-06, "loss": 0.0805, "step": 818 }, { "epoch": 0.39337175792507206, "grad_norm": 0.8977429261025726, "learning_rate": 9.970637669570838e-06, "loss": 0.0594, "step": 819 }, { "epoch": 0.39385206532180594, "grad_norm": 0.509233161828096, "learning_rate": 9.97033448542313e-06, "loss": 0.0577, "step": 820 }, { "epoch": 0.3943323727185399, "grad_norm": 0.5471204223282794, "learning_rate": 9.970029748680437e-06, "loss": 0.0433, "step": 821 }, { "epoch": 0.39481268011527376, "grad_norm": 0.8632034689344821, "learning_rate": 9.969723459437945e-06, "loss": 0.0597, "step": 822 }, { "epoch": 0.3952929875120077, "grad_norm": 0.9564865645949854, "learning_rate": 9.969415617791336e-06, "loss": 0.0627, "step": 823 }, { "epoch": 0.3957732949087416, "grad_norm": 0.7450452763817546, "learning_rate": 9.969106223836766e-06, "loss": 0.0547, "step": 824 }, { "epoch": 0.3962536023054755, "grad_norm": 0.5891613173133876, "learning_rate": 9.968795277670886e-06, "loss": 0.0482, "step": 825 }, { "epoch": 0.3967339097022094, "grad_norm": 0.5741708509441278, "learning_rate": 9.968482779390824e-06, "loss": 0.0699, "step": 826 }, { "epoch": 0.39721421709894333, "grad_norm": 0.5752534750406278, "learning_rate": 9.968168729094197e-06, "loss": 0.0513, "step": 827 }, { "epoch": 0.3976945244956772, "grad_norm": 0.5909076992409328, "learning_rate": 9.967853126879103e-06, "loss": 0.0584, "step": 828 }, { "epoch": 0.39817483189241115, "grad_norm": 0.5478980970759338, "learning_rate": 9.967535972844131e-06, "loss": 0.0578, "step": 829 }, { "epoch": 0.39865513928914503, "grad_norm": 0.43802378334815034, "learning_rate": 9.96721726708835e-06, "loss": 0.047, "step": 830 }, { "epoch": 0.39913544668587897, "grad_norm": 0.5964278773959507, "learning_rate": 9.966897009711314e-06, "loss": 0.0553, "step": 831 }, { "epoch": 0.39961575408261285, "grad_norm": 0.8075938997085979, "learning_rate": 9.966575200813064e-06, "loss": 0.0671, "step": 832 }, { "epoch": 0.4000960614793468, "grad_norm": 0.5724293589628411, "learning_rate": 9.966251840494123e-06, "loss": 0.0565, "step": 833 }, { "epoch": 0.40057636887608067, "grad_norm": 0.8710191021054293, "learning_rate": 9.965926928855498e-06, "loss": 0.0723, "step": 834 }, { "epoch": 0.4010566762728146, "grad_norm": 0.5363225318409294, "learning_rate": 9.965600465998686e-06, "loss": 0.0458, "step": 835 }, { "epoch": 0.4015369836695485, "grad_norm": 0.7851083772033061, "learning_rate": 9.965272452025666e-06, "loss": 0.044, "step": 836 }, { "epoch": 0.4020172910662824, "grad_norm": 0.7693177422501187, "learning_rate": 9.964942887038893e-06, "loss": 0.0475, "step": 837 }, { "epoch": 0.4024975984630163, "grad_norm": 0.7987401635767456, "learning_rate": 9.964611771141322e-06, "loss": 0.0651, "step": 838 }, { "epoch": 0.40297790585975024, "grad_norm": 0.8276640756002739, "learning_rate": 9.96427910443638e-06, "loss": 0.0785, "step": 839 }, { "epoch": 0.4034582132564842, "grad_norm": 0.6809367893973843, "learning_rate": 9.963944887027985e-06, "loss": 0.0715, "step": 840 }, { "epoch": 0.40393852065321806, "grad_norm": 0.6804973064252685, "learning_rate": 9.963609119020538e-06, "loss": 0.0585, "step": 841 }, { "epoch": 0.404418828049952, "grad_norm": 0.5534861907234978, "learning_rate": 9.963271800518921e-06, "loss": 0.0535, "step": 842 }, { "epoch": 0.4048991354466859, "grad_norm": 1.0547311716146044, "learning_rate": 9.962932931628504e-06, "loss": 0.0651, "step": 843 }, { "epoch": 0.4053794428434198, "grad_norm": 0.6295639846864554, "learning_rate": 9.96259251245514e-06, "loss": 0.0568, "step": 844 }, { "epoch": 0.4058597502401537, "grad_norm": 0.5719452618102057, "learning_rate": 9.962250543105167e-06, "loss": 0.0418, "step": 845 }, { "epoch": 0.40634005763688763, "grad_norm": 0.7395158167833543, "learning_rate": 9.961907023685407e-06, "loss": 0.0567, "step": 846 }, { "epoch": 0.4068203650336215, "grad_norm": 0.5391369878497289, "learning_rate": 9.961561954303164e-06, "loss": 0.0506, "step": 847 }, { "epoch": 0.40730067243035545, "grad_norm": 0.8066572923047143, "learning_rate": 9.961215335066232e-06, "loss": 0.0883, "step": 848 }, { "epoch": 0.40778097982708933, "grad_norm": 0.5151043209208244, "learning_rate": 9.960867166082884e-06, "loss": 0.044, "step": 849 }, { "epoch": 0.40826128722382327, "grad_norm": 0.6317828023573119, "learning_rate": 9.960517447461875e-06, "loss": 0.0468, "step": 850 }, { "epoch": 0.40874159462055715, "grad_norm": 0.5968432172723486, "learning_rate": 9.96016617931245e-06, "loss": 0.05, "step": 851 }, { "epoch": 0.4092219020172911, "grad_norm": 0.48351796440258127, "learning_rate": 9.959813361744337e-06, "loss": 0.0422, "step": 852 }, { "epoch": 0.40970220941402496, "grad_norm": 0.45514792871791687, "learning_rate": 9.959458994867744e-06, "loss": 0.052, "step": 853 }, { "epoch": 0.4101825168107589, "grad_norm": 0.8035467442104323, "learning_rate": 9.959103078793364e-06, "loss": 0.0897, "step": 854 }, { "epoch": 0.4106628242074928, "grad_norm": 0.7097960300705127, "learning_rate": 9.95874561363238e-06, "loss": 0.0551, "step": 855 }, { "epoch": 0.4111431316042267, "grad_norm": 0.40391671652511296, "learning_rate": 9.95838659949645e-06, "loss": 0.0414, "step": 856 }, { "epoch": 0.4116234390009606, "grad_norm": 0.6170051420564876, "learning_rate": 9.958026036497723e-06, "loss": 0.0559, "step": 857 }, { "epoch": 0.41210374639769454, "grad_norm": 0.7115787557160136, "learning_rate": 9.957663924748828e-06, "loss": 0.0565, "step": 858 }, { "epoch": 0.4125840537944284, "grad_norm": 0.8705757145217116, "learning_rate": 9.957300264362878e-06, "loss": 0.0674, "step": 859 }, { "epoch": 0.41306436119116235, "grad_norm": 0.5500919289906383, "learning_rate": 9.95693505545347e-06, "loss": 0.0621, "step": 860 }, { "epoch": 0.41354466858789624, "grad_norm": 0.660836487601902, "learning_rate": 9.956568298134687e-06, "loss": 0.0632, "step": 861 }, { "epoch": 0.4140249759846302, "grad_norm": 0.9931232470867721, "learning_rate": 9.956199992521092e-06, "loss": 0.0653, "step": 862 }, { "epoch": 0.41450528338136405, "grad_norm": 0.7024061823998546, "learning_rate": 9.955830138727736e-06, "loss": 0.0593, "step": 863 }, { "epoch": 0.414985590778098, "grad_norm": 0.4764259054817133, "learning_rate": 9.955458736870148e-06, "loss": 0.0526, "step": 864 }, { "epoch": 0.41546589817483187, "grad_norm": 0.7066580129944023, "learning_rate": 9.955085787064344e-06, "loss": 0.0541, "step": 865 }, { "epoch": 0.4159462055715658, "grad_norm": 0.665310242989133, "learning_rate": 9.954711289426826e-06, "loss": 0.061, "step": 866 }, { "epoch": 0.4164265129682997, "grad_norm": 0.5484058455441252, "learning_rate": 9.954335244074575e-06, "loss": 0.0459, "step": 867 }, { "epoch": 0.4169068203650336, "grad_norm": 0.5666272682660084, "learning_rate": 9.953957651125056e-06, "loss": 0.0557, "step": 868 }, { "epoch": 0.4173871277617675, "grad_norm": 0.8668633540680836, "learning_rate": 9.95357851069622e-06, "loss": 0.0768, "step": 869 }, { "epoch": 0.41786743515850144, "grad_norm": 0.5410731748495422, "learning_rate": 9.9531978229065e-06, "loss": 0.0627, "step": 870 }, { "epoch": 0.4183477425552353, "grad_norm": 0.5775107068931661, "learning_rate": 9.952815587874811e-06, "loss": 0.0496, "step": 871 }, { "epoch": 0.41882804995196926, "grad_norm": 0.690239624690725, "learning_rate": 9.952431805720555e-06, "loss": 0.0751, "step": 872 }, { "epoch": 0.41930835734870314, "grad_norm": 0.49376413841588845, "learning_rate": 9.952046476563614e-06, "loss": 0.0552, "step": 873 }, { "epoch": 0.4197886647454371, "grad_norm": 0.6948681019679253, "learning_rate": 9.951659600524353e-06, "loss": 0.0527, "step": 874 }, { "epoch": 0.420268972142171, "grad_norm": 0.49290241630164605, "learning_rate": 9.951271177723623e-06, "loss": 0.0506, "step": 875 }, { "epoch": 0.4207492795389049, "grad_norm": 0.8721248981892464, "learning_rate": 9.950881208282755e-06, "loss": 0.0569, "step": 876 }, { "epoch": 0.42122958693563883, "grad_norm": 0.5497841692019572, "learning_rate": 9.950489692323564e-06, "loss": 0.0633, "step": 877 }, { "epoch": 0.4217098943323727, "grad_norm": 0.6397458240250244, "learning_rate": 9.950096629968353e-06, "loss": 0.0537, "step": 878 }, { "epoch": 0.42219020172910665, "grad_norm": 0.5682428472860261, "learning_rate": 9.949702021339897e-06, "loss": 0.0558, "step": 879 }, { "epoch": 0.42267050912584053, "grad_norm": 1.0280188654825122, "learning_rate": 9.949305866561468e-06, "loss": 0.0732, "step": 880 }, { "epoch": 0.42315081652257447, "grad_norm": 0.6451938425896641, "learning_rate": 9.94890816575681e-06, "loss": 0.0668, "step": 881 }, { "epoch": 0.42363112391930835, "grad_norm": 0.45108905962672585, "learning_rate": 9.948508919050153e-06, "loss": 0.0309, "step": 882 }, { "epoch": 0.4241114313160423, "grad_norm": 0.7378292399931741, "learning_rate": 9.948108126566213e-06, "loss": 0.0805, "step": 883 }, { "epoch": 0.42459173871277617, "grad_norm": 0.5519105751376474, "learning_rate": 9.947705788430185e-06, "loss": 0.0521, "step": 884 }, { "epoch": 0.4250720461095101, "grad_norm": 0.6856313077407358, "learning_rate": 9.94730190476775e-06, "loss": 0.0644, "step": 885 }, { "epoch": 0.425552353506244, "grad_norm": 0.5969546888876337, "learning_rate": 9.946896475705067e-06, "loss": 0.0434, "step": 886 }, { "epoch": 0.4260326609029779, "grad_norm": 0.45619128279304955, "learning_rate": 9.946489501368783e-06, "loss": 0.051, "step": 887 }, { "epoch": 0.4265129682997118, "grad_norm": 0.6592474450888789, "learning_rate": 9.946080981886025e-06, "loss": 0.0482, "step": 888 }, { "epoch": 0.42699327569644574, "grad_norm": 0.8000680158848036, "learning_rate": 9.945670917384404e-06, "loss": 0.0779, "step": 889 }, { "epoch": 0.4274735830931796, "grad_norm": 0.5334860843524303, "learning_rate": 9.94525930799201e-06, "loss": 0.0489, "step": 890 }, { "epoch": 0.42795389048991356, "grad_norm": 0.5254575930785399, "learning_rate": 9.944846153837423e-06, "loss": 0.0397, "step": 891 }, { "epoch": 0.42843419788664744, "grad_norm": 0.5660738613547326, "learning_rate": 9.944431455049697e-06, "loss": 0.0529, "step": 892 }, { "epoch": 0.4289145052833814, "grad_norm": 0.7532191005217946, "learning_rate": 9.944015211758375e-06, "loss": 0.0567, "step": 893 }, { "epoch": 0.42939481268011526, "grad_norm": 0.6751729764036046, "learning_rate": 9.943597424093477e-06, "loss": 0.0536, "step": 894 }, { "epoch": 0.4298751200768492, "grad_norm": 0.7399414072106839, "learning_rate": 9.943178092185511e-06, "loss": 0.0627, "step": 895 }, { "epoch": 0.4303554274735831, "grad_norm": 0.5319921340444789, "learning_rate": 9.942757216165464e-06, "loss": 0.0526, "step": 896 }, { "epoch": 0.430835734870317, "grad_norm": 0.9231172091302492, "learning_rate": 9.942334796164805e-06, "loss": 0.0611, "step": 897 }, { "epoch": 0.4313160422670509, "grad_norm": 0.7768175094389944, "learning_rate": 9.941910832315488e-06, "loss": 0.0639, "step": 898 }, { "epoch": 0.43179634966378483, "grad_norm": 0.4825476064702085, "learning_rate": 9.941485324749947e-06, "loss": 0.0518, "step": 899 }, { "epoch": 0.4322766570605187, "grad_norm": 0.7021828010660213, "learning_rate": 9.941058273601097e-06, "loss": 0.0593, "step": 900 }, { "epoch": 0.43275696445725265, "grad_norm": 0.5871945041167091, "learning_rate": 9.94062967900234e-06, "loss": 0.0574, "step": 901 }, { "epoch": 0.4332372718539865, "grad_norm": 0.7091892488086937, "learning_rate": 9.940199541087554e-06, "loss": 0.0709, "step": 902 }, { "epoch": 0.43371757925072046, "grad_norm": 0.5164796945559618, "learning_rate": 9.939767859991104e-06, "loss": 0.0567, "step": 903 }, { "epoch": 0.43419788664745435, "grad_norm": 1.0053328543134505, "learning_rate": 9.939334635847834e-06, "loss": 0.0623, "step": 904 }, { "epoch": 0.4346781940441883, "grad_norm": 1.014246583976263, "learning_rate": 9.938899868793074e-06, "loss": 0.0678, "step": 905 }, { "epoch": 0.43515850144092216, "grad_norm": 0.620672410400972, "learning_rate": 9.93846355896263e-06, "loss": 0.0499, "step": 906 }, { "epoch": 0.4356388088376561, "grad_norm": 0.6018522052869041, "learning_rate": 9.938025706492796e-06, "loss": 0.0633, "step": 907 }, { "epoch": 0.43611911623439004, "grad_norm": 0.7802965838379325, "learning_rate": 9.937586311520342e-06, "loss": 0.0542, "step": 908 }, { "epoch": 0.4365994236311239, "grad_norm": 0.7890750457426956, "learning_rate": 9.937145374182523e-06, "loss": 0.0674, "step": 909 }, { "epoch": 0.43707973102785785, "grad_norm": 0.6932375338768142, "learning_rate": 9.936702894617081e-06, "loss": 0.0705, "step": 910 }, { "epoch": 0.43756003842459174, "grad_norm": 0.6257587333406103, "learning_rate": 9.936258872962229e-06, "loss": 0.0628, "step": 911 }, { "epoch": 0.43804034582132567, "grad_norm": 0.5607233207353541, "learning_rate": 9.935813309356666e-06, "loss": 0.0544, "step": 912 }, { "epoch": 0.43852065321805955, "grad_norm": 0.6321356409882269, "learning_rate": 9.935366203939579e-06, "loss": 0.0491, "step": 913 }, { "epoch": 0.4390009606147935, "grad_norm": 0.6757767770961596, "learning_rate": 9.934917556850625e-06, "loss": 0.0434, "step": 914 }, { "epoch": 0.43948126801152737, "grad_norm": 0.7906338628745093, "learning_rate": 9.934467368229955e-06, "loss": 0.0638, "step": 915 }, { "epoch": 0.4399615754082613, "grad_norm": 0.4652920811833901, "learning_rate": 9.934015638218193e-06, "loss": 0.0519, "step": 916 }, { "epoch": 0.4404418828049952, "grad_norm": 0.5470520415253206, "learning_rate": 9.933562366956445e-06, "loss": 0.0574, "step": 917 }, { "epoch": 0.4409221902017291, "grad_norm": 0.7607179274563025, "learning_rate": 9.933107554586303e-06, "loss": 0.0675, "step": 918 }, { "epoch": 0.441402497598463, "grad_norm": 0.5532199345436009, "learning_rate": 9.93265120124984e-06, "loss": 0.0446, "step": 919 }, { "epoch": 0.44188280499519694, "grad_norm": 0.501284318665828, "learning_rate": 9.932193307089602e-06, "loss": 0.0518, "step": 920 }, { "epoch": 0.4423631123919308, "grad_norm": 0.6013902688897093, "learning_rate": 9.931733872248626e-06, "loss": 0.0584, "step": 921 }, { "epoch": 0.44284341978866476, "grad_norm": 0.5972118740317204, "learning_rate": 9.931272896870427e-06, "loss": 0.0476, "step": 922 }, { "epoch": 0.44332372718539864, "grad_norm": 2.253298791198679, "learning_rate": 9.930810381098999e-06, "loss": 0.0561, "step": 923 }, { "epoch": 0.4438040345821326, "grad_norm": 0.6102253348891266, "learning_rate": 9.93034632507882e-06, "loss": 0.0482, "step": 924 }, { "epoch": 0.44428434197886646, "grad_norm": 0.4713494577341782, "learning_rate": 9.929880728954853e-06, "loss": 0.0371, "step": 925 }, { "epoch": 0.4447646493756004, "grad_norm": 0.6627531547942727, "learning_rate": 9.92941359287253e-06, "loss": 0.0509, "step": 926 }, { "epoch": 0.4452449567723343, "grad_norm": 0.5428810469885857, "learning_rate": 9.928944916977775e-06, "loss": 0.051, "step": 927 }, { "epoch": 0.4457252641690682, "grad_norm": 0.6643297934290499, "learning_rate": 9.92847470141699e-06, "loss": 0.0765, "step": 928 }, { "epoch": 0.4462055715658021, "grad_norm": 0.5681242371551549, "learning_rate": 9.928002946337055e-06, "loss": 0.0487, "step": 929 }, { "epoch": 0.44668587896253603, "grad_norm": 0.45349350065521055, "learning_rate": 9.927529651885334e-06, "loss": 0.0548, "step": 930 }, { "epoch": 0.4471661863592699, "grad_norm": 0.42679372029366797, "learning_rate": 9.92705481820967e-06, "loss": 0.0529, "step": 931 }, { "epoch": 0.44764649375600385, "grad_norm": 0.4695476492659101, "learning_rate": 9.926578445458393e-06, "loss": 0.0414, "step": 932 }, { "epoch": 0.44812680115273773, "grad_norm": 0.5300464019739584, "learning_rate": 9.926100533780304e-06, "loss": 0.0439, "step": 933 }, { "epoch": 0.44860710854947167, "grad_norm": 0.4180296006214648, "learning_rate": 9.92562108332469e-06, "loss": 0.0391, "step": 934 }, { "epoch": 0.44908741594620555, "grad_norm": 0.40241583405442505, "learning_rate": 9.92514009424132e-06, "loss": 0.0523, "step": 935 }, { "epoch": 0.4495677233429395, "grad_norm": 0.4954138376589334, "learning_rate": 9.924657566680438e-06, "loss": 0.058, "step": 936 }, { "epoch": 0.45004803073967337, "grad_norm": 0.6273768947654529, "learning_rate": 9.924173500792775e-06, "loss": 0.055, "step": 937 }, { "epoch": 0.4505283381364073, "grad_norm": 0.3724637804837822, "learning_rate": 9.92368789672954e-06, "loss": 0.0334, "step": 938 }, { "epoch": 0.4510086455331412, "grad_norm": 0.5849978243981226, "learning_rate": 9.923200754642422e-06, "loss": 0.0443, "step": 939 }, { "epoch": 0.4514889529298751, "grad_norm": 0.49650880356005744, "learning_rate": 9.92271207468359e-06, "loss": 0.0436, "step": 940 }, { "epoch": 0.45196926032660906, "grad_norm": 0.44519643938812636, "learning_rate": 9.922221857005693e-06, "loss": 0.0523, "step": 941 }, { "epoch": 0.45244956772334294, "grad_norm": 0.983957220828581, "learning_rate": 9.921730101761865e-06, "loss": 0.0945, "step": 942 }, { "epoch": 0.4529298751200769, "grad_norm": 0.6035634372006827, "learning_rate": 9.921236809105711e-06, "loss": 0.0522, "step": 943 }, { "epoch": 0.45341018251681076, "grad_norm": 0.9095761086634716, "learning_rate": 9.92074197919133e-06, "loss": 0.0482, "step": 944 }, { "epoch": 0.4538904899135447, "grad_norm": 0.7038000923358203, "learning_rate": 9.920245612173288e-06, "loss": 0.06, "step": 945 }, { "epoch": 0.4543707973102786, "grad_norm": 0.5317259635381724, "learning_rate": 9.919747708206635e-06, "loss": 0.0513, "step": 946 }, { "epoch": 0.4548511047070125, "grad_norm": 1.612873760960674, "learning_rate": 9.919248267446904e-06, "loss": 0.0555, "step": 947 }, { "epoch": 0.4553314121037464, "grad_norm": 0.46423671806106315, "learning_rate": 9.918747290050108e-06, "loss": 0.0368, "step": 948 }, { "epoch": 0.45581171950048033, "grad_norm": 0.7137648813885012, "learning_rate": 9.918244776172739e-06, "loss": 0.058, "step": 949 }, { "epoch": 0.4562920268972142, "grad_norm": 0.677167631389359, "learning_rate": 9.917740725971765e-06, "loss": 0.0476, "step": 950 }, { "epoch": 0.45677233429394815, "grad_norm": 0.6324916873646828, "learning_rate": 9.91723513960464e-06, "loss": 0.0524, "step": 951 }, { "epoch": 0.457252641690682, "grad_norm": 0.5324488356285513, "learning_rate": 9.916728017229293e-06, "loss": 0.0398, "step": 952 }, { "epoch": 0.45773294908741596, "grad_norm": 0.6324777151535353, "learning_rate": 9.916219359004137e-06, "loss": 0.0571, "step": 953 }, { "epoch": 0.45821325648414984, "grad_norm": 0.6435220215655985, "learning_rate": 9.915709165088063e-06, "loss": 0.0696, "step": 954 }, { "epoch": 0.4586935638808838, "grad_norm": 0.8164681729134681, "learning_rate": 9.91519743564044e-06, "loss": 0.0611, "step": 955 }, { "epoch": 0.45917387127761766, "grad_norm": 0.5313310873883075, "learning_rate": 9.914684170821119e-06, "loss": 0.0476, "step": 956 }, { "epoch": 0.4596541786743516, "grad_norm": 0.5973052937068528, "learning_rate": 9.91416937079043e-06, "loss": 0.0757, "step": 957 }, { "epoch": 0.4601344860710855, "grad_norm": 0.4440073895198674, "learning_rate": 9.91365303570918e-06, "loss": 0.0435, "step": 958 }, { "epoch": 0.4606147934678194, "grad_norm": 0.5635682614371156, "learning_rate": 9.913135165738661e-06, "loss": 0.0601, "step": 959 }, { "epoch": 0.4610951008645533, "grad_norm": 0.6205543687338353, "learning_rate": 9.91261576104064e-06, "loss": 0.0607, "step": 960 }, { "epoch": 0.46157540826128723, "grad_norm": 0.8474866942319256, "learning_rate": 9.912094821777362e-06, "loss": 0.0508, "step": 961 }, { "epoch": 0.4620557156580211, "grad_norm": 0.4415856929796406, "learning_rate": 9.91157234811156e-06, "loss": 0.0426, "step": 962 }, { "epoch": 0.46253602305475505, "grad_norm": 0.4965072565036624, "learning_rate": 9.911048340206435e-06, "loss": 0.0485, "step": 963 }, { "epoch": 0.46301633045148893, "grad_norm": 0.7207884825617963, "learning_rate": 9.910522798225673e-06, "loss": 0.0767, "step": 964 }, { "epoch": 0.46349663784822287, "grad_norm": 0.6441736297504665, "learning_rate": 9.909995722333442e-06, "loss": 0.0582, "step": 965 }, { "epoch": 0.46397694524495675, "grad_norm": 0.6229851666491163, "learning_rate": 9.909467112694385e-06, "loss": 0.0649, "step": 966 }, { "epoch": 0.4644572526416907, "grad_norm": 0.7004494943555968, "learning_rate": 9.908936969473621e-06, "loss": 0.0479, "step": 967 }, { "epoch": 0.46493756003842457, "grad_norm": 0.7544245132592255, "learning_rate": 9.908405292836758e-06, "loss": 0.0604, "step": 968 }, { "epoch": 0.4654178674351585, "grad_norm": 0.7931595109408737, "learning_rate": 9.907872082949873e-06, "loss": 0.0822, "step": 969 }, { "epoch": 0.4658981748318924, "grad_norm": 0.6826313680933557, "learning_rate": 9.907337339979525e-06, "loss": 0.0481, "step": 970 }, { "epoch": 0.4663784822286263, "grad_norm": 0.5041754294571876, "learning_rate": 9.90680106409276e-06, "loss": 0.0417, "step": 971 }, { "epoch": 0.4668587896253602, "grad_norm": 0.8823164058501882, "learning_rate": 9.906263255457087e-06, "loss": 0.0652, "step": 972 }, { "epoch": 0.46733909702209414, "grad_norm": 0.7871751423538097, "learning_rate": 9.905723914240507e-06, "loss": 0.0654, "step": 973 }, { "epoch": 0.4678194044188281, "grad_norm": 0.8027977963955859, "learning_rate": 9.905183040611498e-06, "loss": 0.0545, "step": 974 }, { "epoch": 0.46829971181556196, "grad_norm": 0.8567181353492188, "learning_rate": 9.904640634739007e-06, "loss": 0.0728, "step": 975 }, { "epoch": 0.4687800192122959, "grad_norm": 0.4735301698046212, "learning_rate": 9.904096696792472e-06, "loss": 0.0402, "step": 976 }, { "epoch": 0.4692603266090298, "grad_norm": 0.9898206363808187, "learning_rate": 9.903551226941801e-06, "loss": 0.078, "step": 977 }, { "epoch": 0.4697406340057637, "grad_norm": 0.6741220652957427, "learning_rate": 9.903004225357387e-06, "loss": 0.0793, "step": 978 }, { "epoch": 0.4702209414024976, "grad_norm": 0.44391076772330806, "learning_rate": 9.902455692210094e-06, "loss": 0.0519, "step": 979 }, { "epoch": 0.47070124879923153, "grad_norm": 0.8124544269575636, "learning_rate": 9.901905627671273e-06, "loss": 0.0534, "step": 980 }, { "epoch": 0.4711815561959654, "grad_norm": 0.7481718577639421, "learning_rate": 9.901354031912746e-06, "loss": 0.0819, "step": 981 }, { "epoch": 0.47166186359269935, "grad_norm": 1.8659551510872119, "learning_rate": 9.900800905106817e-06, "loss": 0.0675, "step": 982 }, { "epoch": 0.47214217098943323, "grad_norm": 0.695535075758422, "learning_rate": 9.900246247426269e-06, "loss": 0.0622, "step": 983 }, { "epoch": 0.47262247838616717, "grad_norm": 0.49991906676132497, "learning_rate": 9.899690059044358e-06, "loss": 0.0573, "step": 984 }, { "epoch": 0.47310278578290105, "grad_norm": 0.8200411105348173, "learning_rate": 9.899132340134825e-06, "loss": 0.0867, "step": 985 }, { "epoch": 0.473583093179635, "grad_norm": 0.6775421926604714, "learning_rate": 9.898573090871885e-06, "loss": 0.05, "step": 986 }, { "epoch": 0.47406340057636887, "grad_norm": 0.5492047748590122, "learning_rate": 9.898012311430232e-06, "loss": 0.0538, "step": 987 }, { "epoch": 0.4745437079731028, "grad_norm": 0.6656920927770863, "learning_rate": 9.897450001985038e-06, "loss": 0.0677, "step": 988 }, { "epoch": 0.4750240153698367, "grad_norm": 1.072363462917926, "learning_rate": 9.896886162711955e-06, "loss": 0.0523, "step": 989 }, { "epoch": 0.4755043227665706, "grad_norm": 0.627671736207023, "learning_rate": 9.896320793787106e-06, "loss": 0.0499, "step": 990 }, { "epoch": 0.4759846301633045, "grad_norm": 0.8567917544335966, "learning_rate": 9.895753895387101e-06, "loss": 0.0564, "step": 991 }, { "epoch": 0.47646493756003844, "grad_norm": 1.0192507334666507, "learning_rate": 9.895185467689022e-06, "loss": 0.0674, "step": 992 }, { "epoch": 0.4769452449567723, "grad_norm": 0.7961715978827638, "learning_rate": 9.894615510870429e-06, "loss": 0.0831, "step": 993 }, { "epoch": 0.47742555235350626, "grad_norm": 0.5483648725578345, "learning_rate": 9.894044025109363e-06, "loss": 0.0485, "step": 994 }, { "epoch": 0.47790585975024014, "grad_norm": 0.8078314585601833, "learning_rate": 9.893471010584337e-06, "loss": 0.0625, "step": 995 }, { "epoch": 0.4783861671469741, "grad_norm": 0.6306541062019995, "learning_rate": 9.892896467474348e-06, "loss": 0.0678, "step": 996 }, { "epoch": 0.47886647454370795, "grad_norm": 0.4985055781130739, "learning_rate": 9.892320395958865e-06, "loss": 0.0508, "step": 997 }, { "epoch": 0.4793467819404419, "grad_norm": 0.5440005750071006, "learning_rate": 9.89174279621784e-06, "loss": 0.0441, "step": 998 }, { "epoch": 0.47982708933717577, "grad_norm": 0.7286303002598773, "learning_rate": 9.891163668431696e-06, "loss": 0.0555, "step": 999 }, { "epoch": 0.4803073967339097, "grad_norm": 0.5690411831803118, "learning_rate": 9.890583012781338e-06, "loss": 0.0458, "step": 1000 }, { "epoch": 0.4807877041306436, "grad_norm": 0.46362095698553785, "learning_rate": 9.890000829448145e-06, "loss": 0.0385, "step": 1001 }, { "epoch": 0.4812680115273775, "grad_norm": 0.6543145086812504, "learning_rate": 9.889417118613978e-06, "loss": 0.0683, "step": 1002 }, { "epoch": 0.4817483189241114, "grad_norm": 0.556537310472474, "learning_rate": 9.888831880461171e-06, "loss": 0.0454, "step": 1003 }, { "epoch": 0.48222862632084534, "grad_norm": 0.6944965802448689, "learning_rate": 9.888245115172535e-06, "loss": 0.0499, "step": 1004 }, { "epoch": 0.4827089337175792, "grad_norm": 1.9875426185604483, "learning_rate": 9.88765682293136e-06, "loss": 0.0623, "step": 1005 }, { "epoch": 0.48318924111431316, "grad_norm": 0.5589465167484113, "learning_rate": 9.887067003921412e-06, "loss": 0.0549, "step": 1006 }, { "epoch": 0.48366954851104704, "grad_norm": 0.5825022233011566, "learning_rate": 9.886475658326935e-06, "loss": 0.0415, "step": 1007 }, { "epoch": 0.484149855907781, "grad_norm": 0.8085519967345373, "learning_rate": 9.885882786332647e-06, "loss": 0.0618, "step": 1008 }, { "epoch": 0.4846301633045149, "grad_norm": 0.6678730905959983, "learning_rate": 9.885288388123748e-06, "loss": 0.0466, "step": 1009 }, { "epoch": 0.4851104707012488, "grad_norm": 0.6528204330501064, "learning_rate": 9.88469246388591e-06, "loss": 0.0462, "step": 1010 }, { "epoch": 0.48559077809798273, "grad_norm": 0.50393839362882, "learning_rate": 9.884095013805282e-06, "loss": 0.0418, "step": 1011 }, { "epoch": 0.4860710854947166, "grad_norm": 0.6274523697170888, "learning_rate": 9.88349603806849e-06, "loss": 0.0687, "step": 1012 }, { "epoch": 0.48655139289145055, "grad_norm": 0.6366989175368812, "learning_rate": 9.882895536862643e-06, "loss": 0.0485, "step": 1013 }, { "epoch": 0.48703170028818443, "grad_norm": 0.48888844573115503, "learning_rate": 9.882293510375314e-06, "loss": 0.0512, "step": 1014 }, { "epoch": 0.48751200768491837, "grad_norm": 0.5932068169061537, "learning_rate": 9.881689958794564e-06, "loss": 0.0488, "step": 1015 }, { "epoch": 0.48799231508165225, "grad_norm": 1.2627441745695316, "learning_rate": 9.881084882308924e-06, "loss": 0.0714, "step": 1016 }, { "epoch": 0.4884726224783862, "grad_norm": 0.9627073875348847, "learning_rate": 9.880478281107404e-06, "loss": 0.0577, "step": 1017 }, { "epoch": 0.48895292987512007, "grad_norm": 0.7088884846326223, "learning_rate": 9.87987015537949e-06, "loss": 0.0874, "step": 1018 }, { "epoch": 0.489433237271854, "grad_norm": 0.7529185355526793, "learning_rate": 9.879260505315143e-06, "loss": 0.0497, "step": 1019 }, { "epoch": 0.4899135446685879, "grad_norm": 0.6165894758520094, "learning_rate": 9.878649331104798e-06, "loss": 0.0604, "step": 1020 }, { "epoch": 0.4903938520653218, "grad_norm": 0.5042776271300216, "learning_rate": 9.878036632939374e-06, "loss": 0.047, "step": 1021 }, { "epoch": 0.4908741594620557, "grad_norm": 0.44643658788918444, "learning_rate": 9.877422411010257e-06, "loss": 0.0498, "step": 1022 }, { "epoch": 0.49135446685878964, "grad_norm": 0.5634531390244911, "learning_rate": 9.876806665509314e-06, "loss": 0.0569, "step": 1023 }, { "epoch": 0.4918347742555235, "grad_norm": 0.5619776061410058, "learning_rate": 9.876189396628889e-06, "loss": 0.0446, "step": 1024 }, { "epoch": 0.49231508165225746, "grad_norm": 0.9390023063566659, "learning_rate": 9.875570604561796e-06, "loss": 0.0792, "step": 1025 }, { "epoch": 0.49279538904899134, "grad_norm": 0.5013365650894346, "learning_rate": 9.874950289501332e-06, "loss": 0.0519, "step": 1026 }, { "epoch": 0.4932756964457253, "grad_norm": 0.46104032503551207, "learning_rate": 9.874328451641264e-06, "loss": 0.0458, "step": 1027 }, { "epoch": 0.49375600384245916, "grad_norm": 0.46235258731993134, "learning_rate": 9.873705091175838e-06, "loss": 0.0417, "step": 1028 }, { "epoch": 0.4942363112391931, "grad_norm": 0.49971536792567545, "learning_rate": 9.873080208299773e-06, "loss": 0.045, "step": 1029 }, { "epoch": 0.494716618635927, "grad_norm": 0.5340542754950781, "learning_rate": 9.872453803208268e-06, "loss": 0.0455, "step": 1030 }, { "epoch": 0.4951969260326609, "grad_norm": 0.5727266302637889, "learning_rate": 9.871825876096992e-06, "loss": 0.0461, "step": 1031 }, { "epoch": 0.4956772334293948, "grad_norm": 0.6471275463338492, "learning_rate": 9.871196427162094e-06, "loss": 0.0531, "step": 1032 }, { "epoch": 0.49615754082612873, "grad_norm": 0.8462618075680615, "learning_rate": 9.870565456600194e-06, "loss": 0.0913, "step": 1033 }, { "epoch": 0.4966378482228626, "grad_norm": 0.6248139766098908, "learning_rate": 9.869932964608392e-06, "loss": 0.0488, "step": 1034 }, { "epoch": 0.49711815561959655, "grad_norm": 0.5031224522406227, "learning_rate": 9.86929895138426e-06, "loss": 0.0463, "step": 1035 }, { "epoch": 0.49759846301633043, "grad_norm": 0.6015714100591678, "learning_rate": 9.868663417125849e-06, "loss": 0.0543, "step": 1036 }, { "epoch": 0.49807877041306436, "grad_norm": 0.7015166961244664, "learning_rate": 9.868026362031676e-06, "loss": 0.0577, "step": 1037 }, { "epoch": 0.49855907780979825, "grad_norm": 0.5811321733944121, "learning_rate": 9.867387786300743e-06, "loss": 0.046, "step": 1038 }, { "epoch": 0.4990393852065322, "grad_norm": 0.9085434522653888, "learning_rate": 9.866747690132527e-06, "loss": 0.073, "step": 1039 }, { "epoch": 0.49951969260326606, "grad_norm": 0.6591822101096578, "learning_rate": 9.866106073726971e-06, "loss": 0.0567, "step": 1040 }, { "epoch": 0.5, "grad_norm": 0.835558730775927, "learning_rate": 9.865462937284501e-06, "loss": 0.0535, "step": 1041 }, { "epoch": 0.5004803073967339, "grad_norm": 0.5892727698062462, "learning_rate": 9.864818281006013e-06, "loss": 0.0439, "step": 1042 }, { "epoch": 0.5009606147934679, "grad_norm": 0.8395833865727974, "learning_rate": 9.86417210509288e-06, "loss": 0.0718, "step": 1043 }, { "epoch": 0.5014409221902018, "grad_norm": 1.1531297612195082, "learning_rate": 9.86352440974695e-06, "loss": 0.0701, "step": 1044 }, { "epoch": 0.5019212295869356, "grad_norm": 0.5682920886173015, "learning_rate": 9.862875195170547e-06, "loss": 0.0478, "step": 1045 }, { "epoch": 0.5024015369836695, "grad_norm": 0.5479631332409276, "learning_rate": 9.862224461566467e-06, "loss": 0.0481, "step": 1046 }, { "epoch": 0.5028818443804035, "grad_norm": 0.6686914490763103, "learning_rate": 9.861572209137978e-06, "loss": 0.054, "step": 1047 }, { "epoch": 0.5033621517771374, "grad_norm": 0.5548966562906734, "learning_rate": 9.860918438088828e-06, "loss": 0.058, "step": 1048 }, { "epoch": 0.5038424591738713, "grad_norm": 0.6221834543978872, "learning_rate": 9.860263148623238e-06, "loss": 0.0663, "step": 1049 }, { "epoch": 0.5043227665706052, "grad_norm": 0.4096266159270642, "learning_rate": 9.859606340945904e-06, "loss": 0.0339, "step": 1050 }, { "epoch": 0.5048030739673391, "grad_norm": 1.091503069790824, "learning_rate": 9.858948015261988e-06, "loss": 0.0628, "step": 1051 }, { "epoch": 0.505283381364073, "grad_norm": 0.6683717006752189, "learning_rate": 9.858288171777137e-06, "loss": 0.0588, "step": 1052 }, { "epoch": 0.5057636887608069, "grad_norm": 0.6905695109286866, "learning_rate": 9.857626810697468e-06, "loss": 0.0633, "step": 1053 }, { "epoch": 0.5062439961575408, "grad_norm": 0.5075936442073656, "learning_rate": 9.85696393222957e-06, "loss": 0.0475, "step": 1054 }, { "epoch": 0.5067243035542748, "grad_norm": 0.59460142658833, "learning_rate": 9.856299536580511e-06, "loss": 0.0494, "step": 1055 }, { "epoch": 0.5072046109510087, "grad_norm": 1.6512164384609662, "learning_rate": 9.855633623957828e-06, "loss": 0.0727, "step": 1056 }, { "epoch": 0.5076849183477425, "grad_norm": 0.5580012014124549, "learning_rate": 9.854966194569533e-06, "loss": 0.0441, "step": 1057 }, { "epoch": 0.5081652257444764, "grad_norm": 0.5763621029835471, "learning_rate": 9.854297248624113e-06, "loss": 0.0477, "step": 1058 }, { "epoch": 0.5086455331412104, "grad_norm": 0.8992511423911591, "learning_rate": 9.853626786330529e-06, "loss": 0.0812, "step": 1059 }, { "epoch": 0.5091258405379443, "grad_norm": 0.5368922969181873, "learning_rate": 9.852954807898212e-06, "loss": 0.0467, "step": 1060 }, { "epoch": 0.5096061479346782, "grad_norm": 0.44033960156979046, "learning_rate": 9.852281313537074e-06, "loss": 0.0426, "step": 1061 }, { "epoch": 0.5100864553314121, "grad_norm": 0.46569533955278264, "learning_rate": 9.851606303457492e-06, "loss": 0.0423, "step": 1062 }, { "epoch": 0.510566762728146, "grad_norm": 0.6709478469641016, "learning_rate": 9.850929777870324e-06, "loss": 0.0635, "step": 1063 }, { "epoch": 0.5110470701248799, "grad_norm": 0.7589418223076211, "learning_rate": 9.850251736986895e-06, "loss": 0.0485, "step": 1064 }, { "epoch": 0.5115273775216138, "grad_norm": 0.5039707034476073, "learning_rate": 9.849572181019008e-06, "loss": 0.0396, "step": 1065 }, { "epoch": 0.5120076849183477, "grad_norm": 0.665169242046722, "learning_rate": 9.848891110178936e-06, "loss": 0.056, "step": 1066 }, { "epoch": 0.5124879923150817, "grad_norm": 0.7006135018748972, "learning_rate": 9.848208524679426e-06, "loss": 0.0545, "step": 1067 }, { "epoch": 0.5129682997118156, "grad_norm": 1.371433470057797, "learning_rate": 9.847524424733701e-06, "loss": 0.0592, "step": 1068 }, { "epoch": 0.5134486071085494, "grad_norm": 0.5371431774356002, "learning_rate": 9.846838810555454e-06, "loss": 0.0481, "step": 1069 }, { "epoch": 0.5139289145052833, "grad_norm": 0.9546711896225505, "learning_rate": 9.846151682358853e-06, "loss": 0.0605, "step": 1070 }, { "epoch": 0.5144092219020173, "grad_norm": 0.4469628087936052, "learning_rate": 9.845463040358538e-06, "loss": 0.0461, "step": 1071 }, { "epoch": 0.5148895292987512, "grad_norm": 0.7337259088697539, "learning_rate": 9.84477288476962e-06, "loss": 0.0344, "step": 1072 }, { "epoch": 0.5153698366954851, "grad_norm": 0.6176921441355193, "learning_rate": 9.844081215807684e-06, "loss": 0.0554, "step": 1073 }, { "epoch": 0.515850144092219, "grad_norm": 0.7254549945035398, "learning_rate": 9.843388033688789e-06, "loss": 0.0406, "step": 1074 }, { "epoch": 0.516330451488953, "grad_norm": 0.6081358712579048, "learning_rate": 9.842693338629468e-06, "loss": 0.0607, "step": 1075 }, { "epoch": 0.5168107588856868, "grad_norm": 0.5697699264079668, "learning_rate": 9.84199713084672e-06, "loss": 0.0693, "step": 1076 }, { "epoch": 0.5172910662824207, "grad_norm": 0.3687520833244058, "learning_rate": 9.841299410558026e-06, "loss": 0.035, "step": 1077 }, { "epoch": 0.5177713736791547, "grad_norm": 0.6188868823780803, "learning_rate": 9.840600177981331e-06, "loss": 0.0378, "step": 1078 }, { "epoch": 0.5182516810758886, "grad_norm": 0.44919065810406017, "learning_rate": 9.839899433335059e-06, "loss": 0.0569, "step": 1079 }, { "epoch": 0.5187319884726225, "grad_norm": 0.525069381664302, "learning_rate": 9.839197176838102e-06, "loss": 0.0534, "step": 1080 }, { "epoch": 0.5192122958693564, "grad_norm": 0.5212151097899781, "learning_rate": 9.838493408709823e-06, "loss": 0.051, "step": 1081 }, { "epoch": 0.5196926032660903, "grad_norm": 0.6605149345379034, "learning_rate": 9.837788129170063e-06, "loss": 0.076, "step": 1082 }, { "epoch": 0.5201729106628242, "grad_norm": 0.7131108392835402, "learning_rate": 9.83708133843913e-06, "loss": 0.0723, "step": 1083 }, { "epoch": 0.5206532180595581, "grad_norm": 0.5238463836985972, "learning_rate": 9.836373036737805e-06, "loss": 0.0476, "step": 1084 }, { "epoch": 0.521133525456292, "grad_norm": 0.4394705568289204, "learning_rate": 9.835663224287343e-06, "loss": 0.052, "step": 1085 }, { "epoch": 0.521613832853026, "grad_norm": 0.6627060321697689, "learning_rate": 9.834951901309473e-06, "loss": 0.0514, "step": 1086 }, { "epoch": 0.5220941402497599, "grad_norm": 0.6047813561651657, "learning_rate": 9.834239068026388e-06, "loss": 0.0662, "step": 1087 }, { "epoch": 0.5225744476464937, "grad_norm": 0.608811144599817, "learning_rate": 9.83352472466076e-06, "loss": 0.046, "step": 1088 }, { "epoch": 0.5230547550432276, "grad_norm": 1.158801190471405, "learning_rate": 9.832808871435728e-06, "loss": 0.0524, "step": 1089 }, { "epoch": 0.5235350624399616, "grad_norm": 0.46635581913051444, "learning_rate": 9.832091508574906e-06, "loss": 0.0449, "step": 1090 }, { "epoch": 0.5240153698366955, "grad_norm": 0.48367373014821347, "learning_rate": 9.831372636302379e-06, "loss": 0.0458, "step": 1091 }, { "epoch": 0.5244956772334294, "grad_norm": 0.361218140616094, "learning_rate": 9.8306522548427e-06, "loss": 0.04, "step": 1092 }, { "epoch": 0.5249759846301633, "grad_norm": 0.655690358593338, "learning_rate": 9.829930364420902e-06, "loss": 0.0554, "step": 1093 }, { "epoch": 0.5254562920268973, "grad_norm": 0.7505215320060055, "learning_rate": 9.829206965262477e-06, "loss": 0.0743, "step": 1094 }, { "epoch": 0.5259365994236311, "grad_norm": 0.5750818638699632, "learning_rate": 9.828482057593397e-06, "loss": 0.0576, "step": 1095 }, { "epoch": 0.526416906820365, "grad_norm": 0.47388206139707323, "learning_rate": 9.827755641640105e-06, "loss": 0.0379, "step": 1096 }, { "epoch": 0.5268972142170989, "grad_norm": 0.5077311880425335, "learning_rate": 9.827027717629511e-06, "loss": 0.0623, "step": 1097 }, { "epoch": 0.5273775216138329, "grad_norm": 0.45204043027410235, "learning_rate": 9.826298285789002e-06, "loss": 0.0398, "step": 1098 }, { "epoch": 0.5278578290105668, "grad_norm": 0.7965027074583484, "learning_rate": 9.825567346346427e-06, "loss": 0.0678, "step": 1099 }, { "epoch": 0.5283381364073007, "grad_norm": 0.6917329847032074, "learning_rate": 9.824834899530116e-06, "loss": 0.0619, "step": 1100 }, { "epoch": 0.5288184438040345, "grad_norm": 0.5524043158252445, "learning_rate": 9.824100945568862e-06, "loss": 0.0465, "step": 1101 }, { "epoch": 0.5292987512007685, "grad_norm": 0.5638562567517995, "learning_rate": 9.823365484691933e-06, "loss": 0.0466, "step": 1102 }, { "epoch": 0.5297790585975024, "grad_norm": 0.8316442556169921, "learning_rate": 9.822628517129067e-06, "loss": 0.0881, "step": 1103 }, { "epoch": 0.5302593659942363, "grad_norm": 0.5548828049736547, "learning_rate": 9.821890043110471e-06, "loss": 0.0518, "step": 1104 }, { "epoch": 0.5307396733909702, "grad_norm": 0.41550027200381046, "learning_rate": 9.821150062866826e-06, "loss": 0.0408, "step": 1105 }, { "epoch": 0.5312199807877042, "grad_norm": 0.5895290692704226, "learning_rate": 9.82040857662928e-06, "loss": 0.051, "step": 1106 }, { "epoch": 0.531700288184438, "grad_norm": 0.5248740479596763, "learning_rate": 9.819665584629453e-06, "loss": 0.0561, "step": 1107 }, { "epoch": 0.5321805955811719, "grad_norm": 0.6505572311350228, "learning_rate": 9.818921087099435e-06, "loss": 0.059, "step": 1108 }, { "epoch": 0.5326609029779059, "grad_norm": 0.6556567838840428, "learning_rate": 9.818175084271786e-06, "loss": 0.0559, "step": 1109 }, { "epoch": 0.5331412103746398, "grad_norm": 0.5382916731099718, "learning_rate": 9.817427576379536e-06, "loss": 0.0485, "step": 1110 }, { "epoch": 0.5336215177713737, "grad_norm": 0.3758870057448877, "learning_rate": 9.81667856365619e-06, "loss": 0.0496, "step": 1111 }, { "epoch": 0.5341018251681076, "grad_norm": 0.6578831277841793, "learning_rate": 9.815928046335713e-06, "loss": 0.0557, "step": 1112 }, { "epoch": 0.5345821325648416, "grad_norm": 1.060321357406775, "learning_rate": 9.81517602465255e-06, "loss": 0.0532, "step": 1113 }, { "epoch": 0.5350624399615754, "grad_norm": 0.5103225144908645, "learning_rate": 9.81442249884161e-06, "loss": 0.035, "step": 1114 }, { "epoch": 0.5355427473583093, "grad_norm": 0.37825548058600067, "learning_rate": 9.813667469138273e-06, "loss": 0.0429, "step": 1115 }, { "epoch": 0.5360230547550432, "grad_norm": 0.805237319756677, "learning_rate": 9.812910935778393e-06, "loss": 0.068, "step": 1116 }, { "epoch": 0.5365033621517772, "grad_norm": 0.42363680651332974, "learning_rate": 9.812152898998286e-06, "loss": 0.0469, "step": 1117 }, { "epoch": 0.5369836695485111, "grad_norm": 0.6106525825949898, "learning_rate": 9.811393359034742e-06, "loss": 0.0542, "step": 1118 }, { "epoch": 0.537463976945245, "grad_norm": 0.8001537869444254, "learning_rate": 9.810632316125023e-06, "loss": 0.0565, "step": 1119 }, { "epoch": 0.5379442843419788, "grad_norm": 0.7262847802024673, "learning_rate": 9.809869770506855e-06, "loss": 0.0695, "step": 1120 }, { "epoch": 0.5384245917387128, "grad_norm": 1.1087961916090308, "learning_rate": 9.80910572241844e-06, "loss": 0.0657, "step": 1121 }, { "epoch": 0.5389048991354467, "grad_norm": 0.41628123944641576, "learning_rate": 9.80834017209844e-06, "loss": 0.0463, "step": 1122 }, { "epoch": 0.5393852065321806, "grad_norm": 0.5808409195734693, "learning_rate": 9.807573119785995e-06, "loss": 0.0606, "step": 1123 }, { "epoch": 0.5398655139289145, "grad_norm": 0.49580631030323374, "learning_rate": 9.806804565720712e-06, "loss": 0.0302, "step": 1124 }, { "epoch": 0.5403458213256485, "grad_norm": 1.0814853755104163, "learning_rate": 9.806034510142664e-06, "loss": 0.0675, "step": 1125 }, { "epoch": 0.5408261287223823, "grad_norm": 0.549003216277453, "learning_rate": 9.805262953292395e-06, "loss": 0.0533, "step": 1126 }, { "epoch": 0.5413064361191162, "grad_norm": 0.5609756437216896, "learning_rate": 9.80448989541092e-06, "loss": 0.057, "step": 1127 }, { "epoch": 0.5417867435158501, "grad_norm": 0.9338428086220139, "learning_rate": 9.803715336739717e-06, "loss": 0.0834, "step": 1128 }, { "epoch": 0.5422670509125841, "grad_norm": 0.5122018899106414, "learning_rate": 9.802939277520742e-06, "loss": 0.0484, "step": 1129 }, { "epoch": 0.542747358309318, "grad_norm": 0.5809571628506907, "learning_rate": 9.80216171799641e-06, "loss": 0.0446, "step": 1130 }, { "epoch": 0.5432276657060519, "grad_norm": 0.5453610787253219, "learning_rate": 9.801382658409611e-06, "loss": 0.0605, "step": 1131 }, { "epoch": 0.5437079731027857, "grad_norm": 0.45815751598603865, "learning_rate": 9.800602099003702e-06, "loss": 0.0441, "step": 1132 }, { "epoch": 0.5441882804995197, "grad_norm": 0.618974171564184, "learning_rate": 9.799820040022507e-06, "loss": 0.0565, "step": 1133 }, { "epoch": 0.5446685878962536, "grad_norm": 0.577552914326065, "learning_rate": 9.79903648171032e-06, "loss": 0.0625, "step": 1134 }, { "epoch": 0.5451488952929875, "grad_norm": 0.674317185574733, "learning_rate": 9.798251424311904e-06, "loss": 0.0601, "step": 1135 }, { "epoch": 0.5456292026897214, "grad_norm": 0.5368692747640629, "learning_rate": 9.797464868072489e-06, "loss": 0.0425, "step": 1136 }, { "epoch": 0.5461095100864554, "grad_norm": 0.8059063068692716, "learning_rate": 9.79667681323777e-06, "loss": 0.0607, "step": 1137 }, { "epoch": 0.5465898174831892, "grad_norm": 0.44591141730781386, "learning_rate": 9.795887260053918e-06, "loss": 0.0458, "step": 1138 }, { "epoch": 0.5470701248799231, "grad_norm": 0.49579824081441304, "learning_rate": 9.795096208767565e-06, "loss": 0.0417, "step": 1139 }, { "epoch": 0.547550432276657, "grad_norm": 0.9974120235484584, "learning_rate": 9.794303659625815e-06, "loss": 0.067, "step": 1140 }, { "epoch": 0.548030739673391, "grad_norm": 0.6293616673344423, "learning_rate": 9.793509612876237e-06, "loss": 0.0574, "step": 1141 }, { "epoch": 0.5485110470701249, "grad_norm": 0.5069087987447661, "learning_rate": 9.792714068766872e-06, "loss": 0.0507, "step": 1142 }, { "epoch": 0.5489913544668588, "grad_norm": 0.44270498114612894, "learning_rate": 9.791917027546223e-06, "loss": 0.0511, "step": 1143 }, { "epoch": 0.5494716618635928, "grad_norm": 0.5614291657476101, "learning_rate": 9.791118489463265e-06, "loss": 0.0482, "step": 1144 }, { "epoch": 0.5499519692603266, "grad_norm": 0.6022614849565968, "learning_rate": 9.790318454767438e-06, "loss": 0.0433, "step": 1145 }, { "epoch": 0.5504322766570605, "grad_norm": 0.4960237823747353, "learning_rate": 9.78951692370865e-06, "loss": 0.0618, "step": 1146 }, { "epoch": 0.5509125840537944, "grad_norm": 0.7042914518317633, "learning_rate": 9.78871389653728e-06, "loss": 0.058, "step": 1147 }, { "epoch": 0.5513928914505284, "grad_norm": 0.6312566271413287, "learning_rate": 9.787909373504172e-06, "loss": 0.0429, "step": 1148 }, { "epoch": 0.5518731988472623, "grad_norm": 0.39432463738714213, "learning_rate": 9.787103354860633e-06, "loss": 0.0346, "step": 1149 }, { "epoch": 0.5523535062439962, "grad_norm": 0.9311799513518504, "learning_rate": 9.786295840858444e-06, "loss": 0.0658, "step": 1150 }, { "epoch": 0.55283381364073, "grad_norm": 0.5007348827746324, "learning_rate": 9.785486831749847e-06, "loss": 0.0452, "step": 1151 }, { "epoch": 0.553314121037464, "grad_norm": 0.49962024985272985, "learning_rate": 9.784676327787557e-06, "loss": 0.0568, "step": 1152 }, { "epoch": 0.5537944284341979, "grad_norm": 0.3691640957902528, "learning_rate": 9.783864329224752e-06, "loss": 0.0451, "step": 1153 }, { "epoch": 0.5542747358309318, "grad_norm": 0.413567800244177, "learning_rate": 9.783050836315078e-06, "loss": 0.0472, "step": 1154 }, { "epoch": 0.5547550432276657, "grad_norm": 0.39518046681000146, "learning_rate": 9.782235849312647e-06, "loss": 0.041, "step": 1155 }, { "epoch": 0.5552353506243997, "grad_norm": 0.49523548669900735, "learning_rate": 9.781419368472039e-06, "loss": 0.0441, "step": 1156 }, { "epoch": 0.5557156580211335, "grad_norm": 0.6024044401609281, "learning_rate": 9.7806013940483e-06, "loss": 0.0536, "step": 1157 }, { "epoch": 0.5561959654178674, "grad_norm": 1.2088324636184875, "learning_rate": 9.779781926296942e-06, "loss": 0.0708, "step": 1158 }, { "epoch": 0.5566762728146013, "grad_norm": 0.6166764403933446, "learning_rate": 9.778960965473945e-06, "loss": 0.0482, "step": 1159 }, { "epoch": 0.5571565802113353, "grad_norm": 0.9288628974217886, "learning_rate": 9.778138511835753e-06, "loss": 0.0672, "step": 1160 }, { "epoch": 0.5576368876080692, "grad_norm": 1.2330701366219725, "learning_rate": 9.77731456563928e-06, "loss": 0.0558, "step": 1161 }, { "epoch": 0.5581171950048031, "grad_norm": 0.4791648541731939, "learning_rate": 9.776489127141902e-06, "loss": 0.043, "step": 1162 }, { "epoch": 0.5585975024015369, "grad_norm": 0.5709504036661929, "learning_rate": 9.775662196601464e-06, "loss": 0.0481, "step": 1163 }, { "epoch": 0.5590778097982709, "grad_norm": 0.6468021810734045, "learning_rate": 9.774833774276278e-06, "loss": 0.0541, "step": 1164 }, { "epoch": 0.5595581171950048, "grad_norm": 0.46892414876617317, "learning_rate": 9.774003860425116e-06, "loss": 0.0408, "step": 1165 }, { "epoch": 0.5600384245917387, "grad_norm": 0.7110924147202355, "learning_rate": 9.773172455307223e-06, "loss": 0.0869, "step": 1166 }, { "epoch": 0.5605187319884726, "grad_norm": 0.5925406633330682, "learning_rate": 9.772339559182307e-06, "loss": 0.038, "step": 1167 }, { "epoch": 0.5609990393852066, "grad_norm": 0.4344853797907911, "learning_rate": 9.77150517231054e-06, "loss": 0.0448, "step": 1168 }, { "epoch": 0.5614793467819404, "grad_norm": 1.5378334777798277, "learning_rate": 9.770669294952562e-06, "loss": 0.0455, "step": 1169 }, { "epoch": 0.5619596541786743, "grad_norm": 0.661380367146787, "learning_rate": 9.76983192736948e-06, "loss": 0.0646, "step": 1170 }, { "epoch": 0.5624399615754082, "grad_norm": 0.6292936111066385, "learning_rate": 9.768993069822862e-06, "loss": 0.0543, "step": 1171 }, { "epoch": 0.5629202689721422, "grad_norm": 0.5168700137660031, "learning_rate": 9.768152722574747e-06, "loss": 0.0482, "step": 1172 }, { "epoch": 0.5634005763688761, "grad_norm": 0.48793915527548615, "learning_rate": 9.76731088588763e-06, "loss": 0.052, "step": 1173 }, { "epoch": 0.56388088376561, "grad_norm": 0.8671102690781609, "learning_rate": 9.766467560024485e-06, "loss": 0.07, "step": 1174 }, { "epoch": 0.5643611911623438, "grad_norm": 0.5202999857616358, "learning_rate": 9.765622745248739e-06, "loss": 0.0541, "step": 1175 }, { "epoch": 0.5648414985590778, "grad_norm": 1.2213889314313011, "learning_rate": 9.76477644182429e-06, "loss": 0.045, "step": 1176 }, { "epoch": 0.5653218059558117, "grad_norm": 0.5437848268666584, "learning_rate": 9.7639286500155e-06, "loss": 0.0434, "step": 1177 }, { "epoch": 0.5658021133525456, "grad_norm": 0.46403176390550605, "learning_rate": 9.763079370087196e-06, "loss": 0.0352, "step": 1178 }, { "epoch": 0.5662824207492796, "grad_norm": 0.7171751460163477, "learning_rate": 9.762228602304667e-06, "loss": 0.0578, "step": 1179 }, { "epoch": 0.5667627281460135, "grad_norm": 0.48339364673648727, "learning_rate": 9.761376346933672e-06, "loss": 0.0464, "step": 1180 }, { "epoch": 0.5672430355427474, "grad_norm": 0.4414131140482316, "learning_rate": 9.760522604240434e-06, "loss": 0.0356, "step": 1181 }, { "epoch": 0.5677233429394812, "grad_norm": 0.995263931495515, "learning_rate": 9.759667374491632e-06, "loss": 0.064, "step": 1182 }, { "epoch": 0.5682036503362152, "grad_norm": 0.6310705676300887, "learning_rate": 9.758810657954424e-06, "loss": 0.0643, "step": 1183 }, { "epoch": 0.5686839577329491, "grad_norm": 0.8865712295477853, "learning_rate": 9.757952454896418e-06, "loss": 0.0707, "step": 1184 }, { "epoch": 0.569164265129683, "grad_norm": 0.44330618752194106, "learning_rate": 9.757092765585695e-06, "loss": 0.0427, "step": 1185 }, { "epoch": 0.5696445725264169, "grad_norm": 0.5101911660662402, "learning_rate": 9.7562315902908e-06, "loss": 0.0612, "step": 1186 }, { "epoch": 0.5701248799231509, "grad_norm": 0.3634609856460933, "learning_rate": 9.755368929280738e-06, "loss": 0.0359, "step": 1187 }, { "epoch": 0.5706051873198847, "grad_norm": 0.8929703488376083, "learning_rate": 9.754504782824982e-06, "loss": 0.0608, "step": 1188 }, { "epoch": 0.5710854947166186, "grad_norm": 0.7621720458573135, "learning_rate": 9.753639151193468e-06, "loss": 0.0747, "step": 1189 }, { "epoch": 0.5715658021133525, "grad_norm": 0.4562290573979324, "learning_rate": 9.752772034656593e-06, "loss": 0.0453, "step": 1190 }, { "epoch": 0.5720461095100865, "grad_norm": 0.454374886984673, "learning_rate": 9.75190343348522e-06, "loss": 0.046, "step": 1191 }, { "epoch": 0.5725264169068204, "grad_norm": 0.4985067047730674, "learning_rate": 9.75103334795068e-06, "loss": 0.0465, "step": 1192 }, { "epoch": 0.5730067243035543, "grad_norm": 0.62518531188208, "learning_rate": 9.750161778324759e-06, "loss": 0.0546, "step": 1193 }, { "epoch": 0.5734870317002881, "grad_norm": 0.5218968728591356, "learning_rate": 9.749288724879716e-06, "loss": 0.0451, "step": 1194 }, { "epoch": 0.5739673390970221, "grad_norm": 0.520699846944625, "learning_rate": 9.748414187888262e-06, "loss": 0.0511, "step": 1195 }, { "epoch": 0.574447646493756, "grad_norm": 0.7212870071975475, "learning_rate": 9.747538167623585e-06, "loss": 0.0539, "step": 1196 }, { "epoch": 0.5749279538904899, "grad_norm": 0.41354709356039293, "learning_rate": 9.746660664359326e-06, "loss": 0.0348, "step": 1197 }, { "epoch": 0.5754082612872238, "grad_norm": 0.47323694276958017, "learning_rate": 9.745781678369594e-06, "loss": 0.0342, "step": 1198 }, { "epoch": 0.5758885686839578, "grad_norm": 0.7798319365982702, "learning_rate": 9.744901209928959e-06, "loss": 0.0386, "step": 1199 }, { "epoch": 0.5763688760806917, "grad_norm": 0.41388948108418533, "learning_rate": 9.744019259312454e-06, "loss": 0.0404, "step": 1200 }, { "epoch": 0.5768491834774255, "grad_norm": 0.45590776454332793, "learning_rate": 9.74313582679558e-06, "loss": 0.059, "step": 1201 }, { "epoch": 0.5773294908741594, "grad_norm": 0.8624611717777299, "learning_rate": 9.742250912654292e-06, "loss": 0.0573, "step": 1202 }, { "epoch": 0.5778097982708934, "grad_norm": 0.4105792475406539, "learning_rate": 9.741364517165017e-06, "loss": 0.0474, "step": 1203 }, { "epoch": 0.5782901056676273, "grad_norm": 0.4030691281347931, "learning_rate": 9.740476640604637e-06, "loss": 0.0533, "step": 1204 }, { "epoch": 0.5787704130643612, "grad_norm": 0.8208559639247031, "learning_rate": 9.7395872832505e-06, "loss": 0.0491, "step": 1205 }, { "epoch": 0.579250720461095, "grad_norm": 0.6089308840844816, "learning_rate": 9.73869644538042e-06, "loss": 0.069, "step": 1206 }, { "epoch": 0.579731027857829, "grad_norm": 0.6462057166604767, "learning_rate": 9.737804127272668e-06, "loss": 0.051, "step": 1207 }, { "epoch": 0.5802113352545629, "grad_norm": 0.4269852827384148, "learning_rate": 9.73691032920598e-06, "loss": 0.0434, "step": 1208 }, { "epoch": 0.5806916426512968, "grad_norm": 0.4245254269716441, "learning_rate": 9.736015051459551e-06, "loss": 0.0372, "step": 1209 }, { "epoch": 0.5811719500480308, "grad_norm": 0.3532506765163264, "learning_rate": 9.735118294313045e-06, "loss": 0.0466, "step": 1210 }, { "epoch": 0.5816522574447647, "grad_norm": 0.34262000331545467, "learning_rate": 9.734220058046582e-06, "loss": 0.0356, "step": 1211 }, { "epoch": 0.5821325648414986, "grad_norm": 0.5754600784602188, "learning_rate": 9.733320342940747e-06, "loss": 0.0543, "step": 1212 }, { "epoch": 0.5826128722382324, "grad_norm": 0.5170774377794336, "learning_rate": 9.732419149276586e-06, "loss": 0.0558, "step": 1213 }, { "epoch": 0.5830931796349664, "grad_norm": 0.5109210528826162, "learning_rate": 9.731516477335607e-06, "loss": 0.062, "step": 1214 }, { "epoch": 0.5835734870317003, "grad_norm": 0.4985819768058242, "learning_rate": 9.73061232739978e-06, "loss": 0.0529, "step": 1215 }, { "epoch": 0.5840537944284342, "grad_norm": 0.5498108762960605, "learning_rate": 9.729706699751535e-06, "loss": 0.0517, "step": 1216 }, { "epoch": 0.5845341018251681, "grad_norm": 0.7920509732214022, "learning_rate": 9.728799594673766e-06, "loss": 0.0478, "step": 1217 }, { "epoch": 0.5850144092219021, "grad_norm": 0.5652672810126951, "learning_rate": 9.727891012449827e-06, "loss": 0.0525, "step": 1218 }, { "epoch": 0.585494716618636, "grad_norm": 0.5691398549888063, "learning_rate": 9.726980953363536e-06, "loss": 0.0496, "step": 1219 }, { "epoch": 0.5859750240153698, "grad_norm": 0.5253745870416723, "learning_rate": 9.726069417699167e-06, "loss": 0.0483, "step": 1220 }, { "epoch": 0.5864553314121037, "grad_norm": 0.6773929868480058, "learning_rate": 9.725156405741461e-06, "loss": 0.0455, "step": 1221 }, { "epoch": 0.5869356388088377, "grad_norm": 0.8365763274612497, "learning_rate": 9.724241917775616e-06, "loss": 0.0619, "step": 1222 }, { "epoch": 0.5874159462055716, "grad_norm": 0.6630891954815319, "learning_rate": 9.723325954087294e-06, "loss": 0.064, "step": 1223 }, { "epoch": 0.5878962536023055, "grad_norm": 0.34729186105181487, "learning_rate": 9.722408514962619e-06, "loss": 0.0353, "step": 1224 }, { "epoch": 0.5883765609990393, "grad_norm": 0.44671842700761955, "learning_rate": 9.721489600688168e-06, "loss": 0.0442, "step": 1225 }, { "epoch": 0.5888568683957733, "grad_norm": 0.44380993259781293, "learning_rate": 9.720569211550988e-06, "loss": 0.0401, "step": 1226 }, { "epoch": 0.5893371757925072, "grad_norm": 0.7426688381698872, "learning_rate": 9.719647347838584e-06, "loss": 0.0536, "step": 1227 }, { "epoch": 0.5898174831892411, "grad_norm": 0.6909672290678518, "learning_rate": 9.718724009838917e-06, "loss": 0.0488, "step": 1228 }, { "epoch": 0.590297790585975, "grad_norm": 0.5801124250465916, "learning_rate": 9.717799197840416e-06, "loss": 0.0559, "step": 1229 }, { "epoch": 0.590778097982709, "grad_norm": 0.38585748356811145, "learning_rate": 9.716872912131964e-06, "loss": 0.0387, "step": 1230 }, { "epoch": 0.5912584053794429, "grad_norm": 0.5037471388123007, "learning_rate": 9.715945153002908e-06, "loss": 0.0478, "step": 1231 }, { "epoch": 0.5917387127761767, "grad_norm": 0.42301309786103813, "learning_rate": 9.715015920743056e-06, "loss": 0.0441, "step": 1232 }, { "epoch": 0.5922190201729106, "grad_norm": 0.5895965832076508, "learning_rate": 9.714085215642672e-06, "loss": 0.0442, "step": 1233 }, { "epoch": 0.5926993275696446, "grad_norm": 0.6111729772870694, "learning_rate": 9.713153037992484e-06, "loss": 0.0494, "step": 1234 }, { "epoch": 0.5931796349663785, "grad_norm": 0.6116675844026525, "learning_rate": 9.712219388083676e-06, "loss": 0.0578, "step": 1235 }, { "epoch": 0.5936599423631124, "grad_norm": 0.46491825617527227, "learning_rate": 9.711284266207899e-06, "loss": 0.0564, "step": 1236 }, { "epoch": 0.5941402497598463, "grad_norm": 0.952450984803784, "learning_rate": 9.710347672657254e-06, "loss": 0.046, "step": 1237 }, { "epoch": 0.5946205571565802, "grad_norm": 0.6313323004545888, "learning_rate": 9.70940960772431e-06, "loss": 0.0452, "step": 1238 }, { "epoch": 0.5951008645533141, "grad_norm": 0.5509216833295507, "learning_rate": 9.708470071702094e-06, "loss": 0.0517, "step": 1239 }, { "epoch": 0.595581171950048, "grad_norm": 1.043696226103975, "learning_rate": 9.707529064884087e-06, "loss": 0.0586, "step": 1240 }, { "epoch": 0.5960614793467819, "grad_norm": 0.5486854474154073, "learning_rate": 9.706586587564236e-06, "loss": 0.057, "step": 1241 }, { "epoch": 0.5965417867435159, "grad_norm": 0.633733102374295, "learning_rate": 9.705642640036945e-06, "loss": 0.0548, "step": 1242 }, { "epoch": 0.5970220941402498, "grad_norm": 0.4966097609578802, "learning_rate": 9.704697222597074e-06, "loss": 0.0428, "step": 1243 }, { "epoch": 0.5975024015369836, "grad_norm": 0.5824655077330811, "learning_rate": 9.703750335539952e-06, "loss": 0.0565, "step": 1244 }, { "epoch": 0.5979827089337176, "grad_norm": 0.563546323378018, "learning_rate": 9.702801979161353e-06, "loss": 0.0543, "step": 1245 }, { "epoch": 0.5984630163304515, "grad_norm": 0.5563049946399633, "learning_rate": 9.70185215375752e-06, "loss": 0.049, "step": 1246 }, { "epoch": 0.5989433237271854, "grad_norm": 0.3744869072214263, "learning_rate": 9.700900859625155e-06, "loss": 0.032, "step": 1247 }, { "epoch": 0.5994236311239193, "grad_norm": 0.5977503365912346, "learning_rate": 9.699948097061412e-06, "loss": 0.0598, "step": 1248 }, { "epoch": 0.5999039385206533, "grad_norm": 0.7030336814456123, "learning_rate": 9.69899386636391e-06, "loss": 0.0446, "step": 1249 }, { "epoch": 0.6003842459173871, "grad_norm": 0.4669080970568829, "learning_rate": 9.698038167830722e-06, "loss": 0.0515, "step": 1250 }, { "epoch": 0.600864553314121, "grad_norm": 0.49830925851066343, "learning_rate": 9.697081001760384e-06, "loss": 0.0576, "step": 1251 }, { "epoch": 0.6013448607108549, "grad_norm": 0.6686926230527453, "learning_rate": 9.696122368451887e-06, "loss": 0.0399, "step": 1252 }, { "epoch": 0.6018251681075889, "grad_norm": 0.6340142308992163, "learning_rate": 9.695162268204681e-06, "loss": 0.0792, "step": 1253 }, { "epoch": 0.6023054755043228, "grad_norm": 0.532128653279977, "learning_rate": 9.694200701318679e-06, "loss": 0.041, "step": 1254 }, { "epoch": 0.6027857829010567, "grad_norm": 0.5065554349402721, "learning_rate": 9.693237668094242e-06, "loss": 0.0466, "step": 1255 }, { "epoch": 0.6032660902977905, "grad_norm": 0.5153614448612733, "learning_rate": 9.692273168832198e-06, "loss": 0.054, "step": 1256 }, { "epoch": 0.6037463976945245, "grad_norm": 0.5353390841759837, "learning_rate": 9.69130720383383e-06, "loss": 0.0374, "step": 1257 }, { "epoch": 0.6042267050912584, "grad_norm": 0.639129347628516, "learning_rate": 9.690339773400876e-06, "loss": 0.0432, "step": 1258 }, { "epoch": 0.6047070124879923, "grad_norm": 0.58928331069562, "learning_rate": 9.689370877835538e-06, "loss": 0.0357, "step": 1259 }, { "epoch": 0.6051873198847262, "grad_norm": 0.5336886529718406, "learning_rate": 9.688400517440471e-06, "loss": 0.056, "step": 1260 }, { "epoch": 0.6056676272814602, "grad_norm": 0.5656089550860497, "learning_rate": 9.687428692518789e-06, "loss": 0.0539, "step": 1261 }, { "epoch": 0.6061479346781941, "grad_norm": 0.4162011114169869, "learning_rate": 9.686455403374062e-06, "loss": 0.0354, "step": 1262 }, { "epoch": 0.6066282420749279, "grad_norm": 0.6906145746227967, "learning_rate": 9.685480650310319e-06, "loss": 0.0526, "step": 1263 }, { "epoch": 0.6071085494716618, "grad_norm": 1.028325898856233, "learning_rate": 9.684504433632049e-06, "loss": 0.0588, "step": 1264 }, { "epoch": 0.6075888568683958, "grad_norm": 0.5167645947728937, "learning_rate": 9.68352675364419e-06, "loss": 0.0354, "step": 1265 }, { "epoch": 0.6080691642651297, "grad_norm": 0.6196435075019047, "learning_rate": 9.682547610652145e-06, "loss": 0.05, "step": 1266 }, { "epoch": 0.6085494716618636, "grad_norm": 0.47414855740871636, "learning_rate": 9.681567004961769e-06, "loss": 0.0495, "step": 1267 }, { "epoch": 0.6090297790585975, "grad_norm": 0.4592587792291843, "learning_rate": 9.68058493687938e-06, "loss": 0.0386, "step": 1268 }, { "epoch": 0.6095100864553314, "grad_norm": 0.46426133154926164, "learning_rate": 9.679601406711746e-06, "loss": 0.0631, "step": 1269 }, { "epoch": 0.6099903938520653, "grad_norm": 0.4772199148883563, "learning_rate": 9.678616414766096e-06, "loss": 0.056, "step": 1270 }, { "epoch": 0.6104707012487992, "grad_norm": 0.7108506602194041, "learning_rate": 9.677629961350113e-06, "loss": 0.0658, "step": 1271 }, { "epoch": 0.6109510086455331, "grad_norm": 0.6231876402263944, "learning_rate": 9.676642046771938e-06, "loss": 0.0433, "step": 1272 }, { "epoch": 0.6114313160422671, "grad_norm": 0.47557579581945064, "learning_rate": 9.675652671340169e-06, "loss": 0.04, "step": 1273 }, { "epoch": 0.611911623439001, "grad_norm": 0.7107171113991735, "learning_rate": 9.67466183536386e-06, "loss": 0.0628, "step": 1274 }, { "epoch": 0.6123919308357348, "grad_norm": 0.9476315212332672, "learning_rate": 9.673669539152518e-06, "loss": 0.0443, "step": 1275 }, { "epoch": 0.6128722382324687, "grad_norm": 0.7168398810301742, "learning_rate": 9.67267578301611e-06, "loss": 0.0744, "step": 1276 }, { "epoch": 0.6133525456292027, "grad_norm": 0.5729632454572932, "learning_rate": 9.67168056726506e-06, "loss": 0.0514, "step": 1277 }, { "epoch": 0.6138328530259366, "grad_norm": 0.9643183924220915, "learning_rate": 9.670683892210245e-06, "loss": 0.0622, "step": 1278 }, { "epoch": 0.6143131604226705, "grad_norm": 0.6683536482508268, "learning_rate": 9.669685758162996e-06, "loss": 0.0764, "step": 1279 }, { "epoch": 0.6147934678194045, "grad_norm": 0.4503574435017507, "learning_rate": 9.668686165435106e-06, "loss": 0.0494, "step": 1280 }, { "epoch": 0.6152737752161384, "grad_norm": 0.4920339006116487, "learning_rate": 9.667685114338819e-06, "loss": 0.0478, "step": 1281 }, { "epoch": 0.6157540826128722, "grad_norm": 0.5067811483748804, "learning_rate": 9.666682605186834e-06, "loss": 0.0502, "step": 1282 }, { "epoch": 0.6162343900096061, "grad_norm": 0.4640157427837885, "learning_rate": 9.66567863829231e-06, "loss": 0.0317, "step": 1283 }, { "epoch": 0.6167146974063401, "grad_norm": 0.7574081208925442, "learning_rate": 9.664673213968856e-06, "loss": 0.0478, "step": 1284 }, { "epoch": 0.617195004803074, "grad_norm": 0.6090301906896739, "learning_rate": 9.663666332530541e-06, "loss": 0.0532, "step": 1285 }, { "epoch": 0.6176753121998079, "grad_norm": 0.5041461690169365, "learning_rate": 9.662657994291884e-06, "loss": 0.059, "step": 1286 }, { "epoch": 0.6181556195965417, "grad_norm": 0.3750500141302966, "learning_rate": 9.661648199567866e-06, "loss": 0.0367, "step": 1287 }, { "epoch": 0.6186359269932757, "grad_norm": 0.5062593535367501, "learning_rate": 9.660636948673913e-06, "loss": 0.0353, "step": 1288 }, { "epoch": 0.6191162343900096, "grad_norm": 0.639235731952125, "learning_rate": 9.659624241925917e-06, "loss": 0.0596, "step": 1289 }, { "epoch": 0.6195965417867435, "grad_norm": 0.5049024822490057, "learning_rate": 9.65861007964022e-06, "loss": 0.0447, "step": 1290 }, { "epoch": 0.6200768491834774, "grad_norm": 0.4296613053659539, "learning_rate": 9.657594462133614e-06, "loss": 0.044, "step": 1291 }, { "epoch": 0.6205571565802114, "grad_norm": 0.5138878968910088, "learning_rate": 9.656577389723353e-06, "loss": 0.0493, "step": 1292 }, { "epoch": 0.6210374639769453, "grad_norm": 0.46036782082872985, "learning_rate": 9.655558862727141e-06, "loss": 0.037, "step": 1293 }, { "epoch": 0.6215177713736791, "grad_norm": 0.4704621930317916, "learning_rate": 9.654538881463139e-06, "loss": 0.048, "step": 1294 }, { "epoch": 0.621998078770413, "grad_norm": 0.48737100335382305, "learning_rate": 9.653517446249955e-06, "loss": 0.0396, "step": 1295 }, { "epoch": 0.622478386167147, "grad_norm": 0.49092045898074815, "learning_rate": 9.652494557406666e-06, "loss": 0.0456, "step": 1296 }, { "epoch": 0.6229586935638809, "grad_norm": 0.5835467675400211, "learning_rate": 9.65147021525279e-06, "loss": 0.0378, "step": 1297 }, { "epoch": 0.6234390009606148, "grad_norm": 0.42343488555568504, "learning_rate": 9.650444420108303e-06, "loss": 0.0378, "step": 1298 }, { "epoch": 0.6239193083573487, "grad_norm": 0.49168464423068825, "learning_rate": 9.649417172293636e-06, "loss": 0.0522, "step": 1299 }, { "epoch": 0.6243996157540826, "grad_norm": 0.5154918146605992, "learning_rate": 9.648388472129671e-06, "loss": 0.044, "step": 1300 }, { "epoch": 0.6248799231508165, "grad_norm": 0.3503174379219431, "learning_rate": 9.647358319937746e-06, "loss": 0.0347, "step": 1301 }, { "epoch": 0.6253602305475504, "grad_norm": 0.4137072426795727, "learning_rate": 9.646326716039653e-06, "loss": 0.0296, "step": 1302 }, { "epoch": 0.6258405379442843, "grad_norm": 0.5333253731829095, "learning_rate": 9.645293660757637e-06, "loss": 0.0504, "step": 1303 }, { "epoch": 0.6263208453410183, "grad_norm": 0.45225710559548166, "learning_rate": 9.644259154414396e-06, "loss": 0.0433, "step": 1304 }, { "epoch": 0.6268011527377522, "grad_norm": 0.5129776997920263, "learning_rate": 9.643223197333078e-06, "loss": 0.0555, "step": 1305 }, { "epoch": 0.627281460134486, "grad_norm": 0.6635014880318021, "learning_rate": 9.64218578983729e-06, "loss": 0.0663, "step": 1306 }, { "epoch": 0.6277617675312199, "grad_norm": 0.40151280112857785, "learning_rate": 9.641146932251088e-06, "loss": 0.0449, "step": 1307 }, { "epoch": 0.6282420749279539, "grad_norm": 0.4750922210262119, "learning_rate": 9.640106624898985e-06, "loss": 0.0458, "step": 1308 }, { "epoch": 0.6287223823246878, "grad_norm": 0.534233777229496, "learning_rate": 9.639064868105943e-06, "loss": 0.0522, "step": 1309 }, { "epoch": 0.6292026897214217, "grad_norm": 0.48289449635982123, "learning_rate": 9.638021662197376e-06, "loss": 0.0373, "step": 1310 }, { "epoch": 0.6296829971181557, "grad_norm": 0.5462852993159404, "learning_rate": 9.636977007499153e-06, "loss": 0.0742, "step": 1311 }, { "epoch": 0.6301633045148896, "grad_norm": 0.5378449863960542, "learning_rate": 9.6359309043376e-06, "loss": 0.0493, "step": 1312 }, { "epoch": 0.6306436119116234, "grad_norm": 0.7788750925559146, "learning_rate": 9.634883353039484e-06, "loss": 0.0855, "step": 1313 }, { "epoch": 0.6311239193083573, "grad_norm": 0.4117391703483272, "learning_rate": 9.633834353932035e-06, "loss": 0.0424, "step": 1314 }, { "epoch": 0.6316042267050913, "grad_norm": 0.7547438553021464, "learning_rate": 9.63278390734293e-06, "loss": 0.0534, "step": 1315 }, { "epoch": 0.6320845341018252, "grad_norm": 0.6007646020799848, "learning_rate": 9.631732013600302e-06, "loss": 0.0403, "step": 1316 }, { "epoch": 0.6325648414985591, "grad_norm": 0.589813047694365, "learning_rate": 9.63067867303273e-06, "loss": 0.0536, "step": 1317 }, { "epoch": 0.633045148895293, "grad_norm": 0.5824782124675916, "learning_rate": 9.62962388596925e-06, "loss": 0.0588, "step": 1318 }, { "epoch": 0.633525456292027, "grad_norm": 0.5558162647237488, "learning_rate": 9.628567652739348e-06, "loss": 0.0459, "step": 1319 }, { "epoch": 0.6340057636887608, "grad_norm": 0.43466194972289696, "learning_rate": 9.627509973672962e-06, "loss": 0.0504, "step": 1320 }, { "epoch": 0.6344860710854947, "grad_norm": 0.5311930203126808, "learning_rate": 9.626450849100483e-06, "loss": 0.0493, "step": 1321 }, { "epoch": 0.6349663784822286, "grad_norm": 0.4875635622342151, "learning_rate": 9.62539027935275e-06, "loss": 0.0502, "step": 1322 }, { "epoch": 0.6354466858789626, "grad_norm": 0.7400810886912117, "learning_rate": 9.624328264761056e-06, "loss": 0.0834, "step": 1323 }, { "epoch": 0.6359269932756965, "grad_norm": 0.48011574918186833, "learning_rate": 9.623264805657146e-06, "loss": 0.0461, "step": 1324 }, { "epoch": 0.6364073006724303, "grad_norm": 0.36881464066476766, "learning_rate": 9.622199902373218e-06, "loss": 0.0344, "step": 1325 }, { "epoch": 0.6368876080691642, "grad_norm": 0.4130880239031028, "learning_rate": 9.621133555241912e-06, "loss": 0.0326, "step": 1326 }, { "epoch": 0.6373679154658982, "grad_norm": 0.4479060787169812, "learning_rate": 9.620065764596328e-06, "loss": 0.0393, "step": 1327 }, { "epoch": 0.6378482228626321, "grad_norm": 0.5942961946492873, "learning_rate": 9.618996530770018e-06, "loss": 0.0432, "step": 1328 }, { "epoch": 0.638328530259366, "grad_norm": 0.5638567374914406, "learning_rate": 9.617925854096975e-06, "loss": 0.0501, "step": 1329 }, { "epoch": 0.6388088376560999, "grad_norm": 0.42880488417099755, "learning_rate": 9.616853734911653e-06, "loss": 0.0374, "step": 1330 }, { "epoch": 0.6392891450528339, "grad_norm": 0.5463743835118613, "learning_rate": 9.615780173548952e-06, "loss": 0.0516, "step": 1331 }, { "epoch": 0.6397694524495677, "grad_norm": 0.691161000342225, "learning_rate": 9.614705170344221e-06, "loss": 0.0848, "step": 1332 }, { "epoch": 0.6402497598463016, "grad_norm": 0.6156410304491652, "learning_rate": 9.613628725633262e-06, "loss": 0.056, "step": 1333 }, { "epoch": 0.6407300672430355, "grad_norm": 0.45322396316185093, "learning_rate": 9.612550839752326e-06, "loss": 0.044, "step": 1334 }, { "epoch": 0.6412103746397695, "grad_norm": 0.6507298412844991, "learning_rate": 9.611471513038115e-06, "loss": 0.0687, "step": 1335 }, { "epoch": 0.6416906820365034, "grad_norm": 0.6325291411421659, "learning_rate": 9.610390745827783e-06, "loss": 0.0608, "step": 1336 }, { "epoch": 0.6421709894332372, "grad_norm": 0.60388801983135, "learning_rate": 9.60930853845893e-06, "loss": 0.0402, "step": 1337 }, { "epoch": 0.6426512968299711, "grad_norm": 0.4995528775460853, "learning_rate": 9.608224891269607e-06, "loss": 0.0381, "step": 1338 }, { "epoch": 0.6431316042267051, "grad_norm": 0.3700705272643591, "learning_rate": 9.607139804598316e-06, "loss": 0.0349, "step": 1339 }, { "epoch": 0.643611911623439, "grad_norm": 0.4362560597879333, "learning_rate": 9.606053278784009e-06, "loss": 0.0403, "step": 1340 }, { "epoch": 0.6440922190201729, "grad_norm": 0.54879971648486, "learning_rate": 9.604965314166085e-06, "loss": 0.0541, "step": 1341 }, { "epoch": 0.6445725264169068, "grad_norm": 0.4929293793701462, "learning_rate": 9.603875911084394e-06, "loss": 0.0365, "step": 1342 }, { "epoch": 0.6450528338136408, "grad_norm": 0.5628523668339881, "learning_rate": 9.602785069879239e-06, "loss": 0.0598, "step": 1343 }, { "epoch": 0.6455331412103746, "grad_norm": 0.467411924570006, "learning_rate": 9.601692790891363e-06, "loss": 0.0521, "step": 1344 }, { "epoch": 0.6460134486071085, "grad_norm": 0.5210675000514947, "learning_rate": 9.600599074461967e-06, "loss": 0.0637, "step": 1345 }, { "epoch": 0.6464937560038425, "grad_norm": 0.6153647716384278, "learning_rate": 9.599503920932698e-06, "loss": 0.0479, "step": 1346 }, { "epoch": 0.6469740634005764, "grad_norm": 0.5616593537777448, "learning_rate": 9.59840733064565e-06, "loss": 0.0364, "step": 1347 }, { "epoch": 0.6474543707973103, "grad_norm": 0.3344274743801641, "learning_rate": 9.59730930394337e-06, "loss": 0.0314, "step": 1348 }, { "epoch": 0.6479346781940442, "grad_norm": 0.6754093193995799, "learning_rate": 9.59620984116885e-06, "loss": 0.0438, "step": 1349 }, { "epoch": 0.6484149855907781, "grad_norm": 0.8533549419575339, "learning_rate": 9.595108942665528e-06, "loss": 0.0513, "step": 1350 }, { "epoch": 0.648895292987512, "grad_norm": 0.3620854442522442, "learning_rate": 9.5940066087773e-06, "loss": 0.0423, "step": 1351 }, { "epoch": 0.6493756003842459, "grad_norm": 0.5893844408390839, "learning_rate": 9.592902839848502e-06, "loss": 0.0377, "step": 1352 }, { "epoch": 0.6498559077809798, "grad_norm": 0.4092201290840598, "learning_rate": 9.591797636223921e-06, "loss": 0.0411, "step": 1353 }, { "epoch": 0.6503362151777138, "grad_norm": 0.47322297966868715, "learning_rate": 9.590690998248791e-06, "loss": 0.043, "step": 1354 }, { "epoch": 0.6508165225744477, "grad_norm": 0.4636748975157291, "learning_rate": 9.589582926268798e-06, "loss": 0.0429, "step": 1355 }, { "epoch": 0.6512968299711815, "grad_norm": 0.4349692458727795, "learning_rate": 9.588473420630071e-06, "loss": 0.0444, "step": 1356 }, { "epoch": 0.6517771373679154, "grad_norm": 0.5677476836765938, "learning_rate": 9.587362481679187e-06, "loss": 0.0562, "step": 1357 }, { "epoch": 0.6522574447646494, "grad_norm": 0.4371170679582839, "learning_rate": 9.586250109763176e-06, "loss": 0.0371, "step": 1358 }, { "epoch": 0.6527377521613833, "grad_norm": 0.6019100000609227, "learning_rate": 9.585136305229513e-06, "loss": 0.0488, "step": 1359 }, { "epoch": 0.6532180595581172, "grad_norm": 0.38363888520290607, "learning_rate": 9.584021068426114e-06, "loss": 0.0338, "step": 1360 }, { "epoch": 0.6536983669548511, "grad_norm": 0.4993078534835986, "learning_rate": 9.582904399701353e-06, "loss": 0.0431, "step": 1361 }, { "epoch": 0.654178674351585, "grad_norm": 0.5671412832634853, "learning_rate": 9.581786299404046e-06, "loss": 0.0561, "step": 1362 }, { "epoch": 0.6546589817483189, "grad_norm": 0.7306005927500124, "learning_rate": 9.580666767883456e-06, "loss": 0.042, "step": 1363 }, { "epoch": 0.6551392891450528, "grad_norm": 0.5032575865518208, "learning_rate": 9.579545805489292e-06, "loss": 0.0518, "step": 1364 }, { "epoch": 0.6556195965417867, "grad_norm": 0.6343768547939731, "learning_rate": 9.578423412571713e-06, "loss": 0.055, "step": 1365 }, { "epoch": 0.6560999039385207, "grad_norm": 0.6313252829889849, "learning_rate": 9.577299589481325e-06, "loss": 0.0499, "step": 1366 }, { "epoch": 0.6565802113352546, "grad_norm": 0.5874538450840524, "learning_rate": 9.576174336569177e-06, "loss": 0.0546, "step": 1367 }, { "epoch": 0.6570605187319885, "grad_norm": 0.4208422595696595, "learning_rate": 9.575047654186768e-06, "loss": 0.0427, "step": 1368 }, { "epoch": 0.6575408261287223, "grad_norm": 0.6079831399082672, "learning_rate": 9.57391954268604e-06, "loss": 0.051, "step": 1369 }, { "epoch": 0.6580211335254563, "grad_norm": 0.5767257558258905, "learning_rate": 9.57279000241939e-06, "loss": 0.0402, "step": 1370 }, { "epoch": 0.6585014409221902, "grad_norm": 0.4733503070820277, "learning_rate": 9.571659033739648e-06, "loss": 0.048, "step": 1371 }, { "epoch": 0.6589817483189241, "grad_norm": 0.4524785331372919, "learning_rate": 9.570526637000102e-06, "loss": 0.04, "step": 1372 }, { "epoch": 0.659462055715658, "grad_norm": 0.6668276001778005, "learning_rate": 9.56939281255448e-06, "loss": 0.0498, "step": 1373 }, { "epoch": 0.659942363112392, "grad_norm": 0.49259152852847804, "learning_rate": 9.568257560756955e-06, "loss": 0.0386, "step": 1374 }, { "epoch": 0.6604226705091258, "grad_norm": 0.8583845386777227, "learning_rate": 9.567120881962152e-06, "loss": 0.0498, "step": 1375 }, { "epoch": 0.6609029779058597, "grad_norm": 0.533492536983965, "learning_rate": 9.565982776525136e-06, "loss": 0.0472, "step": 1376 }, { "epoch": 0.6613832853025937, "grad_norm": 0.6027948215201636, "learning_rate": 9.56484324480142e-06, "loss": 0.047, "step": 1377 }, { "epoch": 0.6618635926993276, "grad_norm": 0.3382733587601467, "learning_rate": 9.563702287146963e-06, "loss": 0.0423, "step": 1378 }, { "epoch": 0.6623439000960615, "grad_norm": 0.40078317285361365, "learning_rate": 9.562559903918167e-06, "loss": 0.0437, "step": 1379 }, { "epoch": 0.6628242074927954, "grad_norm": 0.4152293792382375, "learning_rate": 9.561416095471882e-06, "loss": 0.0487, "step": 1380 }, { "epoch": 0.6633045148895294, "grad_norm": 0.37525899009119723, "learning_rate": 9.560270862165401e-06, "loss": 0.0445, "step": 1381 }, { "epoch": 0.6637848222862632, "grad_norm": 0.4239007996552649, "learning_rate": 9.559124204356465e-06, "loss": 0.0365, "step": 1382 }, { "epoch": 0.6642651296829971, "grad_norm": 0.44856667071995515, "learning_rate": 9.557976122403259e-06, "loss": 0.0432, "step": 1383 }, { "epoch": 0.664745437079731, "grad_norm": 0.40105877291250286, "learning_rate": 9.556826616664408e-06, "loss": 0.0375, "step": 1384 }, { "epoch": 0.665225744476465, "grad_norm": 0.6413226916161751, "learning_rate": 9.555675687498988e-06, "loss": 0.0491, "step": 1385 }, { "epoch": 0.6657060518731989, "grad_norm": 0.4994127702541754, "learning_rate": 9.554523335266519e-06, "loss": 0.0362, "step": 1386 }, { "epoch": 0.6661863592699327, "grad_norm": 0.43217572367178236, "learning_rate": 9.553369560326961e-06, "loss": 0.041, "step": 1387 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5576149940821006, "learning_rate": 9.552214363040725e-06, "loss": 0.0609, "step": 1388 }, { "epoch": 0.6671469740634006, "grad_norm": 0.4612462479554566, "learning_rate": 9.551057743768658e-06, "loss": 0.0326, "step": 1389 }, { "epoch": 0.6676272814601345, "grad_norm": 0.7707155411339915, "learning_rate": 9.54989970287206e-06, "loss": 0.0515, "step": 1390 }, { "epoch": 0.6681075888568684, "grad_norm": 0.6183305732020524, "learning_rate": 9.54874024071267e-06, "loss": 0.0573, "step": 1391 }, { "epoch": 0.6685878962536023, "grad_norm": 0.7355014675302118, "learning_rate": 9.54757935765267e-06, "loss": 0.0509, "step": 1392 }, { "epoch": 0.6690682036503363, "grad_norm": 0.41942626169957864, "learning_rate": 9.54641705405469e-06, "loss": 0.0346, "step": 1393 }, { "epoch": 0.6695485110470701, "grad_norm": 0.46998738583164973, "learning_rate": 9.5452533302818e-06, "loss": 0.0441, "step": 1394 }, { "epoch": 0.670028818443804, "grad_norm": 0.8255715566434525, "learning_rate": 9.544088186697515e-06, "loss": 0.0762, "step": 1395 }, { "epoch": 0.6705091258405379, "grad_norm": 0.742976741485616, "learning_rate": 9.542921623665796e-06, "loss": 0.056, "step": 1396 }, { "epoch": 0.6709894332372719, "grad_norm": 0.7975592021603152, "learning_rate": 9.541753641551042e-06, "loss": 0.0618, "step": 1397 }, { "epoch": 0.6714697406340058, "grad_norm": 0.7133686245791877, "learning_rate": 9.540584240718098e-06, "loss": 0.0359, "step": 1398 }, { "epoch": 0.6719500480307397, "grad_norm": 0.6417332791114704, "learning_rate": 9.539413421532256e-06, "loss": 0.067, "step": 1399 }, { "epoch": 0.6724303554274735, "grad_norm": 0.6512453356740282, "learning_rate": 9.538241184359245e-06, "loss": 0.057, "step": 1400 }, { "epoch": 0.6729106628242075, "grad_norm": 0.6011163829953513, "learning_rate": 9.537067529565241e-06, "loss": 0.0433, "step": 1401 }, { "epoch": 0.6733909702209414, "grad_norm": 0.5605867258554449, "learning_rate": 9.535892457516858e-06, "loss": 0.07, "step": 1402 }, { "epoch": 0.6738712776176753, "grad_norm": 0.44843028713428207, "learning_rate": 9.53471596858116e-06, "loss": 0.0346, "step": 1403 }, { "epoch": 0.6743515850144092, "grad_norm": 0.45306372964448993, "learning_rate": 9.533538063125649e-06, "loss": 0.0395, "step": 1404 }, { "epoch": 0.6748318924111432, "grad_norm": 0.665610180113429, "learning_rate": 9.532358741518265e-06, "loss": 0.0727, "step": 1405 }, { "epoch": 0.675312199807877, "grad_norm": 0.40211030378348633, "learning_rate": 9.531178004127404e-06, "loss": 0.0415, "step": 1406 }, { "epoch": 0.6757925072046109, "grad_norm": 0.4867082233781477, "learning_rate": 9.52999585132189e-06, "loss": 0.0413, "step": 1407 }, { "epoch": 0.6762728146013448, "grad_norm": 0.7741469232795118, "learning_rate": 9.528812283470994e-06, "loss": 0.0717, "step": 1408 }, { "epoch": 0.6767531219980788, "grad_norm": 0.42760299424580006, "learning_rate": 9.527627300944434e-06, "loss": 0.0378, "step": 1409 }, { "epoch": 0.6772334293948127, "grad_norm": 0.6198122845932187, "learning_rate": 9.52644090411236e-06, "loss": 0.0644, "step": 1410 }, { "epoch": 0.6777137367915466, "grad_norm": 0.7010269392103107, "learning_rate": 9.525253093345377e-06, "loss": 0.0742, "step": 1411 }, { "epoch": 0.6781940441882806, "grad_norm": 0.46098176739925134, "learning_rate": 9.524063869014517e-06, "loss": 0.0539, "step": 1412 }, { "epoch": 0.6786743515850144, "grad_norm": 0.45626576421316317, "learning_rate": 9.522873231491268e-06, "loss": 0.0459, "step": 1413 }, { "epoch": 0.6791546589817483, "grad_norm": 0.5235232885723231, "learning_rate": 9.521681181147544e-06, "loss": 0.0354, "step": 1414 }, { "epoch": 0.6796349663784822, "grad_norm": 0.4083701335457131, "learning_rate": 9.520487718355716e-06, "loss": 0.0376, "step": 1415 }, { "epoch": 0.6801152737752162, "grad_norm": 0.4519589464495528, "learning_rate": 9.519292843488584e-06, "loss": 0.0435, "step": 1416 }, { "epoch": 0.6805955811719501, "grad_norm": 0.5270239622507114, "learning_rate": 9.518096556919396e-06, "loss": 0.0385, "step": 1417 }, { "epoch": 0.681075888568684, "grad_norm": 0.49655054870938087, "learning_rate": 9.51689885902184e-06, "loss": 0.0418, "step": 1418 }, { "epoch": 0.6815561959654178, "grad_norm": 0.48390295607618405, "learning_rate": 9.51569975017004e-06, "loss": 0.0473, "step": 1419 }, { "epoch": 0.6820365033621518, "grad_norm": 0.4134572167278166, "learning_rate": 9.514499230738567e-06, "loss": 0.0356, "step": 1420 }, { "epoch": 0.6825168107588857, "grad_norm": 0.42968236699716256, "learning_rate": 9.51329730110243e-06, "loss": 0.0396, "step": 1421 }, { "epoch": 0.6829971181556196, "grad_norm": 0.507732504462261, "learning_rate": 9.512093961637077e-06, "loss": 0.044, "step": 1422 }, { "epoch": 0.6834774255523535, "grad_norm": 0.513408290011866, "learning_rate": 9.510889212718398e-06, "loss": 0.0539, "step": 1423 }, { "epoch": 0.6839577329490875, "grad_norm": 0.5815081646240737, "learning_rate": 9.509683054722726e-06, "loss": 0.0398, "step": 1424 }, { "epoch": 0.6844380403458213, "grad_norm": 0.4078417115979057, "learning_rate": 9.508475488026829e-06, "loss": 0.0332, "step": 1425 }, { "epoch": 0.6849183477425552, "grad_norm": 1.0586820403034098, "learning_rate": 9.507266513007918e-06, "loss": 0.0763, "step": 1426 }, { "epoch": 0.6853986551392891, "grad_norm": 0.564383510484416, "learning_rate": 9.506056130043644e-06, "loss": 0.0434, "step": 1427 }, { "epoch": 0.6858789625360231, "grad_norm": 0.3768335964210522, "learning_rate": 9.504844339512096e-06, "loss": 0.0326, "step": 1428 }, { "epoch": 0.686359269932757, "grad_norm": 0.5437219091200941, "learning_rate": 9.503631141791804e-06, "loss": 0.0551, "step": 1429 }, { "epoch": 0.6868395773294909, "grad_norm": 0.39882576370499795, "learning_rate": 9.502416537261739e-06, "loss": 0.0394, "step": 1430 }, { "epoch": 0.6873198847262247, "grad_norm": 0.5298522285045902, "learning_rate": 9.50120052630131e-06, "loss": 0.0424, "step": 1431 }, { "epoch": 0.6878001921229587, "grad_norm": 0.7335489985769658, "learning_rate": 9.499983109290361e-06, "loss": 0.0695, "step": 1432 }, { "epoch": 0.6882804995196926, "grad_norm": 0.7767384589812387, "learning_rate": 9.498764286609183e-06, "loss": 0.0684, "step": 1433 }, { "epoch": 0.6887608069164265, "grad_norm": 0.4485179955778425, "learning_rate": 9.4975440586385e-06, "loss": 0.0545, "step": 1434 }, { "epoch": 0.6892411143131604, "grad_norm": 0.7644156832335937, "learning_rate": 9.496322425759484e-06, "loss": 0.056, "step": 1435 }, { "epoch": 0.6897214217098944, "grad_norm": 0.6682437861904863, "learning_rate": 9.495099388353731e-06, "loss": 0.0413, "step": 1436 }, { "epoch": 0.6902017291066282, "grad_norm": 0.2782387488882061, "learning_rate": 9.493874946803287e-06, "loss": 0.0307, "step": 1437 }, { "epoch": 0.6906820365033621, "grad_norm": 0.7196277741729624, "learning_rate": 9.492649101490636e-06, "loss": 0.0605, "step": 1438 }, { "epoch": 0.691162343900096, "grad_norm": 0.43324754881869854, "learning_rate": 9.491421852798695e-06, "loss": 0.0475, "step": 1439 }, { "epoch": 0.69164265129683, "grad_norm": 0.43738740093767214, "learning_rate": 9.490193201110825e-06, "loss": 0.0375, "step": 1440 }, { "epoch": 0.6921229586935639, "grad_norm": 0.5488539442142931, "learning_rate": 9.48896314681082e-06, "loss": 0.0728, "step": 1441 }, { "epoch": 0.6926032660902978, "grad_norm": 1.1540239102800764, "learning_rate": 9.48773169028292e-06, "loss": 0.0586, "step": 1442 }, { "epoch": 0.6930835734870316, "grad_norm": 0.6283641043852788, "learning_rate": 9.486498831911792e-06, "loss": 0.0736, "step": 1443 }, { "epoch": 0.6935638808837656, "grad_norm": 1.0108486453877055, "learning_rate": 9.485264572082551e-06, "loss": 0.0918, "step": 1444 }, { "epoch": 0.6940441882804995, "grad_norm": 0.40880665155636625, "learning_rate": 9.484028911180742e-06, "loss": 0.0389, "step": 1445 }, { "epoch": 0.6945244956772334, "grad_norm": 0.47431051486539744, "learning_rate": 9.482791849592354e-06, "loss": 0.0597, "step": 1446 }, { "epoch": 0.6950048030739674, "grad_norm": 0.8329629013113664, "learning_rate": 9.48155338770381e-06, "loss": 0.0727, "step": 1447 }, { "epoch": 0.6954851104707013, "grad_norm": 0.5434685409654171, "learning_rate": 9.480313525901973e-06, "loss": 0.0505, "step": 1448 }, { "epoch": 0.6959654178674352, "grad_norm": 0.6327014782848, "learning_rate": 9.479072264574138e-06, "loss": 0.0524, "step": 1449 }, { "epoch": 0.696445725264169, "grad_norm": 0.4768500817588462, "learning_rate": 9.477829604108044e-06, "loss": 0.0479, "step": 1450 }, { "epoch": 0.696926032660903, "grad_norm": 0.33637687220332657, "learning_rate": 9.476585544891862e-06, "loss": 0.0335, "step": 1451 }, { "epoch": 0.6974063400576369, "grad_norm": 0.6410159305875692, "learning_rate": 9.475340087314203e-06, "loss": 0.0457, "step": 1452 }, { "epoch": 0.6978866474543708, "grad_norm": 0.7561772611383961, "learning_rate": 9.47409323176411e-06, "loss": 0.0625, "step": 1453 }, { "epoch": 0.6983669548511047, "grad_norm": 0.3920818511938983, "learning_rate": 9.472844978631071e-06, "loss": 0.0282, "step": 1454 }, { "epoch": 0.6988472622478387, "grad_norm": 0.7746401992023322, "learning_rate": 9.471595328305002e-06, "loss": 0.0577, "step": 1455 }, { "epoch": 0.6993275696445725, "grad_norm": 1.1432462417277984, "learning_rate": 9.470344281176261e-06, "loss": 0.0503, "step": 1456 }, { "epoch": 0.6998078770413064, "grad_norm": 0.5579207608396545, "learning_rate": 9.469091837635641e-06, "loss": 0.0436, "step": 1457 }, { "epoch": 0.7002881844380403, "grad_norm": 0.4293775018518884, "learning_rate": 9.467837998074369e-06, "loss": 0.0426, "step": 1458 }, { "epoch": 0.7007684918347743, "grad_norm": 0.5654128796038869, "learning_rate": 9.466582762884111e-06, "loss": 0.0515, "step": 1459 }, { "epoch": 0.7012487992315082, "grad_norm": 0.6275264278723893, "learning_rate": 9.465326132456966e-06, "loss": 0.0314, "step": 1460 }, { "epoch": 0.7017291066282421, "grad_norm": 0.7608816446615059, "learning_rate": 9.464068107185476e-06, "loss": 0.0793, "step": 1461 }, { "epoch": 0.7022094140249759, "grad_norm": 0.588693970208091, "learning_rate": 9.462808687462606e-06, "loss": 0.0451, "step": 1462 }, { "epoch": 0.7026897214217099, "grad_norm": 0.6771566307850208, "learning_rate": 9.461547873681767e-06, "loss": 0.0515, "step": 1463 }, { "epoch": 0.7031700288184438, "grad_norm": 0.526932323467287, "learning_rate": 9.460285666236804e-06, "loss": 0.057, "step": 1464 }, { "epoch": 0.7036503362151777, "grad_norm": 0.6547665214657098, "learning_rate": 9.459022065521994e-06, "loss": 0.0469, "step": 1465 }, { "epoch": 0.7041306436119116, "grad_norm": 1.1306126178130151, "learning_rate": 9.45775707193205e-06, "loss": 0.0437, "step": 1466 }, { "epoch": 0.7046109510086456, "grad_norm": 0.6656984791797591, "learning_rate": 9.456490685862123e-06, "loss": 0.0657, "step": 1467 }, { "epoch": 0.7050912584053795, "grad_norm": 0.5262955218143904, "learning_rate": 9.455222907707795e-06, "loss": 0.0405, "step": 1468 }, { "epoch": 0.7055715658021133, "grad_norm": 0.4113212330571846, "learning_rate": 9.453953737865087e-06, "loss": 0.0331, "step": 1469 }, { "epoch": 0.7060518731988472, "grad_norm": 0.4687499190476505, "learning_rate": 9.452683176730452e-06, "loss": 0.0424, "step": 1470 }, { "epoch": 0.7065321805955812, "grad_norm": 0.5572677332289074, "learning_rate": 9.451411224700776e-06, "loss": 0.0612, "step": 1471 }, { "epoch": 0.7070124879923151, "grad_norm": 0.39073970748339487, "learning_rate": 9.450137882173385e-06, "loss": 0.0279, "step": 1472 }, { "epoch": 0.707492795389049, "grad_norm": 0.8716017403152045, "learning_rate": 9.448863149546032e-06, "loss": 0.0612, "step": 1473 }, { "epoch": 0.7079731027857828, "grad_norm": 0.8087490747385843, "learning_rate": 9.447587027216912e-06, "loss": 0.0754, "step": 1474 }, { "epoch": 0.7084534101825168, "grad_norm": 0.40033610599261016, "learning_rate": 9.446309515584648e-06, "loss": 0.0516, "step": 1475 }, { "epoch": 0.7089337175792507, "grad_norm": 0.3828791531186985, "learning_rate": 9.445030615048301e-06, "loss": 0.0308, "step": 1476 }, { "epoch": 0.7094140249759846, "grad_norm": 0.558762243566099, "learning_rate": 9.443750326007361e-06, "loss": 0.046, "step": 1477 }, { "epoch": 0.7098943323727186, "grad_norm": 0.46666384505267333, "learning_rate": 9.44246864886176e-06, "loss": 0.0463, "step": 1478 }, { "epoch": 0.7103746397694525, "grad_norm": 0.4943805321936899, "learning_rate": 9.441185584011854e-06, "loss": 0.0412, "step": 1479 }, { "epoch": 0.7108549471661864, "grad_norm": 0.4595133347133602, "learning_rate": 9.439901131858437e-06, "loss": 0.0457, "step": 1480 }, { "epoch": 0.7113352545629202, "grad_norm": 0.43379618763244315, "learning_rate": 9.43861529280274e-06, "loss": 0.0357, "step": 1481 }, { "epoch": 0.7118155619596542, "grad_norm": 0.58990377026948, "learning_rate": 9.43732806724642e-06, "loss": 0.0533, "step": 1482 }, { "epoch": 0.7122958693563881, "grad_norm": 0.31473504698760496, "learning_rate": 9.436039455591574e-06, "loss": 0.0309, "step": 1483 }, { "epoch": 0.712776176753122, "grad_norm": 0.45683837690648205, "learning_rate": 9.434749458240723e-06, "loss": 0.047, "step": 1484 }, { "epoch": 0.7132564841498559, "grad_norm": 0.3777015877291396, "learning_rate": 9.433458075596834e-06, "loss": 0.0434, "step": 1485 }, { "epoch": 0.7137367915465899, "grad_norm": 0.5110934466400615, "learning_rate": 9.432165308063293e-06, "loss": 0.0603, "step": 1486 }, { "epoch": 0.7142170989433237, "grad_norm": 0.4887572837652129, "learning_rate": 9.430871156043929e-06, "loss": 0.0508, "step": 1487 }, { "epoch": 0.7146974063400576, "grad_norm": 0.5164375394535271, "learning_rate": 9.429575619942996e-06, "loss": 0.0357, "step": 1488 }, { "epoch": 0.7151777137367915, "grad_norm": 0.4860839779256668, "learning_rate": 9.428278700165185e-06, "loss": 0.0496, "step": 1489 }, { "epoch": 0.7156580211335255, "grad_norm": 0.44939726645063366, "learning_rate": 9.426980397115619e-06, "loss": 0.0392, "step": 1490 }, { "epoch": 0.7161383285302594, "grad_norm": 0.4491948208101559, "learning_rate": 9.42568071119985e-06, "loss": 0.0418, "step": 1491 }, { "epoch": 0.7166186359269933, "grad_norm": 0.4413729317811157, "learning_rate": 9.424379642823864e-06, "loss": 0.0418, "step": 1492 }, { "epoch": 0.7170989433237271, "grad_norm": 0.6050908044633452, "learning_rate": 9.423077192394081e-06, "loss": 0.054, "step": 1493 }, { "epoch": 0.7175792507204611, "grad_norm": 0.34829153690830816, "learning_rate": 9.421773360317348e-06, "loss": 0.0455, "step": 1494 }, { "epoch": 0.718059558117195, "grad_norm": 0.6479436968237636, "learning_rate": 9.420468147000947e-06, "loss": 0.0591, "step": 1495 }, { "epoch": 0.7185398655139289, "grad_norm": 0.5935139475806216, "learning_rate": 9.419161552852592e-06, "loss": 0.0592, "step": 1496 }, { "epoch": 0.7190201729106628, "grad_norm": 0.35365850589490483, "learning_rate": 9.417853578280425e-06, "loss": 0.035, "step": 1497 }, { "epoch": 0.7195004803073968, "grad_norm": 0.5335048893582442, "learning_rate": 9.41654422369302e-06, "loss": 0.0343, "step": 1498 }, { "epoch": 0.7199807877041307, "grad_norm": 0.47006252573172974, "learning_rate": 9.415233489499388e-06, "loss": 0.0466, "step": 1499 }, { "epoch": 0.7204610951008645, "grad_norm": 0.4138903520224807, "learning_rate": 9.413921376108958e-06, "loss": 0.0337, "step": 1500 }, { "epoch": 0.7209414024975984, "grad_norm": 0.5302837596305445, "learning_rate": 9.412607883931608e-06, "loss": 0.0355, "step": 1501 }, { "epoch": 0.7214217098943324, "grad_norm": 0.46188113895356303, "learning_rate": 9.411293013377628e-06, "loss": 0.0464, "step": 1502 }, { "epoch": 0.7219020172910663, "grad_norm": 0.5048308160692421, "learning_rate": 9.409976764857752e-06, "loss": 0.0659, "step": 1503 }, { "epoch": 0.7223823246878002, "grad_norm": 0.42557944797976405, "learning_rate": 9.408659138783138e-06, "loss": 0.0476, "step": 1504 }, { "epoch": 0.722862632084534, "grad_norm": 0.43337734311918524, "learning_rate": 9.407340135565375e-06, "loss": 0.0473, "step": 1505 }, { "epoch": 0.723342939481268, "grad_norm": 0.43466454870999904, "learning_rate": 9.406019755616484e-06, "loss": 0.0456, "step": 1506 }, { "epoch": 0.7238232468780019, "grad_norm": 0.44882261533196544, "learning_rate": 9.404697999348917e-06, "loss": 0.0545, "step": 1507 }, { "epoch": 0.7243035542747358, "grad_norm": 0.4627444515845815, "learning_rate": 9.403374867175553e-06, "loss": 0.039, "step": 1508 }, { "epoch": 0.7247838616714697, "grad_norm": 0.43987793983099877, "learning_rate": 9.4020503595097e-06, "loss": 0.0315, "step": 1509 }, { "epoch": 0.7252641690682037, "grad_norm": 0.4866538331222698, "learning_rate": 9.400724476765099e-06, "loss": 0.0451, "step": 1510 }, { "epoch": 0.7257444764649376, "grad_norm": 0.35933139183080276, "learning_rate": 9.39939721935592e-06, "loss": 0.0341, "step": 1511 }, { "epoch": 0.7262247838616714, "grad_norm": 0.36877632658996023, "learning_rate": 9.398068587696758e-06, "loss": 0.0382, "step": 1512 }, { "epoch": 0.7267050912584054, "grad_norm": 0.9738122717404578, "learning_rate": 9.396738582202645e-06, "loss": 0.0742, "step": 1513 }, { "epoch": 0.7271853986551393, "grad_norm": 0.3887278676876787, "learning_rate": 9.395407203289036e-06, "loss": 0.0376, "step": 1514 }, { "epoch": 0.7276657060518732, "grad_norm": 0.6994372693236012, "learning_rate": 9.394074451371817e-06, "loss": 0.0394, "step": 1515 }, { "epoch": 0.7281460134486071, "grad_norm": 0.45247145506590497, "learning_rate": 9.392740326867304e-06, "loss": 0.0364, "step": 1516 }, { "epoch": 0.7286263208453411, "grad_norm": 0.5967559115342186, "learning_rate": 9.391404830192239e-06, "loss": 0.0333, "step": 1517 }, { "epoch": 0.729106628242075, "grad_norm": 0.6079539868350357, "learning_rate": 9.390067961763795e-06, "loss": 0.0472, "step": 1518 }, { "epoch": 0.7295869356388088, "grad_norm": 0.4845564720247174, "learning_rate": 9.388729721999573e-06, "loss": 0.0399, "step": 1519 }, { "epoch": 0.7300672430355427, "grad_norm": 0.41693125786890695, "learning_rate": 9.387390111317599e-06, "loss": 0.0413, "step": 1520 }, { "epoch": 0.7305475504322767, "grad_norm": 0.550874709305461, "learning_rate": 9.386049130136335e-06, "loss": 0.0527, "step": 1521 }, { "epoch": 0.7310278578290106, "grad_norm": 0.7548778248070213, "learning_rate": 9.384706778874664e-06, "loss": 0.0521, "step": 1522 }, { "epoch": 0.7315081652257445, "grad_norm": 0.5908031992669268, "learning_rate": 9.3833630579519e-06, "loss": 0.0407, "step": 1523 }, { "epoch": 0.7319884726224783, "grad_norm": 0.6721817113672193, "learning_rate": 9.382017967787783e-06, "loss": 0.0456, "step": 1524 }, { "epoch": 0.7324687800192123, "grad_norm": 0.412185975988658, "learning_rate": 9.380671508802483e-06, "loss": 0.0392, "step": 1525 }, { "epoch": 0.7329490874159462, "grad_norm": 0.46229352306795535, "learning_rate": 9.379323681416596e-06, "loss": 0.0357, "step": 1526 }, { "epoch": 0.7334293948126801, "grad_norm": 0.5289989908411948, "learning_rate": 9.377974486051149e-06, "loss": 0.0548, "step": 1527 }, { "epoch": 0.733909702209414, "grad_norm": 0.5146883009854686, "learning_rate": 9.376623923127588e-06, "loss": 0.0505, "step": 1528 }, { "epoch": 0.734390009606148, "grad_norm": 0.5889961825894928, "learning_rate": 9.375271993067794e-06, "loss": 0.0442, "step": 1529 }, { "epoch": 0.7348703170028819, "grad_norm": 0.492353320981019, "learning_rate": 9.373918696294074e-06, "loss": 0.045, "step": 1530 }, { "epoch": 0.7353506243996157, "grad_norm": 0.39252458109011074, "learning_rate": 9.372564033229159e-06, "loss": 0.0357, "step": 1531 }, { "epoch": 0.7358309317963496, "grad_norm": 0.6428195670677652, "learning_rate": 9.371208004296208e-06, "loss": 0.0644, "step": 1532 }, { "epoch": 0.7363112391930836, "grad_norm": 0.6817577459803879, "learning_rate": 9.36985060991881e-06, "loss": 0.0804, "step": 1533 }, { "epoch": 0.7367915465898175, "grad_norm": 0.7456675871856415, "learning_rate": 9.368491850520972e-06, "loss": 0.0476, "step": 1534 }, { "epoch": 0.7372718539865514, "grad_norm": 0.45828290446709025, "learning_rate": 9.367131726527137e-06, "loss": 0.0387, "step": 1535 }, { "epoch": 0.7377521613832853, "grad_norm": 0.5469514522999743, "learning_rate": 9.365770238362168e-06, "loss": 0.0401, "step": 1536 }, { "epoch": 0.7382324687800192, "grad_norm": 0.6196800460825531, "learning_rate": 9.364407386451358e-06, "loss": 0.0382, "step": 1537 }, { "epoch": 0.7387127761767531, "grad_norm": 0.8185923332401619, "learning_rate": 9.363043171220423e-06, "loss": 0.0439, "step": 1538 }, { "epoch": 0.739193083573487, "grad_norm": 0.5914692890542824, "learning_rate": 9.361677593095506e-06, "loss": 0.0616, "step": 1539 }, { "epoch": 0.7396733909702209, "grad_norm": 0.42464742265876826, "learning_rate": 9.360310652503176e-06, "loss": 0.0454, "step": 1540 }, { "epoch": 0.7401536983669549, "grad_norm": 0.3717412618511177, "learning_rate": 9.358942349870428e-06, "loss": 0.0335, "step": 1541 }, { "epoch": 0.7406340057636888, "grad_norm": 0.47502141185790275, "learning_rate": 9.35757268562468e-06, "loss": 0.0488, "step": 1542 }, { "epoch": 0.7411143131604226, "grad_norm": 0.3846823017408531, "learning_rate": 9.356201660193782e-06, "loss": 0.0345, "step": 1543 }, { "epoch": 0.7415946205571565, "grad_norm": 0.5650933228577771, "learning_rate": 9.354829274005999e-06, "loss": 0.0383, "step": 1544 }, { "epoch": 0.7420749279538905, "grad_norm": 0.5690246647765329, "learning_rate": 9.35345552749003e-06, "loss": 0.0569, "step": 1545 }, { "epoch": 0.7425552353506244, "grad_norm": 0.38731794835615024, "learning_rate": 9.352080421074993e-06, "loss": 0.0419, "step": 1546 }, { "epoch": 0.7430355427473583, "grad_norm": 0.40974758691143354, "learning_rate": 9.350703955190435e-06, "loss": 0.0376, "step": 1547 }, { "epoch": 0.7435158501440923, "grad_norm": 0.46283701258349547, "learning_rate": 9.349326130266323e-06, "loss": 0.0328, "step": 1548 }, { "epoch": 0.7439961575408262, "grad_norm": 0.40124018251646276, "learning_rate": 9.347946946733055e-06, "loss": 0.0387, "step": 1549 }, { "epoch": 0.74447646493756, "grad_norm": 0.4121164368713994, "learning_rate": 9.346566405021448e-06, "loss": 0.0387, "step": 1550 }, { "epoch": 0.7449567723342939, "grad_norm": 0.48335845270026145, "learning_rate": 9.345184505562747e-06, "loss": 0.0368, "step": 1551 }, { "epoch": 0.7454370797310279, "grad_norm": 0.45269768346922323, "learning_rate": 9.343801248788616e-06, "loss": 0.052, "step": 1552 }, { "epoch": 0.7459173871277618, "grad_norm": 0.4610035020071121, "learning_rate": 9.342416635131148e-06, "loss": 0.0614, "step": 1553 }, { "epoch": 0.7463976945244957, "grad_norm": 0.4443227466310018, "learning_rate": 9.341030665022856e-06, "loss": 0.0447, "step": 1554 }, { "epoch": 0.7468780019212296, "grad_norm": 0.7642793611261786, "learning_rate": 9.339643338896682e-06, "loss": 0.0635, "step": 1555 }, { "epoch": 0.7473583093179635, "grad_norm": 0.4923735731312003, "learning_rate": 9.338254657185988e-06, "loss": 0.0507, "step": 1556 }, { "epoch": 0.7478386167146974, "grad_norm": 0.630683703114263, "learning_rate": 9.336864620324555e-06, "loss": 0.0621, "step": 1557 }, { "epoch": 0.7483189241114313, "grad_norm": 0.5090396267143328, "learning_rate": 9.335473228746599e-06, "loss": 0.0406, "step": 1558 }, { "epoch": 0.7487992315081652, "grad_norm": 0.367559997986288, "learning_rate": 9.334080482886746e-06, "loss": 0.0384, "step": 1559 }, { "epoch": 0.7492795389048992, "grad_norm": 0.4446659468934462, "learning_rate": 9.332686383180055e-06, "loss": 0.0361, "step": 1560 }, { "epoch": 0.7497598463016331, "grad_norm": 0.38856445414113083, "learning_rate": 9.331290930062003e-06, "loss": 0.0259, "step": 1561 }, { "epoch": 0.7502401536983669, "grad_norm": 0.5997908771571422, "learning_rate": 9.32989412396849e-06, "loss": 0.0471, "step": 1562 }, { "epoch": 0.7507204610951008, "grad_norm": 0.3393607659537801, "learning_rate": 9.328495965335844e-06, "loss": 0.0394, "step": 1563 }, { "epoch": 0.7512007684918348, "grad_norm": 0.37942787002008155, "learning_rate": 9.327096454600804e-06, "loss": 0.0273, "step": 1564 }, { "epoch": 0.7516810758885687, "grad_norm": 0.5089526163454388, "learning_rate": 9.325695592200545e-06, "loss": 0.0632, "step": 1565 }, { "epoch": 0.7521613832853026, "grad_norm": 0.5121396909775255, "learning_rate": 9.324293378572654e-06, "loss": 0.0551, "step": 1566 }, { "epoch": 0.7526416906820365, "grad_norm": 0.38135878200286283, "learning_rate": 9.322889814155143e-06, "loss": 0.0345, "step": 1567 }, { "epoch": 0.7531219980787704, "grad_norm": 0.6071199641753309, "learning_rate": 9.32148489938645e-06, "loss": 0.0424, "step": 1568 }, { "epoch": 0.7536023054755043, "grad_norm": 0.6157559078057468, "learning_rate": 9.32007863470543e-06, "loss": 0.045, "step": 1569 }, { "epoch": 0.7540826128722382, "grad_norm": 0.48273199109262704, "learning_rate": 9.31867102055136e-06, "loss": 0.0406, "step": 1570 }, { "epoch": 0.7545629202689721, "grad_norm": 0.4593296987544262, "learning_rate": 9.31726205736394e-06, "loss": 0.0436, "step": 1571 }, { "epoch": 0.7550432276657061, "grad_norm": 0.4500045712808899, "learning_rate": 9.315851745583294e-06, "loss": 0.0269, "step": 1572 }, { "epoch": 0.75552353506244, "grad_norm": 0.4454538603150306, "learning_rate": 9.31444008564996e-06, "loss": 0.0366, "step": 1573 }, { "epoch": 0.7560038424591738, "grad_norm": 0.8394007155144771, "learning_rate": 9.313027078004903e-06, "loss": 0.0503, "step": 1574 }, { "epoch": 0.7564841498559077, "grad_norm": 0.5712174753843666, "learning_rate": 9.311612723089511e-06, "loss": 0.0547, "step": 1575 }, { "epoch": 0.7569644572526417, "grad_norm": 0.46645965469348327, "learning_rate": 9.310197021345586e-06, "loss": 0.0374, "step": 1576 }, { "epoch": 0.7574447646493756, "grad_norm": 0.4740884325639232, "learning_rate": 9.308779973215355e-06, "loss": 0.0548, "step": 1577 }, { "epoch": 0.7579250720461095, "grad_norm": 0.5888223818640053, "learning_rate": 9.307361579141461e-06, "loss": 0.0581, "step": 1578 }, { "epoch": 0.7584053794428435, "grad_norm": 0.5714740123256156, "learning_rate": 9.305941839566978e-06, "loss": 0.0479, "step": 1579 }, { "epoch": 0.7588856868395774, "grad_norm": 0.6224308689788665, "learning_rate": 9.304520754935387e-06, "loss": 0.0418, "step": 1580 }, { "epoch": 0.7593659942363112, "grad_norm": 0.5273832414104769, "learning_rate": 9.303098325690601e-06, "loss": 0.0534, "step": 1581 }, { "epoch": 0.7598463016330451, "grad_norm": 0.4100845821662855, "learning_rate": 9.301674552276942e-06, "loss": 0.0393, "step": 1582 }, { "epoch": 0.7603266090297791, "grad_norm": 0.5233760618757851, "learning_rate": 9.300249435139162e-06, "loss": 0.0567, "step": 1583 }, { "epoch": 0.760806916426513, "grad_norm": 0.4470067504003617, "learning_rate": 9.298822974722425e-06, "loss": 0.039, "step": 1584 }, { "epoch": 0.7612872238232469, "grad_norm": 0.6788279010789742, "learning_rate": 9.297395171472321e-06, "loss": 0.045, "step": 1585 }, { "epoch": 0.7617675312199808, "grad_norm": 0.5433301733942123, "learning_rate": 9.295966025834853e-06, "loss": 0.0564, "step": 1586 }, { "epoch": 0.7622478386167147, "grad_norm": 0.49450333266253477, "learning_rate": 9.294535538256447e-06, "loss": 0.0531, "step": 1587 }, { "epoch": 0.7627281460134486, "grad_norm": 0.5471927312457198, "learning_rate": 9.29310370918395e-06, "loss": 0.0669, "step": 1588 }, { "epoch": 0.7632084534101825, "grad_norm": 0.4715635212373278, "learning_rate": 9.291670539064623e-06, "loss": 0.0405, "step": 1589 }, { "epoch": 0.7636887608069164, "grad_norm": 0.4111372690921772, "learning_rate": 9.290236028346152e-06, "loss": 0.0379, "step": 1590 }, { "epoch": 0.7641690682036504, "grad_norm": 0.7088243704037053, "learning_rate": 9.288800177476636e-06, "loss": 0.049, "step": 1591 }, { "epoch": 0.7646493756003843, "grad_norm": 0.38910744032185673, "learning_rate": 9.287362986904595e-06, "loss": 0.0377, "step": 1592 }, { "epoch": 0.7651296829971181, "grad_norm": 0.47900015103136745, "learning_rate": 9.28592445707897e-06, "loss": 0.0406, "step": 1593 }, { "epoch": 0.765609990393852, "grad_norm": 1.0851740572976578, "learning_rate": 9.284484588449115e-06, "loss": 0.0444, "step": 1594 }, { "epoch": 0.766090297790586, "grad_norm": 0.5147230458014541, "learning_rate": 9.283043381464806e-06, "loss": 0.0387, "step": 1595 }, { "epoch": 0.7665706051873199, "grad_norm": 0.46123024101584026, "learning_rate": 9.281600836576237e-06, "loss": 0.0451, "step": 1596 }, { "epoch": 0.7670509125840538, "grad_norm": 0.4441738388860228, "learning_rate": 9.280156954234017e-06, "loss": 0.0388, "step": 1597 }, { "epoch": 0.7675312199807877, "grad_norm": 0.5563354966449449, "learning_rate": 9.278711734889178e-06, "loss": 0.0407, "step": 1598 }, { "epoch": 0.7680115273775217, "grad_norm": 0.6141961922794104, "learning_rate": 9.277265178993164e-06, "loss": 0.0398, "step": 1599 }, { "epoch": 0.7684918347742555, "grad_norm": 0.5225559365038454, "learning_rate": 9.275817286997843e-06, "loss": 0.0371, "step": 1600 }, { "epoch": 0.7689721421709894, "grad_norm": 0.5371833677807129, "learning_rate": 9.274368059355491e-06, "loss": 0.0369, "step": 1601 }, { "epoch": 0.7694524495677233, "grad_norm": 0.4509358847362159, "learning_rate": 9.27291749651881e-06, "loss": 0.0348, "step": 1602 }, { "epoch": 0.7699327569644573, "grad_norm": 0.3790301116583118, "learning_rate": 9.271465598940914e-06, "loss": 0.0341, "step": 1603 }, { "epoch": 0.7704130643611912, "grad_norm": 0.5646089029750233, "learning_rate": 9.270012367075337e-06, "loss": 0.0458, "step": 1604 }, { "epoch": 0.770893371757925, "grad_norm": 0.6621291150481707, "learning_rate": 9.268557801376027e-06, "loss": 0.0673, "step": 1605 }, { "epoch": 0.7713736791546589, "grad_norm": 0.4824676654847145, "learning_rate": 9.267101902297354e-06, "loss": 0.0605, "step": 1606 }, { "epoch": 0.7718539865513929, "grad_norm": 0.47867737571924573, "learning_rate": 9.265644670294094e-06, "loss": 0.0402, "step": 1607 }, { "epoch": 0.7723342939481268, "grad_norm": 0.7521404239285508, "learning_rate": 9.26418610582145e-06, "loss": 0.0437, "step": 1608 }, { "epoch": 0.7728146013448607, "grad_norm": 0.4949872678032156, "learning_rate": 9.262726209335038e-06, "loss": 0.0471, "step": 1609 }, { "epoch": 0.7732949087415946, "grad_norm": 0.8146436171510608, "learning_rate": 9.261264981290887e-06, "loss": 0.0499, "step": 1610 }, { "epoch": 0.7737752161383286, "grad_norm": 0.5261485997463431, "learning_rate": 9.259802422145445e-06, "loss": 0.0499, "step": 1611 }, { "epoch": 0.7742555235350624, "grad_norm": 0.37722938922014426, "learning_rate": 9.258338532355575e-06, "loss": 0.0406, "step": 1612 }, { "epoch": 0.7747358309317963, "grad_norm": 0.5137949267423024, "learning_rate": 9.256873312378559e-06, "loss": 0.0371, "step": 1613 }, { "epoch": 0.7752161383285303, "grad_norm": 0.46384787647828507, "learning_rate": 9.255406762672085e-06, "loss": 0.0436, "step": 1614 }, { "epoch": 0.7756964457252642, "grad_norm": 0.42501919449374165, "learning_rate": 9.253938883694266e-06, "loss": 0.0319, "step": 1615 }, { "epoch": 0.7761767531219981, "grad_norm": 0.48825545396901937, "learning_rate": 9.252469675903627e-06, "loss": 0.0438, "step": 1616 }, { "epoch": 0.776657060518732, "grad_norm": 0.45045012320002853, "learning_rate": 9.250999139759107e-06, "loss": 0.0507, "step": 1617 }, { "epoch": 0.777137367915466, "grad_norm": 0.47986993527995575, "learning_rate": 9.249527275720062e-06, "loss": 0.0438, "step": 1618 }, { "epoch": 0.7776176753121998, "grad_norm": 0.6897438756676713, "learning_rate": 9.248054084246259e-06, "loss": 0.0711, "step": 1619 }, { "epoch": 0.7780979827089337, "grad_norm": 0.33151049726562337, "learning_rate": 9.246579565797886e-06, "loss": 0.0357, "step": 1620 }, { "epoch": 0.7785782901056676, "grad_norm": 0.43010301611246965, "learning_rate": 9.245103720835538e-06, "loss": 0.0451, "step": 1621 }, { "epoch": 0.7790585975024016, "grad_norm": 0.40143671990795116, "learning_rate": 9.24362654982023e-06, "loss": 0.0389, "step": 1622 }, { "epoch": 0.7795389048991355, "grad_norm": 0.4379641653718776, "learning_rate": 9.24214805321339e-06, "loss": 0.0422, "step": 1623 }, { "epoch": 0.7800192122958693, "grad_norm": 0.7421035915776883, "learning_rate": 9.24066823147686e-06, "loss": 0.0477, "step": 1624 }, { "epoch": 0.7804995196926032, "grad_norm": 0.6044451650392826, "learning_rate": 9.239187085072891e-06, "loss": 0.0482, "step": 1625 }, { "epoch": 0.7809798270893372, "grad_norm": 0.5596451678457982, "learning_rate": 9.237704614464157e-06, "loss": 0.0418, "step": 1626 }, { "epoch": 0.7814601344860711, "grad_norm": 0.6592034808792037, "learning_rate": 9.236220820113738e-06, "loss": 0.0546, "step": 1627 }, { "epoch": 0.781940441882805, "grad_norm": 0.5465739249129324, "learning_rate": 9.234735702485132e-06, "loss": 0.0442, "step": 1628 }, { "epoch": 0.7824207492795389, "grad_norm": 0.4911854758209489, "learning_rate": 9.233249262042247e-06, "loss": 0.0398, "step": 1629 }, { "epoch": 0.7829010566762729, "grad_norm": 0.5748292545758572, "learning_rate": 9.231761499249407e-06, "loss": 0.0508, "step": 1630 }, { "epoch": 0.7833813640730067, "grad_norm": 0.6346041474004851, "learning_rate": 9.230272414571349e-06, "loss": 0.0439, "step": 1631 }, { "epoch": 0.7838616714697406, "grad_norm": 0.5634165757189631, "learning_rate": 9.22878200847322e-06, "loss": 0.0539, "step": 1632 }, { "epoch": 0.7843419788664745, "grad_norm": 0.5027607179661531, "learning_rate": 9.227290281420583e-06, "loss": 0.045, "step": 1633 }, { "epoch": 0.7848222862632085, "grad_norm": 0.3037912293882595, "learning_rate": 9.22579723387941e-06, "loss": 0.0369, "step": 1634 }, { "epoch": 0.7853025936599424, "grad_norm": 0.8410699245876824, "learning_rate": 9.22430286631609e-06, "loss": 0.0356, "step": 1635 }, { "epoch": 0.7857829010566763, "grad_norm": 0.5368787438929216, "learning_rate": 9.222807179197421e-06, "loss": 0.043, "step": 1636 }, { "epoch": 0.7862632084534101, "grad_norm": 0.4032303101187688, "learning_rate": 9.221310172990616e-06, "loss": 0.0368, "step": 1637 }, { "epoch": 0.7867435158501441, "grad_norm": 0.5409648779344621, "learning_rate": 9.219811848163295e-06, "loss": 0.0629, "step": 1638 }, { "epoch": 0.787223823246878, "grad_norm": 0.6171811747268313, "learning_rate": 9.218312205183497e-06, "loss": 0.0531, "step": 1639 }, { "epoch": 0.7877041306436119, "grad_norm": 0.4216070952232712, "learning_rate": 9.216811244519667e-06, "loss": 0.0449, "step": 1640 }, { "epoch": 0.7881844380403458, "grad_norm": 0.5763421855553458, "learning_rate": 9.215308966640662e-06, "loss": 0.0368, "step": 1641 }, { "epoch": 0.7886647454370798, "grad_norm": 0.4870149354831771, "learning_rate": 9.213805372015756e-06, "loss": 0.0451, "step": 1642 }, { "epoch": 0.7891450528338136, "grad_norm": 0.393109877651519, "learning_rate": 9.212300461114626e-06, "loss": 0.0335, "step": 1643 }, { "epoch": 0.7896253602305475, "grad_norm": 0.5879082191240319, "learning_rate": 9.210794234407368e-06, "loss": 0.0451, "step": 1644 }, { "epoch": 0.7901056676272814, "grad_norm": 0.5295234190274382, "learning_rate": 9.209286692364484e-06, "loss": 0.061, "step": 1645 }, { "epoch": 0.7905859750240154, "grad_norm": 0.9676479560008302, "learning_rate": 9.207777835456887e-06, "loss": 0.0593, "step": 1646 }, { "epoch": 0.7910662824207493, "grad_norm": 0.41367077682405545, "learning_rate": 9.206267664155906e-06, "loss": 0.0286, "step": 1647 }, { "epoch": 0.7915465898174832, "grad_norm": 0.3879540531683163, "learning_rate": 9.204756178933274e-06, "loss": 0.0377, "step": 1648 }, { "epoch": 0.7920268972142172, "grad_norm": 0.4431616617976934, "learning_rate": 9.203243380261138e-06, "loss": 0.0558, "step": 1649 }, { "epoch": 0.792507204610951, "grad_norm": 0.3784767168841202, "learning_rate": 9.201729268612054e-06, "loss": 0.0539, "step": 1650 }, { "epoch": 0.7929875120076849, "grad_norm": 0.45445078560396285, "learning_rate": 9.20021384445899e-06, "loss": 0.0406, "step": 1651 }, { "epoch": 0.7934678194044188, "grad_norm": 0.40977728429479643, "learning_rate": 9.198697108275318e-06, "loss": 0.0337, "step": 1652 }, { "epoch": 0.7939481268011528, "grad_norm": 0.36463474145779456, "learning_rate": 9.19717906053483e-06, "loss": 0.0377, "step": 1653 }, { "epoch": 0.7944284341978867, "grad_norm": 0.36067865022176565, "learning_rate": 9.19565970171172e-06, "loss": 0.032, "step": 1654 }, { "epoch": 0.7949087415946205, "grad_norm": 0.5087941975761743, "learning_rate": 9.194139032280594e-06, "loss": 0.0363, "step": 1655 }, { "epoch": 0.7953890489913544, "grad_norm": 0.48198473655680424, "learning_rate": 9.192617052716463e-06, "loss": 0.0458, "step": 1656 }, { "epoch": 0.7958693563880884, "grad_norm": 0.5456323999363246, "learning_rate": 9.19109376349476e-06, "loss": 0.0546, "step": 1657 }, { "epoch": 0.7963496637848223, "grad_norm": 0.4698008133059572, "learning_rate": 9.18956916509131e-06, "loss": 0.041, "step": 1658 }, { "epoch": 0.7968299711815562, "grad_norm": 0.48858912139881877, "learning_rate": 9.18804325798236e-06, "loss": 0.0482, "step": 1659 }, { "epoch": 0.7973102785782901, "grad_norm": 0.45224694401197335, "learning_rate": 9.18651604264456e-06, "loss": 0.0432, "step": 1660 }, { "epoch": 0.7977905859750241, "grad_norm": 0.5291111116543427, "learning_rate": 9.184987519554969e-06, "loss": 0.0392, "step": 1661 }, { "epoch": 0.7982708933717579, "grad_norm": 0.39953149277001193, "learning_rate": 9.183457689191055e-06, "loss": 0.0392, "step": 1662 }, { "epoch": 0.7987512007684918, "grad_norm": 0.49701247579632074, "learning_rate": 9.181926552030698e-06, "loss": 0.0374, "step": 1663 }, { "epoch": 0.7992315081652257, "grad_norm": 0.38365739480522254, "learning_rate": 9.18039410855218e-06, "loss": 0.043, "step": 1664 }, { "epoch": 0.7997118155619597, "grad_norm": 0.37429467352808293, "learning_rate": 9.178860359234193e-06, "loss": 0.0407, "step": 1665 }, { "epoch": 0.8001921229586936, "grad_norm": 0.4091686599510648, "learning_rate": 9.17732530455584e-06, "loss": 0.038, "step": 1666 }, { "epoch": 0.8006724303554275, "grad_norm": 0.7195325093951213, "learning_rate": 9.175788944996629e-06, "loss": 0.0741, "step": 1667 }, { "epoch": 0.8011527377521613, "grad_norm": 0.3820712024818586, "learning_rate": 9.174251281036478e-06, "loss": 0.0516, "step": 1668 }, { "epoch": 0.8016330451488953, "grad_norm": 0.6407457249834841, "learning_rate": 9.172712313155708e-06, "loss": 0.0434, "step": 1669 }, { "epoch": 0.8021133525456292, "grad_norm": 0.4969915345518172, "learning_rate": 9.171172041835048e-06, "loss": 0.0385, "step": 1670 }, { "epoch": 0.8025936599423631, "grad_norm": 0.8727857658628531, "learning_rate": 9.169630467555643e-06, "loss": 0.0548, "step": 1671 }, { "epoch": 0.803073967339097, "grad_norm": 0.5478163807092746, "learning_rate": 9.168087590799034e-06, "loss": 0.0417, "step": 1672 }, { "epoch": 0.803554274735831, "grad_norm": 0.5827507091175965, "learning_rate": 9.16654341204717e-06, "loss": 0.0568, "step": 1673 }, { "epoch": 0.8040345821325648, "grad_norm": 0.5177032542659947, "learning_rate": 9.164997931782415e-06, "loss": 0.0492, "step": 1674 }, { "epoch": 0.8045148895292987, "grad_norm": 0.4309298356028421, "learning_rate": 9.163451150487531e-06, "loss": 0.0454, "step": 1675 }, { "epoch": 0.8049951969260326, "grad_norm": 0.4057663676388221, "learning_rate": 9.161903068645692e-06, "loss": 0.0422, "step": 1676 }, { "epoch": 0.8054755043227666, "grad_norm": 0.47646062877439854, "learning_rate": 9.160353686740476e-06, "loss": 0.0351, "step": 1677 }, { "epoch": 0.8059558117195005, "grad_norm": 0.45558821865778054, "learning_rate": 9.158803005255862e-06, "loss": 0.0416, "step": 1678 }, { "epoch": 0.8064361191162344, "grad_norm": 0.536210255513318, "learning_rate": 9.157251024676249e-06, "loss": 0.0499, "step": 1679 }, { "epoch": 0.8069164265129684, "grad_norm": 0.6111498216460901, "learning_rate": 9.155697745486424e-06, "loss": 0.0342, "step": 1680 }, { "epoch": 0.8073967339097022, "grad_norm": 0.4833245316704844, "learning_rate": 9.154143168171594e-06, "loss": 0.0447, "step": 1681 }, { "epoch": 0.8078770413064361, "grad_norm": 0.7125662823152267, "learning_rate": 9.152587293217362e-06, "loss": 0.0475, "step": 1682 }, { "epoch": 0.80835734870317, "grad_norm": 0.42403591379119365, "learning_rate": 9.151030121109745e-06, "loss": 0.0391, "step": 1683 }, { "epoch": 0.808837656099904, "grad_norm": 0.5205159350136663, "learning_rate": 9.149471652335155e-06, "loss": 0.0512, "step": 1684 }, { "epoch": 0.8093179634966379, "grad_norm": 0.4957469692515393, "learning_rate": 9.14791188738042e-06, "loss": 0.0406, "step": 1685 }, { "epoch": 0.8097982708933718, "grad_norm": 0.7665309194587624, "learning_rate": 9.146350826732762e-06, "loss": 0.0549, "step": 1686 }, { "epoch": 0.8102785782901056, "grad_norm": 0.43844120238867723, "learning_rate": 9.144788470879818e-06, "loss": 0.0429, "step": 1687 }, { "epoch": 0.8107588856868396, "grad_norm": 0.5211050949999975, "learning_rate": 9.143224820309622e-06, "loss": 0.0479, "step": 1688 }, { "epoch": 0.8112391930835735, "grad_norm": 0.45269831596656984, "learning_rate": 9.141659875510615e-06, "loss": 0.0527, "step": 1689 }, { "epoch": 0.8117195004803074, "grad_norm": 0.4724243903083948, "learning_rate": 9.140093636971646e-06, "loss": 0.0353, "step": 1690 }, { "epoch": 0.8121998078770413, "grad_norm": 0.6109510239550704, "learning_rate": 9.13852610518196e-06, "loss": 0.0466, "step": 1691 }, { "epoch": 0.8126801152737753, "grad_norm": 0.41144558793142044, "learning_rate": 9.136957280631212e-06, "loss": 0.0363, "step": 1692 }, { "epoch": 0.8131604226705091, "grad_norm": 0.4886106331868101, "learning_rate": 9.135387163809462e-06, "loss": 0.0442, "step": 1693 }, { "epoch": 0.813640730067243, "grad_norm": 0.486941943939095, "learning_rate": 9.133815755207168e-06, "loss": 0.0533, "step": 1694 }, { "epoch": 0.8141210374639769, "grad_norm": 0.34872099800574413, "learning_rate": 9.132243055315193e-06, "loss": 0.0314, "step": 1695 }, { "epoch": 0.8146013448607109, "grad_norm": 0.44324684859322067, "learning_rate": 9.130669064624811e-06, "loss": 0.0532, "step": 1696 }, { "epoch": 0.8150816522574448, "grad_norm": 0.44545944868455956, "learning_rate": 9.129093783627687e-06, "loss": 0.037, "step": 1697 }, { "epoch": 0.8155619596541787, "grad_norm": 0.45798246599733317, "learning_rate": 9.1275172128159e-06, "loss": 0.0481, "step": 1698 }, { "epoch": 0.8160422670509125, "grad_norm": 0.44349109742245846, "learning_rate": 9.125939352681922e-06, "loss": 0.0517, "step": 1699 }, { "epoch": 0.8165225744476465, "grad_norm": 0.6001166233671226, "learning_rate": 9.124360203718638e-06, "loss": 0.0461, "step": 1700 }, { "epoch": 0.8170028818443804, "grad_norm": 0.6831035265294242, "learning_rate": 9.122779766419329e-06, "loss": 0.0591, "step": 1701 }, { "epoch": 0.8174831892411143, "grad_norm": 0.38077445809771976, "learning_rate": 9.121198041277677e-06, "loss": 0.0329, "step": 1702 }, { "epoch": 0.8179634966378482, "grad_norm": 0.41140076728447145, "learning_rate": 9.119615028787771e-06, "loss": 0.0485, "step": 1703 }, { "epoch": 0.8184438040345822, "grad_norm": 0.5189348552493814, "learning_rate": 9.118030729444103e-06, "loss": 0.0448, "step": 1704 }, { "epoch": 0.818924111431316, "grad_norm": 0.465577166008072, "learning_rate": 9.11644514374156e-06, "loss": 0.0514, "step": 1705 }, { "epoch": 0.8194044188280499, "grad_norm": 0.5621851173772174, "learning_rate": 9.114858272175438e-06, "loss": 0.0392, "step": 1706 }, { "epoch": 0.8198847262247838, "grad_norm": 0.6859692616790941, "learning_rate": 9.113270115241429e-06, "loss": 0.0705, "step": 1707 }, { "epoch": 0.8203650336215178, "grad_norm": 0.47502833358038377, "learning_rate": 9.111680673435632e-06, "loss": 0.0441, "step": 1708 }, { "epoch": 0.8208453410182517, "grad_norm": 0.46706802163027805, "learning_rate": 9.110089947254544e-06, "loss": 0.0391, "step": 1709 }, { "epoch": 0.8213256484149856, "grad_norm": 0.31719462369803303, "learning_rate": 9.108497937195064e-06, "loss": 0.0289, "step": 1710 }, { "epoch": 0.8218059558117194, "grad_norm": 0.40385158643378627, "learning_rate": 9.106904643754491e-06, "loss": 0.0381, "step": 1711 }, { "epoch": 0.8222862632084534, "grad_norm": 0.5545734057584185, "learning_rate": 9.105310067430526e-06, "loss": 0.0471, "step": 1712 }, { "epoch": 0.8227665706051873, "grad_norm": 0.4461054981605916, "learning_rate": 9.10371420872127e-06, "loss": 0.0508, "step": 1713 }, { "epoch": 0.8232468780019212, "grad_norm": 0.47654544400356025, "learning_rate": 9.102117068125227e-06, "loss": 0.0475, "step": 1714 }, { "epoch": 0.8237271853986552, "grad_norm": 0.39911477246208293, "learning_rate": 9.100518646141299e-06, "loss": 0.0404, "step": 1715 }, { "epoch": 0.8242074927953891, "grad_norm": 0.7172841710639825, "learning_rate": 9.098918943268786e-06, "loss": 0.046, "step": 1716 }, { "epoch": 0.824687800192123, "grad_norm": 0.43088188833750957, "learning_rate": 9.097317960007395e-06, "loss": 0.0403, "step": 1717 }, { "epoch": 0.8251681075888568, "grad_norm": 0.5012218482080577, "learning_rate": 9.095715696857225e-06, "loss": 0.045, "step": 1718 }, { "epoch": 0.8256484149855908, "grad_norm": 0.46478416389541866, "learning_rate": 9.094112154318784e-06, "loss": 0.033, "step": 1719 }, { "epoch": 0.8261287223823247, "grad_norm": 0.4130084158371063, "learning_rate": 9.092507332892968e-06, "loss": 0.0348, "step": 1720 }, { "epoch": 0.8266090297790586, "grad_norm": 0.6684505123844774, "learning_rate": 9.090901233081082e-06, "loss": 0.0629, "step": 1721 }, { "epoch": 0.8270893371757925, "grad_norm": 0.4323599984449867, "learning_rate": 9.089293855384828e-06, "loss": 0.0437, "step": 1722 }, { "epoch": 0.8275696445725265, "grad_norm": 0.32774506981480916, "learning_rate": 9.087685200306306e-06, "loss": 0.0334, "step": 1723 }, { "epoch": 0.8280499519692603, "grad_norm": 0.5191345063331478, "learning_rate": 9.086075268348014e-06, "loss": 0.0433, "step": 1724 }, { "epoch": 0.8285302593659942, "grad_norm": 0.33509417066826364, "learning_rate": 9.084464060012849e-06, "loss": 0.0402, "step": 1725 }, { "epoch": 0.8290105667627281, "grad_norm": 0.3142718339340029, "learning_rate": 9.082851575804112e-06, "loss": 0.0331, "step": 1726 }, { "epoch": 0.8294908741594621, "grad_norm": 0.4748408271731178, "learning_rate": 9.081237816225497e-06, "loss": 0.0516, "step": 1727 }, { "epoch": 0.829971181556196, "grad_norm": 0.37444079243147355, "learning_rate": 9.079622781781094e-06, "loss": 0.037, "step": 1728 }, { "epoch": 0.8304514889529299, "grad_norm": 0.5592237440039568, "learning_rate": 9.0780064729754e-06, "loss": 0.0452, "step": 1729 }, { "epoch": 0.8309317963496637, "grad_norm": 0.5481405856733301, "learning_rate": 9.076388890313304e-06, "loss": 0.0446, "step": 1730 }, { "epoch": 0.8314121037463977, "grad_norm": 0.3932969307248321, "learning_rate": 9.07477003430009e-06, "loss": 0.0488, "step": 1731 }, { "epoch": 0.8318924111431316, "grad_norm": 0.3702785209776025, "learning_rate": 9.073149905441451e-06, "loss": 0.0334, "step": 1732 }, { "epoch": 0.8323727185398655, "grad_norm": 1.4773631914590435, "learning_rate": 9.071528504243465e-06, "loss": 0.0347, "step": 1733 }, { "epoch": 0.8328530259365994, "grad_norm": 0.4936274544739719, "learning_rate": 9.069905831212616e-06, "loss": 0.081, "step": 1734 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5882359938166744, "learning_rate": 9.068281886855778e-06, "loss": 0.0437, "step": 1735 }, { "epoch": 0.8338136407300673, "grad_norm": 0.46989224834772747, "learning_rate": 9.066656671680231e-06, "loss": 0.0439, "step": 1736 }, { "epoch": 0.8342939481268011, "grad_norm": 0.4534004273658601, "learning_rate": 9.065030186193643e-06, "loss": 0.033, "step": 1737 }, { "epoch": 0.834774255523535, "grad_norm": 1.2265014670233068, "learning_rate": 9.063402430904087e-06, "loss": 0.0525, "step": 1738 }, { "epoch": 0.835254562920269, "grad_norm": 0.5422498828003771, "learning_rate": 9.061773406320027e-06, "loss": 0.0521, "step": 1739 }, { "epoch": 0.8357348703170029, "grad_norm": 0.4195808712209068, "learning_rate": 9.060143112950326e-06, "loss": 0.0316, "step": 1740 }, { "epoch": 0.8362151777137368, "grad_norm": 0.4855308450574598, "learning_rate": 9.058511551304241e-06, "loss": 0.0503, "step": 1741 }, { "epoch": 0.8366954851104706, "grad_norm": 0.5075843680608557, "learning_rate": 9.056878721891427e-06, "loss": 0.0425, "step": 1742 }, { "epoch": 0.8371757925072046, "grad_norm": 0.5482382504823542, "learning_rate": 9.055244625221935e-06, "loss": 0.0428, "step": 1743 }, { "epoch": 0.8376560999039385, "grad_norm": 0.6695595892233471, "learning_rate": 9.053609261806214e-06, "loss": 0.0587, "step": 1744 }, { "epoch": 0.8381364073006724, "grad_norm": 0.4581918394935273, "learning_rate": 9.0519726321551e-06, "loss": 0.0361, "step": 1745 }, { "epoch": 0.8386167146974063, "grad_norm": 0.3952408558855935, "learning_rate": 9.050334736779837e-06, "loss": 0.0381, "step": 1746 }, { "epoch": 0.8390970220941403, "grad_norm": 0.4109873080252859, "learning_rate": 9.048695576192058e-06, "loss": 0.0442, "step": 1747 }, { "epoch": 0.8395773294908742, "grad_norm": 0.5616982246438386, "learning_rate": 9.047055150903787e-06, "loss": 0.0584, "step": 1748 }, { "epoch": 0.840057636887608, "grad_norm": 0.37826080721339767, "learning_rate": 9.045413461427453e-06, "loss": 0.0409, "step": 1749 }, { "epoch": 0.840537944284342, "grad_norm": 0.40088324296361066, "learning_rate": 9.043770508275868e-06, "loss": 0.0487, "step": 1750 }, { "epoch": 0.8410182516810759, "grad_norm": 0.44718093929332914, "learning_rate": 9.04212629196225e-06, "loss": 0.0352, "step": 1751 }, { "epoch": 0.8414985590778098, "grad_norm": 0.4706059052244275, "learning_rate": 9.040480813000205e-06, "loss": 0.0358, "step": 1752 }, { "epoch": 0.8419788664745437, "grad_norm": 0.3670084341616506, "learning_rate": 9.038834071903736e-06, "loss": 0.0286, "step": 1753 }, { "epoch": 0.8424591738712777, "grad_norm": 0.5139890566947024, "learning_rate": 9.037186069187239e-06, "loss": 0.0471, "step": 1754 }, { "epoch": 0.8429394812680115, "grad_norm": 0.6580821574923913, "learning_rate": 9.035536805365503e-06, "loss": 0.0404, "step": 1755 }, { "epoch": 0.8434197886647454, "grad_norm": 0.3488569658059193, "learning_rate": 9.033886280953714e-06, "loss": 0.0332, "step": 1756 }, { "epoch": 0.8439000960614793, "grad_norm": 0.459104475456851, "learning_rate": 9.032234496467448e-06, "loss": 0.0457, "step": 1757 }, { "epoch": 0.8443804034582133, "grad_norm": 0.5555684583259805, "learning_rate": 9.03058145242268e-06, "loss": 0.0471, "step": 1758 }, { "epoch": 0.8448607108549472, "grad_norm": 0.4615657333941676, "learning_rate": 9.028927149335773e-06, "loss": 0.0412, "step": 1759 }, { "epoch": 0.8453410182516811, "grad_norm": 0.45856078963996716, "learning_rate": 9.027271587723487e-06, "loss": 0.0374, "step": 1760 }, { "epoch": 0.845821325648415, "grad_norm": 0.6912129790142436, "learning_rate": 9.025614768102972e-06, "loss": 0.0693, "step": 1761 }, { "epoch": 0.8463016330451489, "grad_norm": 0.5122989554497435, "learning_rate": 9.023956690991775e-06, "loss": 0.0465, "step": 1762 }, { "epoch": 0.8467819404418828, "grad_norm": 0.36126845802883945, "learning_rate": 9.02229735690783e-06, "loss": 0.0296, "step": 1763 }, { "epoch": 0.8472622478386167, "grad_norm": 0.6087551399045577, "learning_rate": 9.020636766369471e-06, "loss": 0.0506, "step": 1764 }, { "epoch": 0.8477425552353506, "grad_norm": 0.38525060421145424, "learning_rate": 9.018974919895418e-06, "loss": 0.0355, "step": 1765 }, { "epoch": 0.8482228626320846, "grad_norm": 0.346018375745237, "learning_rate": 9.017311818004785e-06, "loss": 0.0341, "step": 1766 }, { "epoch": 0.8487031700288185, "grad_norm": 0.6260615640367205, "learning_rate": 9.01564746121708e-06, "loss": 0.0443, "step": 1767 }, { "epoch": 0.8491834774255523, "grad_norm": 0.5187278606147561, "learning_rate": 9.013981850052205e-06, "loss": 0.0433, "step": 1768 }, { "epoch": 0.8496637848222862, "grad_norm": 0.5503187612040098, "learning_rate": 9.012314985030445e-06, "loss": 0.0358, "step": 1769 }, { "epoch": 0.8501440922190202, "grad_norm": 0.6316117180141888, "learning_rate": 9.010646866672488e-06, "loss": 0.0384, "step": 1770 }, { "epoch": 0.8506243996157541, "grad_norm": 0.5212663878288761, "learning_rate": 9.008977495499405e-06, "loss": 0.0361, "step": 1771 }, { "epoch": 0.851104707012488, "grad_norm": 0.5547948672955962, "learning_rate": 9.007306872032663e-06, "loss": 0.0446, "step": 1772 }, { "epoch": 0.8515850144092219, "grad_norm": 0.42396510808899085, "learning_rate": 9.005634996794117e-06, "loss": 0.032, "step": 1773 }, { "epoch": 0.8520653218059558, "grad_norm": 0.7725093468772292, "learning_rate": 9.003961870306015e-06, "loss": 0.0549, "step": 1774 }, { "epoch": 0.8525456292026897, "grad_norm": 0.4308369316438503, "learning_rate": 9.002287493090996e-06, "loss": 0.0499, "step": 1775 }, { "epoch": 0.8530259365994236, "grad_norm": 0.2780784001577288, "learning_rate": 9.000611865672088e-06, "loss": 0.0287, "step": 1776 }, { "epoch": 0.8535062439961575, "grad_norm": 0.5180718543954558, "learning_rate": 8.998934988572713e-06, "loss": 0.0329, "step": 1777 }, { "epoch": 0.8539865513928915, "grad_norm": 0.5057214443895521, "learning_rate": 8.997256862316678e-06, "loss": 0.0481, "step": 1778 }, { "epoch": 0.8544668587896254, "grad_norm": 0.3751078985501239, "learning_rate": 8.995577487428187e-06, "loss": 0.0321, "step": 1779 }, { "epoch": 0.8549471661863592, "grad_norm": 0.8942585028670543, "learning_rate": 8.993896864431825e-06, "loss": 0.0442, "step": 1780 }, { "epoch": 0.8554274735830932, "grad_norm": 0.537990863386714, "learning_rate": 8.992214993852576e-06, "loss": 0.0385, "step": 1781 }, { "epoch": 0.8559077809798271, "grad_norm": 0.3368917295945931, "learning_rate": 8.99053187621581e-06, "loss": 0.0358, "step": 1782 }, { "epoch": 0.856388088376561, "grad_norm": 0.3625643306114823, "learning_rate": 8.988847512047285e-06, "loss": 0.0359, "step": 1783 }, { "epoch": 0.8568683957732949, "grad_norm": 0.5392108809967152, "learning_rate": 8.98716190187315e-06, "loss": 0.0445, "step": 1784 }, { "epoch": 0.8573487031700289, "grad_norm": 0.5998276931275567, "learning_rate": 8.985475046219942e-06, "loss": 0.0443, "step": 1785 }, { "epoch": 0.8578290105667628, "grad_norm": 0.5131152759715736, "learning_rate": 8.983786945614589e-06, "loss": 0.0387, "step": 1786 }, { "epoch": 0.8583093179634966, "grad_norm": 0.6910401242120892, "learning_rate": 8.98209760058441e-06, "loss": 0.0469, "step": 1787 }, { "epoch": 0.8587896253602305, "grad_norm": 0.4194901862760443, "learning_rate": 8.980407011657103e-06, "loss": 0.0403, "step": 1788 }, { "epoch": 0.8592699327569645, "grad_norm": 0.46623807550964225, "learning_rate": 8.978715179360767e-06, "loss": 0.0392, "step": 1789 }, { "epoch": 0.8597502401536984, "grad_norm": 0.44859136364628743, "learning_rate": 8.97702210422388e-06, "loss": 0.0458, "step": 1790 }, { "epoch": 0.8602305475504323, "grad_norm": 0.6257891336546498, "learning_rate": 8.975327786775316e-06, "loss": 0.0366, "step": 1791 }, { "epoch": 0.8607108549471661, "grad_norm": 0.4736431571435201, "learning_rate": 8.973632227544326e-06, "loss": 0.0433, "step": 1792 }, { "epoch": 0.8611911623439001, "grad_norm": 0.4152452493682378, "learning_rate": 8.971935427060563e-06, "loss": 0.0405, "step": 1793 }, { "epoch": 0.861671469740634, "grad_norm": 0.9662287838754176, "learning_rate": 8.970237385854059e-06, "loss": 0.0583, "step": 1794 }, { "epoch": 0.8621517771373679, "grad_norm": 0.5056226800498823, "learning_rate": 8.96853810445523e-06, "loss": 0.0476, "step": 1795 }, { "epoch": 0.8626320845341018, "grad_norm": 0.5166750113936333, "learning_rate": 8.966837583394891e-06, "loss": 0.0434, "step": 1796 }, { "epoch": 0.8631123919308358, "grad_norm": 0.5477224778262247, "learning_rate": 8.965135823204232e-06, "loss": 0.0416, "step": 1797 }, { "epoch": 0.8635926993275697, "grad_norm": 0.3837910069912173, "learning_rate": 8.963432824414842e-06, "loss": 0.0268, "step": 1798 }, { "epoch": 0.8640730067243035, "grad_norm": 0.750187104754067, "learning_rate": 8.961728587558684e-06, "loss": 0.0514, "step": 1799 }, { "epoch": 0.8645533141210374, "grad_norm": 0.5116240081446706, "learning_rate": 8.96002311316812e-06, "loss": 0.0362, "step": 1800 }, { "epoch": 0.8650336215177714, "grad_norm": 0.5452866982844962, "learning_rate": 8.95831640177589e-06, "loss": 0.0412, "step": 1801 }, { "epoch": 0.8655139289145053, "grad_norm": 0.3893421475161622, "learning_rate": 8.956608453915126e-06, "loss": 0.0422, "step": 1802 }, { "epoch": 0.8659942363112392, "grad_norm": 0.400744042269121, "learning_rate": 8.954899270119339e-06, "loss": 0.0442, "step": 1803 }, { "epoch": 0.866474543707973, "grad_norm": 0.8459931991973275, "learning_rate": 8.953188850922436e-06, "loss": 0.0319, "step": 1804 }, { "epoch": 0.866954851104707, "grad_norm": 0.4769183729256473, "learning_rate": 8.951477196858703e-06, "loss": 0.0392, "step": 1805 }, { "epoch": 0.8674351585014409, "grad_norm": 0.5961258504118155, "learning_rate": 8.94976430846281e-06, "loss": 0.044, "step": 1806 }, { "epoch": 0.8679154658981748, "grad_norm": 0.7889199748787529, "learning_rate": 8.94805018626982e-06, "loss": 0.0554, "step": 1807 }, { "epoch": 0.8683957732949087, "grad_norm": 0.6776357232960094, "learning_rate": 8.946334830815176e-06, "loss": 0.0474, "step": 1808 }, { "epoch": 0.8688760806916427, "grad_norm": 0.5294621498199231, "learning_rate": 8.944618242634707e-06, "loss": 0.0441, "step": 1809 }, { "epoch": 0.8693563880883766, "grad_norm": 0.40665428772649925, "learning_rate": 8.942900422264627e-06, "loss": 0.0293, "step": 1810 }, { "epoch": 0.8698366954851104, "grad_norm": 0.5854731121778896, "learning_rate": 8.941181370241538e-06, "loss": 0.0472, "step": 1811 }, { "epoch": 0.8703170028818443, "grad_norm": 0.514195838759812, "learning_rate": 8.939461087102424e-06, "loss": 0.0396, "step": 1812 }, { "epoch": 0.8707973102785783, "grad_norm": 0.6456083251358632, "learning_rate": 8.937739573384653e-06, "loss": 0.048, "step": 1813 }, { "epoch": 0.8712776176753122, "grad_norm": 0.40027049247918695, "learning_rate": 8.936016829625977e-06, "loss": 0.0373, "step": 1814 }, { "epoch": 0.8717579250720461, "grad_norm": 1.4781590167762066, "learning_rate": 8.934292856364535e-06, "loss": 0.0706, "step": 1815 }, { "epoch": 0.8722382324687801, "grad_norm": 0.5618715192739092, "learning_rate": 8.932567654138849e-06, "loss": 0.0523, "step": 1816 }, { "epoch": 0.872718539865514, "grad_norm": 0.43422019108974075, "learning_rate": 8.930841223487823e-06, "loss": 0.0464, "step": 1817 }, { "epoch": 0.8731988472622478, "grad_norm": 0.3876967201171116, "learning_rate": 8.929113564950746e-06, "loss": 0.0278, "step": 1818 }, { "epoch": 0.8736791546589817, "grad_norm": 0.46720060258300933, "learning_rate": 8.927384679067293e-06, "loss": 0.0402, "step": 1819 }, { "epoch": 0.8741594620557157, "grad_norm": 0.519410545548104, "learning_rate": 8.925654566377519e-06, "loss": 0.0541, "step": 1820 }, { "epoch": 0.8746397694524496, "grad_norm": 1.022650569670746, "learning_rate": 8.923923227421862e-06, "loss": 0.0564, "step": 1821 }, { "epoch": 0.8751200768491835, "grad_norm": 0.358444548188737, "learning_rate": 8.922190662741146e-06, "loss": 0.0352, "step": 1822 }, { "epoch": 0.8756003842459174, "grad_norm": 0.6535408000392346, "learning_rate": 8.920456872876575e-06, "loss": 0.0679, "step": 1823 }, { "epoch": 0.8760806916426513, "grad_norm": 0.3811923244253041, "learning_rate": 8.918721858369738e-06, "loss": 0.0495, "step": 1824 }, { "epoch": 0.8765609990393852, "grad_norm": 0.502122776772645, "learning_rate": 8.916985619762605e-06, "loss": 0.0371, "step": 1825 }, { "epoch": 0.8770413064361191, "grad_norm": 1.0483402982818049, "learning_rate": 8.915248157597529e-06, "loss": 0.0655, "step": 1826 }, { "epoch": 0.877521613832853, "grad_norm": 0.45223694971335204, "learning_rate": 8.913509472417246e-06, "loss": 0.0311, "step": 1827 }, { "epoch": 0.878001921229587, "grad_norm": 0.4508039587720194, "learning_rate": 8.91176956476487e-06, "loss": 0.0436, "step": 1828 }, { "epoch": 0.8784822286263209, "grad_norm": 0.4562648581626756, "learning_rate": 8.910028435183906e-06, "loss": 0.0358, "step": 1829 }, { "epoch": 0.8789625360230547, "grad_norm": 0.3635207577181714, "learning_rate": 8.90828608421823e-06, "loss": 0.0357, "step": 1830 }, { "epoch": 0.8794428434197886, "grad_norm": 0.5209884592558325, "learning_rate": 8.906542512412105e-06, "loss": 0.0478, "step": 1831 }, { "epoch": 0.8799231508165226, "grad_norm": 0.37857114644619005, "learning_rate": 8.904797720310176e-06, "loss": 0.045, "step": 1832 }, { "epoch": 0.8804034582132565, "grad_norm": 0.8142054557917969, "learning_rate": 8.903051708457465e-06, "loss": 0.0376, "step": 1833 }, { "epoch": 0.8808837656099904, "grad_norm": 0.7396298749741087, "learning_rate": 8.90130447739938e-06, "loss": 0.0589, "step": 1834 }, { "epoch": 0.8813640730067243, "grad_norm": 0.33453688614785887, "learning_rate": 8.899556027681708e-06, "loss": 0.0231, "step": 1835 }, { "epoch": 0.8818443804034583, "grad_norm": 0.4457358953894517, "learning_rate": 8.897806359850614e-06, "loss": 0.0317, "step": 1836 }, { "epoch": 0.8823246878001921, "grad_norm": 1.3515020556940829, "learning_rate": 8.896055474452649e-06, "loss": 0.0427, "step": 1837 }, { "epoch": 0.882804995196926, "grad_norm": 0.4580165993160693, "learning_rate": 8.894303372034738e-06, "loss": 0.0433, "step": 1838 }, { "epoch": 0.8832853025936599, "grad_norm": 0.4503178993927309, "learning_rate": 8.89255005314419e-06, "loss": 0.0373, "step": 1839 }, { "epoch": 0.8837656099903939, "grad_norm": 0.4368170227289143, "learning_rate": 8.890795518328698e-06, "loss": 0.0461, "step": 1840 }, { "epoch": 0.8842459173871278, "grad_norm": 0.5359261995070854, "learning_rate": 8.889039768136323e-06, "loss": 0.0633, "step": 1841 }, { "epoch": 0.8847262247838616, "grad_norm": 0.32753604552230753, "learning_rate": 8.887282803115518e-06, "loss": 0.031, "step": 1842 }, { "epoch": 0.8852065321805955, "grad_norm": 0.561984541895264, "learning_rate": 8.885524623815107e-06, "loss": 0.061, "step": 1843 }, { "epoch": 0.8856868395773295, "grad_norm": 0.4954923412229821, "learning_rate": 8.883765230784298e-06, "loss": 0.0376, "step": 1844 }, { "epoch": 0.8861671469740634, "grad_norm": 0.40554156267027097, "learning_rate": 8.882004624572676e-06, "loss": 0.0286, "step": 1845 }, { "epoch": 0.8866474543707973, "grad_norm": 0.5113849488787681, "learning_rate": 8.880242805730208e-06, "loss": 0.0426, "step": 1846 }, { "epoch": 0.8871277617675313, "grad_norm": 0.37027022381367386, "learning_rate": 8.878479774807235e-06, "loss": 0.0295, "step": 1847 }, { "epoch": 0.8876080691642652, "grad_norm": 0.6769968021961212, "learning_rate": 8.876715532354478e-06, "loss": 0.0452, "step": 1848 }, { "epoch": 0.888088376560999, "grad_norm": 0.48737302282911404, "learning_rate": 8.87495007892304e-06, "loss": 0.0445, "step": 1849 }, { "epoch": 0.8885686839577329, "grad_norm": 0.3386523878498041, "learning_rate": 8.873183415064401e-06, "loss": 0.0333, "step": 1850 }, { "epoch": 0.8890489913544669, "grad_norm": 0.6658697028186209, "learning_rate": 8.871415541330417e-06, "loss": 0.0343, "step": 1851 }, { "epoch": 0.8895292987512008, "grad_norm": 0.4919802717601218, "learning_rate": 8.86964645827332e-06, "loss": 0.0455, "step": 1852 }, { "epoch": 0.8900096061479347, "grad_norm": 0.4263226706660005, "learning_rate": 8.867876166445724e-06, "loss": 0.0418, "step": 1853 }, { "epoch": 0.8904899135446686, "grad_norm": 0.3214256914507718, "learning_rate": 8.86610466640062e-06, "loss": 0.0327, "step": 1854 }, { "epoch": 0.8909702209414025, "grad_norm": 0.6264841255471314, "learning_rate": 8.864331958691377e-06, "loss": 0.0305, "step": 1855 }, { "epoch": 0.8914505283381364, "grad_norm": 1.2829456922872837, "learning_rate": 8.862558043871737e-06, "loss": 0.0585, "step": 1856 }, { "epoch": 0.8919308357348703, "grad_norm": 0.5015853830955843, "learning_rate": 8.860782922495821e-06, "loss": 0.0479, "step": 1857 }, { "epoch": 0.8924111431316042, "grad_norm": 0.6658604832983488, "learning_rate": 8.859006595118133e-06, "loss": 0.0485, "step": 1858 }, { "epoch": 0.8928914505283382, "grad_norm": 0.47639098590814116, "learning_rate": 8.857229062293544e-06, "loss": 0.0388, "step": 1859 }, { "epoch": 0.8933717579250721, "grad_norm": 0.4302098869706598, "learning_rate": 8.855450324577308e-06, "loss": 0.0482, "step": 1860 }, { "epoch": 0.8938520653218059, "grad_norm": 0.6482456313500351, "learning_rate": 8.853670382525052e-06, "loss": 0.0436, "step": 1861 }, { "epoch": 0.8943323727185398, "grad_norm": 0.47660614333387596, "learning_rate": 8.851889236692783e-06, "loss": 0.046, "step": 1862 }, { "epoch": 0.8948126801152738, "grad_norm": 0.6952319224454124, "learning_rate": 8.85010688763688e-06, "loss": 0.0495, "step": 1863 }, { "epoch": 0.8952929875120077, "grad_norm": 0.4901487858015449, "learning_rate": 8.8483233359141e-06, "loss": 0.0358, "step": 1864 }, { "epoch": 0.8957732949087416, "grad_norm": 0.5728646775595035, "learning_rate": 8.846538582081575e-06, "loss": 0.0435, "step": 1865 }, { "epoch": 0.8962536023054755, "grad_norm": 0.44725468456391415, "learning_rate": 8.84475262669681e-06, "loss": 0.0443, "step": 1866 }, { "epoch": 0.8967339097022095, "grad_norm": 0.5773271593291803, "learning_rate": 8.842965470317694e-06, "loss": 0.0424, "step": 1867 }, { "epoch": 0.8972142170989433, "grad_norm": 0.7198849042715714, "learning_rate": 8.84117711350248e-06, "loss": 0.0468, "step": 1868 }, { "epoch": 0.8976945244956772, "grad_norm": 0.5957419746315502, "learning_rate": 8.839387556809805e-06, "loss": 0.0468, "step": 1869 }, { "epoch": 0.8981748318924111, "grad_norm": 0.588750525644505, "learning_rate": 8.837596800798674e-06, "loss": 0.0464, "step": 1870 }, { "epoch": 0.8986551392891451, "grad_norm": 0.4505825678549945, "learning_rate": 8.835804846028473e-06, "loss": 0.0428, "step": 1871 }, { "epoch": 0.899135446685879, "grad_norm": 0.6871356390036418, "learning_rate": 8.834011693058955e-06, "loss": 0.0693, "step": 1872 }, { "epoch": 0.8996157540826129, "grad_norm": 0.4898107165484671, "learning_rate": 8.832217342450253e-06, "loss": 0.0425, "step": 1873 }, { "epoch": 0.9000960614793467, "grad_norm": 0.5624738413790371, "learning_rate": 8.830421794762873e-06, "loss": 0.044, "step": 1874 }, { "epoch": 0.9005763688760807, "grad_norm": 0.5195088744781344, "learning_rate": 8.828625050557695e-06, "loss": 0.0606, "step": 1875 }, { "epoch": 0.9010566762728146, "grad_norm": 0.38938511927268654, "learning_rate": 8.826827110395973e-06, "loss": 0.0336, "step": 1876 }, { "epoch": 0.9015369836695485, "grad_norm": 0.38684793123655187, "learning_rate": 8.82502797483933e-06, "loss": 0.0375, "step": 1877 }, { "epoch": 0.9020172910662824, "grad_norm": 0.7363107702433455, "learning_rate": 8.823227644449767e-06, "loss": 0.0465, "step": 1878 }, { "epoch": 0.9024975984630164, "grad_norm": 0.48895156850885646, "learning_rate": 8.821426119789662e-06, "loss": 0.0483, "step": 1879 }, { "epoch": 0.9029779058597502, "grad_norm": 0.660241990808151, "learning_rate": 8.819623401421757e-06, "loss": 0.0476, "step": 1880 }, { "epoch": 0.9034582132564841, "grad_norm": 0.46543854311143706, "learning_rate": 8.817819489909172e-06, "loss": 0.0403, "step": 1881 }, { "epoch": 0.9039385206532181, "grad_norm": 0.6017118184027497, "learning_rate": 8.8160143858154e-06, "loss": 0.0478, "step": 1882 }, { "epoch": 0.904418828049952, "grad_norm": 0.39226105718412113, "learning_rate": 8.814208089704306e-06, "loss": 0.0418, "step": 1883 }, { "epoch": 0.9048991354466859, "grad_norm": 0.4599436630675356, "learning_rate": 8.812400602140125e-06, "loss": 0.0468, "step": 1884 }, { "epoch": 0.9053794428434198, "grad_norm": 0.8681683906823245, "learning_rate": 8.810591923687468e-06, "loss": 0.0473, "step": 1885 }, { "epoch": 0.9058597502401537, "grad_norm": 0.7322132140172354, "learning_rate": 8.808782054911315e-06, "loss": 0.0662, "step": 1886 }, { "epoch": 0.9063400576368876, "grad_norm": 0.6055607958778498, "learning_rate": 8.806970996377018e-06, "loss": 0.0461, "step": 1887 }, { "epoch": 0.9068203650336215, "grad_norm": 0.4352135953438638, "learning_rate": 8.805158748650304e-06, "loss": 0.0443, "step": 1888 }, { "epoch": 0.9073006724303554, "grad_norm": 0.8207849567629364, "learning_rate": 8.803345312297269e-06, "loss": 0.0686, "step": 1889 }, { "epoch": 0.9077809798270894, "grad_norm": 0.4467690737669225, "learning_rate": 8.801530687884378e-06, "loss": 0.0532, "step": 1890 }, { "epoch": 0.9082612872238233, "grad_norm": 0.5198478158932659, "learning_rate": 8.799714875978472e-06, "loss": 0.0419, "step": 1891 }, { "epoch": 0.9087415946205571, "grad_norm": 1.0896879533925696, "learning_rate": 8.797897877146757e-06, "loss": 0.0734, "step": 1892 }, { "epoch": 0.909221902017291, "grad_norm": 0.3617708509543011, "learning_rate": 8.796079691956818e-06, "loss": 0.0327, "step": 1893 }, { "epoch": 0.909702209414025, "grad_norm": 0.7301142094909887, "learning_rate": 8.794260320976602e-06, "loss": 0.0392, "step": 1894 }, { "epoch": 0.9101825168107589, "grad_norm": 0.4068531645359285, "learning_rate": 8.79243976477443e-06, "loss": 0.0469, "step": 1895 }, { "epoch": 0.9106628242074928, "grad_norm": 0.48618429718856376, "learning_rate": 8.790618023918995e-06, "loss": 0.0558, "step": 1896 }, { "epoch": 0.9111431316042267, "grad_norm": 0.5431200818159274, "learning_rate": 8.788795098979358e-06, "loss": 0.0515, "step": 1897 }, { "epoch": 0.9116234390009607, "grad_norm": 0.6076681250674426, "learning_rate": 8.786970990524952e-06, "loss": 0.043, "step": 1898 }, { "epoch": 0.9121037463976945, "grad_norm": 0.4574042135859862, "learning_rate": 8.785145699125577e-06, "loss": 0.0468, "step": 1899 }, { "epoch": 0.9125840537944284, "grad_norm": 0.7075435238401119, "learning_rate": 8.783319225351401e-06, "loss": 0.0464, "step": 1900 }, { "epoch": 0.9130643611911623, "grad_norm": 0.534633281625552, "learning_rate": 8.781491569772966e-06, "loss": 0.0426, "step": 1901 }, { "epoch": 0.9135446685878963, "grad_norm": 0.3921046587973985, "learning_rate": 8.77966273296118e-06, "loss": 0.0358, "step": 1902 }, { "epoch": 0.9140249759846302, "grad_norm": 0.41990313919226124, "learning_rate": 8.777832715487325e-06, "loss": 0.0285, "step": 1903 }, { "epoch": 0.914505283381364, "grad_norm": 0.41717379390801274, "learning_rate": 8.776001517923042e-06, "loss": 0.0363, "step": 1904 }, { "epoch": 0.9149855907780979, "grad_norm": 0.3366938255497245, "learning_rate": 8.774169140840349e-06, "loss": 0.0374, "step": 1905 }, { "epoch": 0.9154658981748319, "grad_norm": 0.36457948244141064, "learning_rate": 8.772335584811631e-06, "loss": 0.0299, "step": 1906 }, { "epoch": 0.9159462055715658, "grad_norm": 0.4828707338456646, "learning_rate": 8.770500850409641e-06, "loss": 0.0331, "step": 1907 }, { "epoch": 0.9164265129682997, "grad_norm": 0.5966763216669441, "learning_rate": 8.768664938207494e-06, "loss": 0.0498, "step": 1908 }, { "epoch": 0.9169068203650336, "grad_norm": 0.3927338383436385, "learning_rate": 8.766827848778683e-06, "loss": 0.0338, "step": 1909 }, { "epoch": 0.9173871277617676, "grad_norm": 0.5893155428418285, "learning_rate": 8.764989582697065e-06, "loss": 0.0411, "step": 1910 }, { "epoch": 0.9178674351585014, "grad_norm": 0.5356691788194019, "learning_rate": 8.763150140536858e-06, "loss": 0.0398, "step": 1911 }, { "epoch": 0.9183477425552353, "grad_norm": 0.4306294724075641, "learning_rate": 8.761309522872657e-06, "loss": 0.0403, "step": 1912 }, { "epoch": 0.9188280499519692, "grad_norm": 0.4226012125799819, "learning_rate": 8.75946773027942e-06, "loss": 0.0353, "step": 1913 }, { "epoch": 0.9193083573487032, "grad_norm": 0.4136981494543946, "learning_rate": 8.75762476333247e-06, "loss": 0.0362, "step": 1914 }, { "epoch": 0.9197886647454371, "grad_norm": 0.6346827624212021, "learning_rate": 8.755780622607499e-06, "loss": 0.0445, "step": 1915 }, { "epoch": 0.920268972142171, "grad_norm": 0.558450568203092, "learning_rate": 8.753935308680568e-06, "loss": 0.043, "step": 1916 }, { "epoch": 0.920749279538905, "grad_norm": 0.43965358503774565, "learning_rate": 8.7520888221281e-06, "loss": 0.0495, "step": 1917 }, { "epoch": 0.9212295869356388, "grad_norm": 0.5476700265664696, "learning_rate": 8.750241163526887e-06, "loss": 0.0347, "step": 1918 }, { "epoch": 0.9217098943323727, "grad_norm": 0.39304076802918375, "learning_rate": 8.748392333454085e-06, "loss": 0.0349, "step": 1919 }, { "epoch": 0.9221902017291066, "grad_norm": 0.4971949825044665, "learning_rate": 8.74654233248722e-06, "loss": 0.0395, "step": 1920 }, { "epoch": 0.9226705091258406, "grad_norm": 0.5186534483519105, "learning_rate": 8.74469116120418e-06, "loss": 0.0307, "step": 1921 }, { "epoch": 0.9231508165225745, "grad_norm": 0.3224287987538711, "learning_rate": 8.742838820183218e-06, "loss": 0.024, "step": 1922 }, { "epoch": 0.9236311239193083, "grad_norm": 0.4280132539326167, "learning_rate": 8.740985310002956e-06, "loss": 0.0417, "step": 1923 }, { "epoch": 0.9241114313160422, "grad_norm": 0.41045791987815555, "learning_rate": 8.739130631242379e-06, "loss": 0.0359, "step": 1924 }, { "epoch": 0.9245917387127762, "grad_norm": 0.5184741851396398, "learning_rate": 8.737274784480839e-06, "loss": 0.0563, "step": 1925 }, { "epoch": 0.9250720461095101, "grad_norm": 0.31914533082950014, "learning_rate": 8.735417770298046e-06, "loss": 0.0299, "step": 1926 }, { "epoch": 0.925552353506244, "grad_norm": 0.5115763990615543, "learning_rate": 8.733559589274086e-06, "loss": 0.0495, "step": 1927 }, { "epoch": 0.9260326609029779, "grad_norm": 0.4761196913567422, "learning_rate": 8.731700241989398e-06, "loss": 0.0445, "step": 1928 }, { "epoch": 0.9265129682997119, "grad_norm": 0.551401916157282, "learning_rate": 8.729839729024794e-06, "loss": 0.036, "step": 1929 }, { "epoch": 0.9269932756964457, "grad_norm": 0.3446484842092706, "learning_rate": 8.727978050961446e-06, "loss": 0.036, "step": 1930 }, { "epoch": 0.9274735830931796, "grad_norm": 0.3994863651754855, "learning_rate": 8.726115208380892e-06, "loss": 0.0445, "step": 1931 }, { "epoch": 0.9279538904899135, "grad_norm": 0.39412495064642333, "learning_rate": 8.724251201865029e-06, "loss": 0.0495, "step": 1932 }, { "epoch": 0.9284341978866475, "grad_norm": 0.6031610926197576, "learning_rate": 8.722386031996124e-06, "loss": 0.0532, "step": 1933 }, { "epoch": 0.9289145052833814, "grad_norm": 0.4130746229906913, "learning_rate": 8.720519699356804e-06, "loss": 0.0438, "step": 1934 }, { "epoch": 0.9293948126801153, "grad_norm": 0.4945763714239433, "learning_rate": 8.71865220453006e-06, "loss": 0.0537, "step": 1935 }, { "epoch": 0.9298751200768491, "grad_norm": 0.6642151852115892, "learning_rate": 8.716783548099243e-06, "loss": 0.0305, "step": 1936 }, { "epoch": 0.9303554274735831, "grad_norm": 0.4808430941047988, "learning_rate": 8.714913730648073e-06, "loss": 0.0383, "step": 1937 }, { "epoch": 0.930835734870317, "grad_norm": 0.38784696307406447, "learning_rate": 8.713042752760629e-06, "loss": 0.0427, "step": 1938 }, { "epoch": 0.9313160422670509, "grad_norm": 0.46691293249094495, "learning_rate": 8.71117061502135e-06, "loss": 0.037, "step": 1939 }, { "epoch": 0.9317963496637848, "grad_norm": 0.528772802563039, "learning_rate": 8.709297318015042e-06, "loss": 0.0413, "step": 1940 }, { "epoch": 0.9322766570605188, "grad_norm": 0.49227721232250315, "learning_rate": 8.707422862326872e-06, "loss": 0.0359, "step": 1941 }, { "epoch": 0.9327569644572526, "grad_norm": 0.47166989196463194, "learning_rate": 8.705547248542366e-06, "loss": 0.0362, "step": 1942 }, { "epoch": 0.9332372718539865, "grad_norm": 0.5256984649221889, "learning_rate": 8.703670477247415e-06, "loss": 0.0483, "step": 1943 }, { "epoch": 0.9337175792507204, "grad_norm": 0.4307455296849523, "learning_rate": 8.701792549028269e-06, "loss": 0.0487, "step": 1944 }, { "epoch": 0.9341978866474544, "grad_norm": 0.46947997670245323, "learning_rate": 8.699913464471543e-06, "loss": 0.0441, "step": 1945 }, { "epoch": 0.9346781940441883, "grad_norm": 0.5890864790257363, "learning_rate": 8.69803322416421e-06, "loss": 0.0629, "step": 1946 }, { "epoch": 0.9351585014409222, "grad_norm": 0.380486982939282, "learning_rate": 8.696151828693606e-06, "loss": 0.0367, "step": 1947 }, { "epoch": 0.9356388088376562, "grad_norm": 0.42021576660117066, "learning_rate": 8.694269278647425e-06, "loss": 0.0379, "step": 1948 }, { "epoch": 0.93611911623439, "grad_norm": 0.41391650206618286, "learning_rate": 8.692385574613725e-06, "loss": 0.0365, "step": 1949 }, { "epoch": 0.9365994236311239, "grad_norm": 0.6058118454984688, "learning_rate": 8.690500717180924e-06, "loss": 0.065, "step": 1950 }, { "epoch": 0.9370797310278578, "grad_norm": 0.4144008472750772, "learning_rate": 8.688614706937794e-06, "loss": 0.0337, "step": 1951 }, { "epoch": 0.9375600384245918, "grad_norm": 0.46837940872225015, "learning_rate": 8.68672754447348e-06, "loss": 0.0539, "step": 1952 }, { "epoch": 0.9380403458213257, "grad_norm": 0.4192145916135169, "learning_rate": 8.684839230377475e-06, "loss": 0.0324, "step": 1953 }, { "epoch": 0.9385206532180596, "grad_norm": 0.5489777561757881, "learning_rate": 8.682949765239636e-06, "loss": 0.0357, "step": 1954 }, { "epoch": 0.9390009606147934, "grad_norm": 0.34093817269965004, "learning_rate": 8.681059149650181e-06, "loss": 0.0268, "step": 1955 }, { "epoch": 0.9394812680115274, "grad_norm": 0.395654696038833, "learning_rate": 8.679167384199686e-06, "loss": 0.0286, "step": 1956 }, { "epoch": 0.9399615754082613, "grad_norm": 0.2944331212675771, "learning_rate": 8.677274469479083e-06, "loss": 0.0263, "step": 1957 }, { "epoch": 0.9404418828049952, "grad_norm": 0.6932183759828536, "learning_rate": 8.67538040607967e-06, "loss": 0.0459, "step": 1958 }, { "epoch": 0.9409221902017291, "grad_norm": 0.9103586040099304, "learning_rate": 8.6734851945931e-06, "loss": 0.046, "step": 1959 }, { "epoch": 0.9414024975984631, "grad_norm": 0.4687709996696501, "learning_rate": 8.671588835611381e-06, "loss": 0.0448, "step": 1960 }, { "epoch": 0.9418828049951969, "grad_norm": 0.9039414492489983, "learning_rate": 8.669691329726888e-06, "loss": 0.0412, "step": 1961 }, { "epoch": 0.9423631123919308, "grad_norm": 0.5555653981562542, "learning_rate": 8.667792677532346e-06, "loss": 0.0339, "step": 1962 }, { "epoch": 0.9428434197886647, "grad_norm": 0.444243469483319, "learning_rate": 8.665892879620843e-06, "loss": 0.0427, "step": 1963 }, { "epoch": 0.9433237271853987, "grad_norm": 0.342635961775897, "learning_rate": 8.663991936585821e-06, "loss": 0.0337, "step": 1964 }, { "epoch": 0.9438040345821326, "grad_norm": 0.6397758038332509, "learning_rate": 8.662089849021086e-06, "loss": 0.0505, "step": 1965 }, { "epoch": 0.9442843419788665, "grad_norm": 0.5964279866861497, "learning_rate": 8.660186617520792e-06, "loss": 0.0358, "step": 1966 }, { "epoch": 0.9447646493756003, "grad_norm": 0.5659109590321414, "learning_rate": 8.658282242679461e-06, "loss": 0.0418, "step": 1967 }, { "epoch": 0.9452449567723343, "grad_norm": 0.506925916793818, "learning_rate": 8.656376725091965e-06, "loss": 0.0417, "step": 1968 }, { "epoch": 0.9457252641690682, "grad_norm": 0.48404249293897045, "learning_rate": 8.654470065353535e-06, "loss": 0.0429, "step": 1969 }, { "epoch": 0.9462055715658021, "grad_norm": 1.6891950338293646, "learning_rate": 8.652562264059758e-06, "loss": 0.0376, "step": 1970 }, { "epoch": 0.946685878962536, "grad_norm": 1.4094881449650138, "learning_rate": 8.65065332180658e-06, "loss": 0.0351, "step": 1971 }, { "epoch": 0.94716618635927, "grad_norm": 0.5177625328838332, "learning_rate": 8.6487432391903e-06, "loss": 0.0343, "step": 1972 }, { "epoch": 0.9476464937560038, "grad_norm": 0.5743763929907509, "learning_rate": 8.646832016807576e-06, "loss": 0.0366, "step": 1973 }, { "epoch": 0.9481268011527377, "grad_norm": 1.2258062310893443, "learning_rate": 8.644919655255421e-06, "loss": 0.0508, "step": 1974 }, { "epoch": 0.9486071085494716, "grad_norm": 0.9221165067101089, "learning_rate": 8.643006155131204e-06, "loss": 0.0673, "step": 1975 }, { "epoch": 0.9490874159462056, "grad_norm": 0.6966238615893148, "learning_rate": 8.641091517032648e-06, "loss": 0.0738, "step": 1976 }, { "epoch": 0.9495677233429395, "grad_norm": 0.4344930032517037, "learning_rate": 8.639175741557835e-06, "loss": 0.0326, "step": 1977 }, { "epoch": 0.9500480307396734, "grad_norm": 0.7423186859550626, "learning_rate": 8.6372588293052e-06, "loss": 0.0446, "step": 1978 }, { "epoch": 0.9505283381364072, "grad_norm": 0.4731476879098025, "learning_rate": 8.635340780873531e-06, "loss": 0.0353, "step": 1979 }, { "epoch": 0.9510086455331412, "grad_norm": 0.6600870526226297, "learning_rate": 8.633421596861977e-06, "loss": 0.0569, "step": 1980 }, { "epoch": 0.9514889529298751, "grad_norm": 0.5036533651424044, "learning_rate": 8.631501277870034e-06, "loss": 0.0406, "step": 1981 }, { "epoch": 0.951969260326609, "grad_norm": 1.3169404498677184, "learning_rate": 8.62957982449756e-06, "loss": 0.0786, "step": 1982 }, { "epoch": 0.952449567723343, "grad_norm": 0.7672144324242122, "learning_rate": 8.627657237344762e-06, "loss": 0.0437, "step": 1983 }, { "epoch": 0.9529298751200769, "grad_norm": 0.41972977550479074, "learning_rate": 8.625733517012202e-06, "loss": 0.0477, "step": 1984 }, { "epoch": 0.9534101825168108, "grad_norm": 0.507878350787633, "learning_rate": 8.6238086641008e-06, "loss": 0.0367, "step": 1985 }, { "epoch": 0.9538904899135446, "grad_norm": 0.516358838359933, "learning_rate": 8.621882679211826e-06, "loss": 0.0478, "step": 1986 }, { "epoch": 0.9543707973102786, "grad_norm": 0.3726975644915236, "learning_rate": 8.619955562946902e-06, "loss": 0.0341, "step": 1987 }, { "epoch": 0.9548511047070125, "grad_norm": 0.5774338343855023, "learning_rate": 8.618027315908009e-06, "loss": 0.0454, "step": 1988 }, { "epoch": 0.9553314121037464, "grad_norm": 0.710615004605156, "learning_rate": 8.616097938697476e-06, "loss": 0.0448, "step": 1989 }, { "epoch": 0.9558117195004803, "grad_norm": 0.9510665106194851, "learning_rate": 8.614167431917986e-06, "loss": 0.0543, "step": 1990 }, { "epoch": 0.9562920268972143, "grad_norm": 0.4147522768624343, "learning_rate": 8.612235796172579e-06, "loss": 0.0328, "step": 1991 }, { "epoch": 0.9567723342939481, "grad_norm": 0.5983476792554373, "learning_rate": 8.610303032064642e-06, "loss": 0.0606, "step": 1992 }, { "epoch": 0.957252641690682, "grad_norm": 0.3768025246044265, "learning_rate": 8.60836914019792e-06, "loss": 0.0323, "step": 1993 }, { "epoch": 0.9577329490874159, "grad_norm": 0.5553521298460753, "learning_rate": 8.606434121176504e-06, "loss": 0.0502, "step": 1994 }, { "epoch": 0.9582132564841499, "grad_norm": 0.6171063653286382, "learning_rate": 8.60449797560484e-06, "loss": 0.0505, "step": 1995 }, { "epoch": 0.9586935638808838, "grad_norm": 0.5729885075449906, "learning_rate": 8.60256070408773e-06, "loss": 0.0534, "step": 1996 }, { "epoch": 0.9591738712776177, "grad_norm": 0.4954254466131748, "learning_rate": 8.600622307230323e-06, "loss": 0.0339, "step": 1997 }, { "epoch": 0.9596541786743515, "grad_norm": 0.36497637328284854, "learning_rate": 8.598682785638119e-06, "loss": 0.0293, "step": 1998 }, { "epoch": 0.9601344860710855, "grad_norm": 0.5588614544226611, "learning_rate": 8.59674213991697e-06, "loss": 0.0497, "step": 1999 }, { "epoch": 0.9606147934678194, "grad_norm": 0.34528031432849365, "learning_rate": 8.594800370673083e-06, "loss": 0.032, "step": 2000 }, { "epoch": 0.9610951008645533, "grad_norm": 0.6906177073677481, "learning_rate": 8.592857478513011e-06, "loss": 0.0485, "step": 2001 }, { "epoch": 0.9615754082612872, "grad_norm": 0.3546432685965284, "learning_rate": 8.590913464043661e-06, "loss": 0.0297, "step": 2002 }, { "epoch": 0.9620557156580212, "grad_norm": 0.7379863818566214, "learning_rate": 8.58896832787229e-06, "loss": 0.0551, "step": 2003 }, { "epoch": 0.962536023054755, "grad_norm": 0.43001383506184815, "learning_rate": 8.5870220706065e-06, "loss": 0.0345, "step": 2004 }, { "epoch": 0.9630163304514889, "grad_norm": 0.40952514169054416, "learning_rate": 8.585074692854254e-06, "loss": 0.0387, "step": 2005 }, { "epoch": 0.9634966378482228, "grad_norm": 0.5621926001572276, "learning_rate": 8.583126195223854e-06, "loss": 0.0453, "step": 2006 }, { "epoch": 0.9639769452449568, "grad_norm": 0.6202940152268431, "learning_rate": 8.581176578323962e-06, "loss": 0.0455, "step": 2007 }, { "epoch": 0.9644572526416907, "grad_norm": 0.4253832733284014, "learning_rate": 8.579225842763578e-06, "loss": 0.038, "step": 2008 }, { "epoch": 0.9649375600384246, "grad_norm": 0.5301193530668448, "learning_rate": 8.577273989152063e-06, "loss": 0.045, "step": 2009 }, { "epoch": 0.9654178674351584, "grad_norm": 0.4447740781906591, "learning_rate": 8.575321018099122e-06, "loss": 0.0463, "step": 2010 }, { "epoch": 0.9658981748318924, "grad_norm": 0.5647765842461724, "learning_rate": 8.573366930214807e-06, "loss": 0.0382, "step": 2011 }, { "epoch": 0.9663784822286263, "grad_norm": 0.3880514422764617, "learning_rate": 8.571411726109518e-06, "loss": 0.0406, "step": 2012 }, { "epoch": 0.9668587896253602, "grad_norm": 0.49001392319362364, "learning_rate": 8.569455406394013e-06, "loss": 0.0379, "step": 2013 }, { "epoch": 0.9673390970220941, "grad_norm": 0.5021370712990999, "learning_rate": 8.567497971679387e-06, "loss": 0.0422, "step": 2014 }, { "epoch": 0.9678194044188281, "grad_norm": 0.46323226363095604, "learning_rate": 8.565539422577093e-06, "loss": 0.0404, "step": 2015 }, { "epoch": 0.968299711815562, "grad_norm": 0.41870207201506, "learning_rate": 8.563579759698925e-06, "loss": 0.0374, "step": 2016 }, { "epoch": 0.9687800192122958, "grad_norm": 0.45414111344706365, "learning_rate": 8.561618983657028e-06, "loss": 0.0418, "step": 2017 }, { "epoch": 0.9692603266090298, "grad_norm": 0.5305044501921113, "learning_rate": 8.559657095063893e-06, "loss": 0.0416, "step": 2018 }, { "epoch": 0.9697406340057637, "grad_norm": 0.5045574789523771, "learning_rate": 8.557694094532361e-06, "loss": 0.0324, "step": 2019 }, { "epoch": 0.9702209414024976, "grad_norm": 0.4492848331956523, "learning_rate": 8.555729982675619e-06, "loss": 0.0434, "step": 2020 }, { "epoch": 0.9707012487992315, "grad_norm": 0.4461437770479501, "learning_rate": 8.5537647601072e-06, "loss": 0.0363, "step": 2021 }, { "epoch": 0.9711815561959655, "grad_norm": 0.3915155679851884, "learning_rate": 8.551798427440985e-06, "loss": 0.0325, "step": 2022 }, { "epoch": 0.9716618635926993, "grad_norm": 0.4611826440449828, "learning_rate": 8.549830985291206e-06, "loss": 0.0485, "step": 2023 }, { "epoch": 0.9721421709894332, "grad_norm": 0.37428792596353483, "learning_rate": 8.547862434272431e-06, "loss": 0.0351, "step": 2024 }, { "epoch": 0.9726224783861671, "grad_norm": 0.323502628978833, "learning_rate": 8.545892774999589e-06, "loss": 0.0274, "step": 2025 }, { "epoch": 0.9731027857829011, "grad_norm": 0.6817640871809518, "learning_rate": 8.543922008087938e-06, "loss": 0.0429, "step": 2026 }, { "epoch": 0.973583093179635, "grad_norm": 0.503635291249707, "learning_rate": 8.541950134153099e-06, "loss": 0.0365, "step": 2027 }, { "epoch": 0.9740634005763689, "grad_norm": 0.5092392383162542, "learning_rate": 8.539977153811024e-06, "loss": 0.0412, "step": 2028 }, { "epoch": 0.9745437079731027, "grad_norm": 0.5178003304544473, "learning_rate": 8.538003067678022e-06, "loss": 0.0554, "step": 2029 }, { "epoch": 0.9750240153698367, "grad_norm": 0.35350741688529513, "learning_rate": 8.536027876370743e-06, "loss": 0.0346, "step": 2030 }, { "epoch": 0.9755043227665706, "grad_norm": 0.36618272757164555, "learning_rate": 8.53405158050618e-06, "loss": 0.0351, "step": 2031 }, { "epoch": 0.9759846301633045, "grad_norm": 0.4057826094482221, "learning_rate": 8.532074180701674e-06, "loss": 0.0394, "step": 2032 }, { "epoch": 0.9764649375600384, "grad_norm": 0.5037133903672764, "learning_rate": 8.53009567757491e-06, "loss": 0.0353, "step": 2033 }, { "epoch": 0.9769452449567724, "grad_norm": 0.45708751529472025, "learning_rate": 8.528116071743917e-06, "loss": 0.039, "step": 2034 }, { "epoch": 0.9774255523535063, "grad_norm": 0.4474137964593103, "learning_rate": 8.52613536382707e-06, "loss": 0.0482, "step": 2035 }, { "epoch": 0.9779058597502401, "grad_norm": 0.8437380174457212, "learning_rate": 8.524153554443088e-06, "loss": 0.0457, "step": 2036 }, { "epoch": 0.978386167146974, "grad_norm": 0.4317132378185624, "learning_rate": 8.522170644211032e-06, "loss": 0.0433, "step": 2037 }, { "epoch": 0.978866474543708, "grad_norm": 0.5306878906801935, "learning_rate": 8.520186633750309e-06, "loss": 0.0442, "step": 2038 }, { "epoch": 0.9793467819404419, "grad_norm": 0.4430347167684453, "learning_rate": 8.518201523680668e-06, "loss": 0.0472, "step": 2039 }, { "epoch": 0.9798270893371758, "grad_norm": 0.40367863591910896, "learning_rate": 8.516215314622203e-06, "loss": 0.0547, "step": 2040 }, { "epoch": 0.9803073967339097, "grad_norm": 1.0144384311645978, "learning_rate": 8.514228007195354e-06, "loss": 0.0632, "step": 2041 }, { "epoch": 0.9807877041306436, "grad_norm": 0.6255894136401294, "learning_rate": 8.512239602020894e-06, "loss": 0.0374, "step": 2042 }, { "epoch": 0.9812680115273775, "grad_norm": 0.29213416072835724, "learning_rate": 8.510250099719953e-06, "loss": 0.0302, "step": 2043 }, { "epoch": 0.9817483189241114, "grad_norm": 0.48617017713788496, "learning_rate": 8.50825950091399e-06, "loss": 0.049, "step": 2044 }, { "epoch": 0.9822286263208453, "grad_norm": 0.5170970045891808, "learning_rate": 8.506267806224817e-06, "loss": 0.0418, "step": 2045 }, { "epoch": 0.9827089337175793, "grad_norm": 0.5619222767720121, "learning_rate": 8.504275016274584e-06, "loss": 0.0477, "step": 2046 }, { "epoch": 0.9831892411143132, "grad_norm": 0.34210045056579325, "learning_rate": 8.502281131685783e-06, "loss": 0.0352, "step": 2047 }, { "epoch": 0.983669548511047, "grad_norm": 0.4991446492244405, "learning_rate": 8.500286153081248e-06, "loss": 0.0301, "step": 2048 }, { "epoch": 0.984149855907781, "grad_norm": 0.44922768509123856, "learning_rate": 8.498290081084156e-06, "loss": 0.0341, "step": 2049 }, { "epoch": 0.9846301633045149, "grad_norm": 0.307944880192032, "learning_rate": 8.496292916318023e-06, "loss": 0.025, "step": 2050 }, { "epoch": 0.9851104707012488, "grad_norm": 0.5664099644492291, "learning_rate": 8.494294659406709e-06, "loss": 0.0487, "step": 2051 }, { "epoch": 0.9855907780979827, "grad_norm": 0.7989706431633349, "learning_rate": 8.492295310974416e-06, "loss": 0.0515, "step": 2052 }, { "epoch": 0.9860710854947167, "grad_norm": 0.4172068934478134, "learning_rate": 8.490294871645681e-06, "loss": 0.037, "step": 2053 }, { "epoch": 0.9865513928914506, "grad_norm": 0.44711071451716344, "learning_rate": 8.488293342045391e-06, "loss": 0.0358, "step": 2054 }, { "epoch": 0.9870317002881844, "grad_norm": 0.394037791565088, "learning_rate": 8.486290722798765e-06, "loss": 0.0352, "step": 2055 }, { "epoch": 0.9875120076849183, "grad_norm": 0.44258059109076087, "learning_rate": 8.484287014531366e-06, "loss": 0.05, "step": 2056 }, { "epoch": 0.9879923150816523, "grad_norm": 0.4913903621371682, "learning_rate": 8.482282217869096e-06, "loss": 0.0418, "step": 2057 }, { "epoch": 0.9884726224783862, "grad_norm": 0.5964000018471872, "learning_rate": 8.480276333438203e-06, "loss": 0.0394, "step": 2058 }, { "epoch": 0.9889529298751201, "grad_norm": 0.42025505935398855, "learning_rate": 8.478269361865264e-06, "loss": 0.0315, "step": 2059 }, { "epoch": 0.989433237271854, "grad_norm": 0.3825968875041374, "learning_rate": 8.476261303777205e-06, "loss": 0.0267, "step": 2060 }, { "epoch": 0.9899135446685879, "grad_norm": 0.5266897373376677, "learning_rate": 8.474252159801287e-06, "loss": 0.0542, "step": 2061 }, { "epoch": 0.9903938520653218, "grad_norm": 0.9138199723571926, "learning_rate": 8.472241930565108e-06, "loss": 0.043, "step": 2062 }, { "epoch": 0.9908741594620557, "grad_norm": 0.4491153125193229, "learning_rate": 8.470230616696613e-06, "loss": 0.0416, "step": 2063 }, { "epoch": 0.9913544668587896, "grad_norm": 0.39648825911272, "learning_rate": 8.468218218824078e-06, "loss": 0.0308, "step": 2064 }, { "epoch": 0.9918347742555236, "grad_norm": 0.4266101634870432, "learning_rate": 8.46620473757612e-06, "loss": 0.0398, "step": 2065 }, { "epoch": 0.9923150816522575, "grad_norm": 0.46728827118397764, "learning_rate": 8.464190173581698e-06, "loss": 0.0332, "step": 2066 }, { "epoch": 0.9927953890489913, "grad_norm": 0.402026280201352, "learning_rate": 8.462174527470102e-06, "loss": 0.0306, "step": 2067 }, { "epoch": 0.9932756964457252, "grad_norm": 1.2753342168137747, "learning_rate": 8.460157799870967e-06, "loss": 0.0422, "step": 2068 }, { "epoch": 0.9937560038424592, "grad_norm": 0.48784901997970864, "learning_rate": 8.45813999141426e-06, "loss": 0.045, "step": 2069 }, { "epoch": 0.9942363112391931, "grad_norm": 0.4598639642272694, "learning_rate": 8.456121102730293e-06, "loss": 0.0339, "step": 2070 }, { "epoch": 0.994716618635927, "grad_norm": 0.43585915181157464, "learning_rate": 8.454101134449706e-06, "loss": 0.0495, "step": 2071 }, { "epoch": 0.9951969260326609, "grad_norm": 0.47450866187493235, "learning_rate": 8.452080087203484e-06, "loss": 0.0454, "step": 2072 }, { "epoch": 0.9956772334293948, "grad_norm": 0.40100519344897884, "learning_rate": 8.450057961622945e-06, "loss": 0.032, "step": 2073 }, { "epoch": 0.9961575408261287, "grad_norm": 0.3834770013645444, "learning_rate": 8.448034758339747e-06, "loss": 0.0366, "step": 2074 }, { "epoch": 0.9966378482228626, "grad_norm": 0.4018989482709451, "learning_rate": 8.446010477985882e-06, "loss": 0.0434, "step": 2075 }, { "epoch": 0.9971181556195965, "grad_norm": 0.44809689260265556, "learning_rate": 8.443985121193679e-06, "loss": 0.0464, "step": 2076 }, { "epoch": 0.9975984630163305, "grad_norm": 0.2905562947958189, "learning_rate": 8.441958688595802e-06, "loss": 0.0282, "step": 2077 }, { "epoch": 0.9980787704130644, "grad_norm": 0.7881415875093454, "learning_rate": 8.439931180825253e-06, "loss": 0.037, "step": 2078 }, { "epoch": 0.9985590778097982, "grad_norm": 0.38631353813613173, "learning_rate": 8.437902598515371e-06, "loss": 0.0406, "step": 2079 }, { "epoch": 0.9990393852065321, "grad_norm": 0.38950921399372224, "learning_rate": 8.435872942299827e-06, "loss": 0.0348, "step": 2080 }, { "epoch": 0.9995196926032661, "grad_norm": 0.5056529616402811, "learning_rate": 8.433842212812632e-06, "loss": 0.0509, "step": 2081 }, { "epoch": 1.0, "grad_norm": 0.5020057590352653, "learning_rate": 8.431810410688126e-06, "loss": 0.0373, "step": 2082 }, { "epoch": 1.0, "eval_loss": 0.03837773576378822, "eval_runtime": 509.5894, "eval_samples_per_second": 32.828, "eval_steps_per_second": 1.026, "step": 2082 } ], "logging_steps": 1, "max_steps": 6246, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.075959801947095e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }