{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 782,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025575447570332483,
"grad_norm": 15.392987058824561,
"learning_rate": 9.999959651660741e-06,
"loss": 1.0123,
"step": 1
},
{
"epoch": 0.005115089514066497,
"grad_norm": 9.468512560286664,
"learning_rate": 9.999838607294157e-06,
"loss": 0.7403,
"step": 2
},
{
"epoch": 0.0076726342710997444,
"grad_norm": 9.243581351204085,
"learning_rate": 9.999636868853824e-06,
"loss": 0.5837,
"step": 3
},
{
"epoch": 0.010230179028132993,
"grad_norm": 6.9496549198580935,
"learning_rate": 9.999354439595668e-06,
"loss": 0.4631,
"step": 4
},
{
"epoch": 0.01278772378516624,
"grad_norm": 5.707870970454734,
"learning_rate": 9.998991324077906e-06,
"loss": 0.3324,
"step": 5
},
{
"epoch": 0.015345268542199489,
"grad_norm": 4.709511789953965,
"learning_rate": 9.998547528160987e-06,
"loss": 0.3558,
"step": 6
},
{
"epoch": 0.017902813299232736,
"grad_norm": 3.5534524642812655,
"learning_rate": 9.998023059007477e-06,
"loss": 0.2853,
"step": 7
},
{
"epoch": 0.020460358056265986,
"grad_norm": 4.702487004348668,
"learning_rate": 9.997417925081963e-06,
"loss": 0.2791,
"step": 8
},
{
"epoch": 0.023017902813299233,
"grad_norm": 5.73219181182079,
"learning_rate": 9.996732136150902e-06,
"loss": 0.2954,
"step": 9
},
{
"epoch": 0.02557544757033248,
"grad_norm": 3.782084054111381,
"learning_rate": 9.995965703282472e-06,
"loss": 0.2448,
"step": 10
},
{
"epoch": 0.028132992327365727,
"grad_norm": 4.761454357292183,
"learning_rate": 9.995118638846394e-06,
"loss": 0.3343,
"step": 11
},
{
"epoch": 0.030690537084398978,
"grad_norm": 4.862414967949473,
"learning_rate": 9.99419095651372e-06,
"loss": 0.3326,
"step": 12
},
{
"epoch": 0.03324808184143223,
"grad_norm": 4.287681311749821,
"learning_rate": 9.993182671256633e-06,
"loss": 0.3317,
"step": 13
},
{
"epoch": 0.03580562659846547,
"grad_norm": 4.548249117039148,
"learning_rate": 9.992093799348182e-06,
"loss": 0.2939,
"step": 14
},
{
"epoch": 0.03836317135549872,
"grad_norm": 4.306374147946709,
"learning_rate": 9.990924358362037e-06,
"loss": 0.2409,
"step": 15
},
{
"epoch": 0.04092071611253197,
"grad_norm": 3.4314319732642176,
"learning_rate": 9.9896743671722e-06,
"loss": 0.3049,
"step": 16
},
{
"epoch": 0.043478260869565216,
"grad_norm": 4.114297236669634,
"learning_rate": 9.988343845952697e-06,
"loss": 0.314,
"step": 17
},
{
"epoch": 0.04603580562659847,
"grad_norm": 3.6558902621855167,
"learning_rate": 9.986932816177258e-06,
"loss": 0.2786,
"step": 18
},
{
"epoch": 0.04859335038363171,
"grad_norm": 3.451023121802292,
"learning_rate": 9.985441300618966e-06,
"loss": 0.2949,
"step": 19
},
{
"epoch": 0.05115089514066496,
"grad_norm": 3.8360280897630745,
"learning_rate": 9.98386932334989e-06,
"loss": 0.2868,
"step": 20
},
{
"epoch": 0.05370843989769821,
"grad_norm": 3.104220866344891,
"learning_rate": 9.982216909740703e-06,
"loss": 0.2371,
"step": 21
},
{
"epoch": 0.056265984654731455,
"grad_norm": 3.2036209599473193,
"learning_rate": 9.980484086460258e-06,
"loss": 0.2853,
"step": 22
},
{
"epoch": 0.058823529411764705,
"grad_norm": 3.997000948759866,
"learning_rate": 9.978670881475173e-06,
"loss": 0.2551,
"step": 23
},
{
"epoch": 0.061381074168797956,
"grad_norm": 4.22698810804951,
"learning_rate": 9.976777324049374e-06,
"loss": 0.2897,
"step": 24
},
{
"epoch": 0.0639386189258312,
"grad_norm": 3.666060568124115,
"learning_rate": 9.974803444743617e-06,
"loss": 0.2985,
"step": 25
},
{
"epoch": 0.06649616368286446,
"grad_norm": 3.7518838770389342,
"learning_rate": 9.972749275415005e-06,
"loss": 0.2834,
"step": 26
},
{
"epoch": 0.06905370843989769,
"grad_norm": 3.870535915573906,
"learning_rate": 9.970614849216465e-06,
"loss": 0.2385,
"step": 27
},
{
"epoch": 0.07161125319693094,
"grad_norm": 3.6607655718950056,
"learning_rate": 9.96840020059622e-06,
"loss": 0.3131,
"step": 28
},
{
"epoch": 0.0741687979539642,
"grad_norm": 3.475270449404676,
"learning_rate": 9.966105365297226e-06,
"loss": 0.2588,
"step": 29
},
{
"epoch": 0.07672634271099744,
"grad_norm": 4.200264770629892,
"learning_rate": 9.963730380356599e-06,
"loss": 0.2746,
"step": 30
},
{
"epoch": 0.0792838874680307,
"grad_norm": 4.372784076877616,
"learning_rate": 9.96127528410502e-06,
"loss": 0.3288,
"step": 31
},
{
"epoch": 0.08184143222506395,
"grad_norm": 3.6840961909503727,
"learning_rate": 9.958740116166113e-06,
"loss": 0.2797,
"step": 32
},
{
"epoch": 0.08439897698209718,
"grad_norm": 3.660857860053387,
"learning_rate": 9.9561249174558e-06,
"loss": 0.2552,
"step": 33
},
{
"epoch": 0.08695652173913043,
"grad_norm": 3.843432399530098,
"learning_rate": 9.953429730181653e-06,
"loss": 0.3066,
"step": 34
},
{
"epoch": 0.08951406649616368,
"grad_norm": 3.863125954202528,
"learning_rate": 9.950654597842209e-06,
"loss": 0.281,
"step": 35
},
{
"epoch": 0.09207161125319693,
"grad_norm": 3.233755549464827,
"learning_rate": 9.947799565226253e-06,
"loss": 0.2684,
"step": 36
},
{
"epoch": 0.09462915601023018,
"grad_norm": 4.19966407483748,
"learning_rate": 9.944864678412118e-06,
"loss": 0.2856,
"step": 37
},
{
"epoch": 0.09718670076726342,
"grad_norm": 3.165753312848627,
"learning_rate": 9.94184998476693e-06,
"loss": 0.24,
"step": 38
},
{
"epoch": 0.09974424552429667,
"grad_norm": 3.4710302005614277,
"learning_rate": 9.938755532945838e-06,
"loss": 0.293,
"step": 39
},
{
"epoch": 0.10230179028132992,
"grad_norm": 3.2439371920078663,
"learning_rate": 9.93558137289124e-06,
"loss": 0.2786,
"step": 40
},
{
"epoch": 0.10485933503836317,
"grad_norm": 3.133329636506038,
"learning_rate": 9.932327555831972e-06,
"loss": 0.1905,
"step": 41
},
{
"epoch": 0.10741687979539642,
"grad_norm": 3.041802191982791,
"learning_rate": 9.928994134282477e-06,
"loss": 0.2307,
"step": 42
},
{
"epoch": 0.10997442455242967,
"grad_norm": 3.44216600685852,
"learning_rate": 9.925581162041967e-06,
"loss": 0.2273,
"step": 43
},
{
"epoch": 0.11253196930946291,
"grad_norm": 3.2284774978139663,
"learning_rate": 9.922088694193546e-06,
"loss": 0.2279,
"step": 44
},
{
"epoch": 0.11508951406649616,
"grad_norm": 2.7419938932220442,
"learning_rate": 9.918516787103322e-06,
"loss": 0.2278,
"step": 45
},
{
"epoch": 0.11764705882352941,
"grad_norm": 3.7438191859843313,
"learning_rate": 9.91486549841951e-06,
"loss": 0.2512,
"step": 46
},
{
"epoch": 0.12020460358056266,
"grad_norm": 3.435535432268392,
"learning_rate": 9.911134887071477e-06,
"loss": 0.2619,
"step": 47
},
{
"epoch": 0.12276214833759591,
"grad_norm": 3.4627560471149756,
"learning_rate": 9.907325013268816e-06,
"loss": 0.2711,
"step": 48
},
{
"epoch": 0.12531969309462915,
"grad_norm": 2.7213735709949196,
"learning_rate": 9.903435938500356e-06,
"loss": 0.1918,
"step": 49
},
{
"epoch": 0.1278772378516624,
"grad_norm": 3.0775197585007343,
"learning_rate": 9.899467725533181e-06,
"loss": 0.2259,
"step": 50
},
{
"epoch": 0.13043478260869565,
"grad_norm": 3.6206763525611274,
"learning_rate": 9.895420438411616e-06,
"loss": 0.2572,
"step": 51
},
{
"epoch": 0.1329923273657289,
"grad_norm": 3.898289061481641,
"learning_rate": 9.89129414245618e-06,
"loss": 0.2254,
"step": 52
},
{
"epoch": 0.13554987212276215,
"grad_norm": 3.6029485399959778,
"learning_rate": 9.887088904262557e-06,
"loss": 0.2347,
"step": 53
},
{
"epoch": 0.13810741687979539,
"grad_norm": 3.4988251793641676,
"learning_rate": 9.882804791700488e-06,
"loss": 0.2594,
"step": 54
},
{
"epoch": 0.14066496163682865,
"grad_norm": 3.305043823936606,
"learning_rate": 9.878441873912712e-06,
"loss": 0.2443,
"step": 55
},
{
"epoch": 0.1432225063938619,
"grad_norm": 3.8391979100290206,
"learning_rate": 9.87400022131382e-06,
"loss": 0.2909,
"step": 56
},
{
"epoch": 0.14578005115089515,
"grad_norm": 34.49457439956216,
"learning_rate": 9.869479905589136e-06,
"loss": 0.1952,
"step": 57
},
{
"epoch": 0.1483375959079284,
"grad_norm": 5.486937250031213,
"learning_rate": 9.864880999693551e-06,
"loss": 0.1897,
"step": 58
},
{
"epoch": 0.15089514066496162,
"grad_norm": 3.457559155846861,
"learning_rate": 9.860203577850353e-06,
"loss": 0.2333,
"step": 59
},
{
"epoch": 0.1534526854219949,
"grad_norm": 3.551347913898386,
"learning_rate": 9.855447715550024e-06,
"loss": 0.232,
"step": 60
},
{
"epoch": 0.15601023017902813,
"grad_norm": 3.029321243204479,
"learning_rate": 9.850613489549018e-06,
"loss": 0.1929,
"step": 61
},
{
"epoch": 0.1585677749360614,
"grad_norm": 3.994451029674619,
"learning_rate": 9.845700977868536e-06,
"loss": 0.2771,
"step": 62
},
{
"epoch": 0.16112531969309463,
"grad_norm": 2.9207143827429594,
"learning_rate": 9.840710259793251e-06,
"loss": 0.1973,
"step": 63
},
{
"epoch": 0.1636828644501279,
"grad_norm": 3.288151604488407,
"learning_rate": 9.835641415870038e-06,
"loss": 0.2833,
"step": 64
},
{
"epoch": 0.16624040920716113,
"grad_norm": 3.1704268328889285,
"learning_rate": 9.830494527906671e-06,
"loss": 0.1916,
"step": 65
},
{
"epoch": 0.16879795396419436,
"grad_norm": 3.0478461733192677,
"learning_rate": 9.825269678970502e-06,
"loss": 0.227,
"step": 66
},
{
"epoch": 0.17135549872122763,
"grad_norm": 3.247187893935764,
"learning_rate": 9.819966953387122e-06,
"loss": 0.2379,
"step": 67
},
{
"epoch": 0.17391304347826086,
"grad_norm": 3.7351933963260597,
"learning_rate": 9.814586436738998e-06,
"loss": 0.2284,
"step": 68
},
{
"epoch": 0.17647058823529413,
"grad_norm": 3.562240712811416,
"learning_rate": 9.809128215864096e-06,
"loss": 0.2875,
"step": 69
},
{
"epoch": 0.17902813299232737,
"grad_norm": 2.6707181487795264,
"learning_rate": 9.803592378854476e-06,
"loss": 0.2144,
"step": 70
},
{
"epoch": 0.1815856777493606,
"grad_norm": 2.932314569007143,
"learning_rate": 9.797979015054868e-06,
"loss": 0.2432,
"step": 71
},
{
"epoch": 0.18414322250639387,
"grad_norm": 3.2103231515107105,
"learning_rate": 9.792288215061237e-06,
"loss": 0.2341,
"step": 72
},
{
"epoch": 0.1867007672634271,
"grad_norm": 3.640482608271676,
"learning_rate": 9.786520070719313e-06,
"loss": 0.1941,
"step": 73
},
{
"epoch": 0.18925831202046037,
"grad_norm": 2.9037943123818075,
"learning_rate": 9.780674675123113e-06,
"loss": 0.204,
"step": 74
},
{
"epoch": 0.1918158567774936,
"grad_norm": 3.158767038474441,
"learning_rate": 9.77475212261344e-06,
"loss": 0.2453,
"step": 75
},
{
"epoch": 0.19437340153452684,
"grad_norm": 3.1234088193628917,
"learning_rate": 9.768752508776358e-06,
"loss": 0.1783,
"step": 76
},
{
"epoch": 0.1969309462915601,
"grad_norm": 2.8268077914257312,
"learning_rate": 9.762675930441647e-06,
"loss": 0.1986,
"step": 77
},
{
"epoch": 0.19948849104859334,
"grad_norm": 2.9513681306595343,
"learning_rate": 9.756522485681247e-06,
"loss": 0.2365,
"step": 78
},
{
"epoch": 0.2020460358056266,
"grad_norm": 4.828985495990047,
"learning_rate": 9.750292273807666e-06,
"loss": 0.2922,
"step": 79
},
{
"epoch": 0.20460358056265984,
"grad_norm": 4.12969908799372,
"learning_rate": 9.743985395372387e-06,
"loss": 0.2747,
"step": 80
},
{
"epoch": 0.2071611253196931,
"grad_norm": 3.8828623504287374,
"learning_rate": 9.737601952164238e-06,
"loss": 0.2756,
"step": 81
},
{
"epoch": 0.20971867007672634,
"grad_norm": 3.1532945515057427,
"learning_rate": 9.73114204720775e-06,
"loss": 0.2399,
"step": 82
},
{
"epoch": 0.21227621483375958,
"grad_norm": 2.768144038570148,
"learning_rate": 9.724605784761501e-06,
"loss": 0.2225,
"step": 83
},
{
"epoch": 0.21483375959079284,
"grad_norm": 3.0936033820891544,
"learning_rate": 9.717993270316421e-06,
"loss": 0.1855,
"step": 84
},
{
"epoch": 0.21739130434782608,
"grad_norm": 2.874646258205036,
"learning_rate": 9.711304610594104e-06,
"loss": 0.2511,
"step": 85
},
{
"epoch": 0.21994884910485935,
"grad_norm": 2.631348523046017,
"learning_rate": 9.704539913545073e-06,
"loss": 0.2081,
"step": 86
},
{
"epoch": 0.22250639386189258,
"grad_norm": 2.8254213731499336,
"learning_rate": 9.697699288347043e-06,
"loss": 0.1861,
"step": 87
},
{
"epoch": 0.22506393861892582,
"grad_norm": 2.4651414344097526,
"learning_rate": 9.690782845403164e-06,
"loss": 0.182,
"step": 88
},
{
"epoch": 0.22762148337595908,
"grad_norm": 2.837322369457593,
"learning_rate": 9.683790696340229e-06,
"loss": 0.211,
"step": 89
},
{
"epoch": 0.23017902813299232,
"grad_norm": 3.370816547248047,
"learning_rate": 9.676722954006878e-06,
"loss": 0.2782,
"step": 90
},
{
"epoch": 0.23273657289002558,
"grad_norm": 3.7835641834887843,
"learning_rate": 9.669579732471779e-06,
"loss": 0.3041,
"step": 91
},
{
"epoch": 0.23529411764705882,
"grad_norm": 3.296489483167239,
"learning_rate": 9.66236114702178e-06,
"loss": 0.23,
"step": 92
},
{
"epoch": 0.23785166240409208,
"grad_norm": 2.8890092480403564,
"learning_rate": 9.655067314160058e-06,
"loss": 0.2149,
"step": 93
},
{
"epoch": 0.24040920716112532,
"grad_norm": 3.429189900304709,
"learning_rate": 9.647698351604227e-06,
"loss": 0.301,
"step": 94
},
{
"epoch": 0.24296675191815856,
"grad_norm": 3.3850684920397245,
"learning_rate": 9.640254378284447e-06,
"loss": 0.2723,
"step": 95
},
{
"epoch": 0.24552429667519182,
"grad_norm": 3.0904355382979016,
"learning_rate": 9.632735514341508e-06,
"loss": 0.2598,
"step": 96
},
{
"epoch": 0.24808184143222506,
"grad_norm": 3.7186808278209,
"learning_rate": 9.625141881124874e-06,
"loss": 0.2948,
"step": 97
},
{
"epoch": 0.2506393861892583,
"grad_norm": 3.150975002506145,
"learning_rate": 9.617473601190743e-06,
"loss": 0.2143,
"step": 98
},
{
"epoch": 0.2531969309462916,
"grad_norm": 23.69858423180905,
"learning_rate": 9.609730798300056e-06,
"loss": 0.1867,
"step": 99
},
{
"epoch": 0.2557544757033248,
"grad_norm": 2.9299967673432215,
"learning_rate": 9.601913597416513e-06,
"loss": 0.2168,
"step": 100
},
{
"epoch": 0.25831202046035806,
"grad_norm": 4.058096387207032,
"learning_rate": 9.594022124704541e-06,
"loss": 0.3437,
"step": 101
},
{
"epoch": 0.2608695652173913,
"grad_norm": 2.4370542742741614,
"learning_rate": 9.586056507527266e-06,
"loss": 0.2073,
"step": 102
},
{
"epoch": 0.26342710997442453,
"grad_norm": 3.1921593576184355,
"learning_rate": 9.578016874444459e-06,
"loss": 0.216,
"step": 103
},
{
"epoch": 0.2659846547314578,
"grad_norm": 2.1088920969603056,
"learning_rate": 9.569903355210457e-06,
"loss": 0.1807,
"step": 104
},
{
"epoch": 0.26854219948849106,
"grad_norm": 3.2993263422325976,
"learning_rate": 9.561716080772072e-06,
"loss": 0.2389,
"step": 105
},
{
"epoch": 0.2710997442455243,
"grad_norm": 3.7311120165895475,
"learning_rate": 9.55345518326647e-06,
"loss": 0.2998,
"step": 106
},
{
"epoch": 0.27365728900255754,
"grad_norm": 4.097055377202607,
"learning_rate": 9.545120796019056e-06,
"loss": 0.2108,
"step": 107
},
{
"epoch": 0.27621483375959077,
"grad_norm": 3.2362396004576657,
"learning_rate": 9.5367130535413e-06,
"loss": 0.2854,
"step": 108
},
{
"epoch": 0.27877237851662406,
"grad_norm": 3.371909008316518,
"learning_rate": 9.528232091528578e-06,
"loss": 0.2293,
"step": 109
},
{
"epoch": 0.2813299232736573,
"grad_norm": 2.5846598885848495,
"learning_rate": 9.519678046857987e-06,
"loss": 0.232,
"step": 110
},
{
"epoch": 0.28388746803069054,
"grad_norm": 2.8625625809663573,
"learning_rate": 9.511051057586125e-06,
"loss": 0.2534,
"step": 111
},
{
"epoch": 0.2864450127877238,
"grad_norm": 2.4251750007131037,
"learning_rate": 9.502351262946865e-06,
"loss": 0.224,
"step": 112
},
{
"epoch": 0.289002557544757,
"grad_norm": 2.4385429235387477,
"learning_rate": 9.493578803349117e-06,
"loss": 0.1934,
"step": 113
},
{
"epoch": 0.2915601023017903,
"grad_norm": 3.112915617413732,
"learning_rate": 9.48473382037455e-06,
"loss": 0.2592,
"step": 114
},
{
"epoch": 0.29411764705882354,
"grad_norm": 3.1095060452385375,
"learning_rate": 9.475816456775313e-06,
"loss": 0.2748,
"step": 115
},
{
"epoch": 0.2966751918158568,
"grad_norm": 3.193274362044026,
"learning_rate": 9.466826856471728e-06,
"loss": 0.2443,
"step": 116
},
{
"epoch": 0.29923273657289,
"grad_norm": 2.805529582638814,
"learning_rate": 9.457765164549979e-06,
"loss": 0.2351,
"step": 117
},
{
"epoch": 0.30179028132992325,
"grad_norm": 4.497751524982449,
"learning_rate": 9.448631527259749e-06,
"loss": 0.3551,
"step": 118
},
{
"epoch": 0.30434782608695654,
"grad_norm": 2.9074901684200163,
"learning_rate": 9.439426092011877e-06,
"loss": 0.2004,
"step": 119
},
{
"epoch": 0.3069053708439898,
"grad_norm": 2.974411031118463,
"learning_rate": 9.430149007375974e-06,
"loss": 0.2605,
"step": 120
},
{
"epoch": 0.309462915601023,
"grad_norm": 2.8491332590658702,
"learning_rate": 9.42080042307802e-06,
"loss": 0.2337,
"step": 121
},
{
"epoch": 0.31202046035805625,
"grad_norm": 2.3669589821998915,
"learning_rate": 9.411380489997962e-06,
"loss": 0.1974,
"step": 122
},
{
"epoch": 0.3145780051150895,
"grad_norm": 3.347163072711235,
"learning_rate": 9.401889360167256e-06,
"loss": 0.2662,
"step": 123
},
{
"epoch": 0.3171355498721228,
"grad_norm": 2.827025371885836,
"learning_rate": 9.392327186766434e-06,
"loss": 0.2275,
"step": 124
},
{
"epoch": 0.319693094629156,
"grad_norm": 2.4402905299053983,
"learning_rate": 9.382694124122624e-06,
"loss": 0.2067,
"step": 125
},
{
"epoch": 0.32225063938618925,
"grad_norm": 2.595701436925841,
"learning_rate": 9.372990327707057e-06,
"loss": 0.1936,
"step": 126
},
{
"epoch": 0.3248081841432225,
"grad_norm": 2.972919332628706,
"learning_rate": 9.36321595413256e-06,
"loss": 0.2284,
"step": 127
},
{
"epoch": 0.3273657289002558,
"grad_norm": 2.96761954478865,
"learning_rate": 9.353371161151032e-06,
"loss": 0.3038,
"step": 128
},
{
"epoch": 0.329923273657289,
"grad_norm": 2.6927171651588337,
"learning_rate": 9.34345610765089e-06,
"loss": 0.1787,
"step": 129
},
{
"epoch": 0.33248081841432225,
"grad_norm": 3.5431583100736863,
"learning_rate": 9.333470953654513e-06,
"loss": 0.3147,
"step": 130
},
{
"epoch": 0.3350383631713555,
"grad_norm": 2.632290215476919,
"learning_rate": 9.32341586031565e-06,
"loss": 0.2647,
"step": 131
},
{
"epoch": 0.3375959079283887,
"grad_norm": 3.2469034068085736,
"learning_rate": 9.31329098991683e-06,
"loss": 0.2767,
"step": 132
},
{
"epoch": 0.340153452685422,
"grad_norm": 3.3387196083764823,
"learning_rate": 9.303096505866734e-06,
"loss": 0.2513,
"step": 133
},
{
"epoch": 0.34271099744245526,
"grad_norm": 2.6511165133843186,
"learning_rate": 9.292832572697566e-06,
"loss": 0.2379,
"step": 134
},
{
"epoch": 0.3452685421994885,
"grad_norm": 2.270613271667338,
"learning_rate": 9.282499356062385e-06,
"loss": 0.2181,
"step": 135
},
{
"epoch": 0.34782608695652173,
"grad_norm": 2.7845455382154585,
"learning_rate": 9.272097022732444e-06,
"loss": 0.1959,
"step": 136
},
{
"epoch": 0.35038363171355497,
"grad_norm": 3.359968320145697,
"learning_rate": 9.261625740594494e-06,
"loss": 0.2432,
"step": 137
},
{
"epoch": 0.35294117647058826,
"grad_norm": 3.225209792269358,
"learning_rate": 9.251085678648072e-06,
"loss": 0.2534,
"step": 138
},
{
"epoch": 0.3554987212276215,
"grad_norm": 2.833166106658051,
"learning_rate": 9.240477007002777e-06,
"loss": 0.2156,
"step": 139
},
{
"epoch": 0.35805626598465473,
"grad_norm": 3.4490482806712626,
"learning_rate": 9.22979989687552e-06,
"loss": 0.2963,
"step": 140
},
{
"epoch": 0.36061381074168797,
"grad_norm": 3.1464710421043365,
"learning_rate": 9.219054520587766e-06,
"loss": 0.1821,
"step": 141
},
{
"epoch": 0.3631713554987212,
"grad_norm": 2.5959703410215114,
"learning_rate": 9.208241051562753e-06,
"loss": 0.2277,
"step": 142
},
{
"epoch": 0.3657289002557545,
"grad_norm": 3.044870699089069,
"learning_rate": 9.197359664322684e-06,
"loss": 0.2234,
"step": 143
},
{
"epoch": 0.36828644501278773,
"grad_norm": 3.4399399781668403,
"learning_rate": 9.186410534485924e-06,
"loss": 0.2574,
"step": 144
},
{
"epoch": 0.37084398976982097,
"grad_norm": 2.673893462327474,
"learning_rate": 9.175393838764153e-06,
"loss": 0.2054,
"step": 145
},
{
"epoch": 0.3734015345268542,
"grad_norm": 2.444687752494406,
"learning_rate": 9.164309754959523e-06,
"loss": 0.207,
"step": 146
},
{
"epoch": 0.37595907928388744,
"grad_norm": 2.6282321631694248,
"learning_rate": 9.153158461961782e-06,
"loss": 0.1948,
"step": 147
},
{
"epoch": 0.37851662404092073,
"grad_norm": 2.4890084427480588,
"learning_rate": 9.14194013974539e-06,
"loss": 0.186,
"step": 148
},
{
"epoch": 0.38107416879795397,
"grad_norm": 2.5896447502938633,
"learning_rate": 9.130654969366619e-06,
"loss": 0.2275,
"step": 149
},
{
"epoch": 0.3836317135549872,
"grad_norm": 2.564119331980231,
"learning_rate": 9.11930313296062e-06,
"loss": 0.213,
"step": 150
},
{
"epoch": 0.38618925831202044,
"grad_norm": 3.224590304098589,
"learning_rate": 9.107884813738492e-06,
"loss": 0.283,
"step": 151
},
{
"epoch": 0.3887468030690537,
"grad_norm": 2.715718216923276,
"learning_rate": 9.096400195984322e-06,
"loss": 0.1769,
"step": 152
},
{
"epoch": 0.391304347826087,
"grad_norm": 2.9116689180007698,
"learning_rate": 9.08484946505221e-06,
"loss": 0.2214,
"step": 153
},
{
"epoch": 0.3938618925831202,
"grad_norm": 2.855052145835007,
"learning_rate": 9.073232807363283e-06,
"loss": 0.2181,
"step": 154
},
{
"epoch": 0.39641943734015345,
"grad_norm": 3.579582085441427,
"learning_rate": 9.061550410402677e-06,
"loss": 0.3031,
"step": 155
},
{
"epoch": 0.3989769820971867,
"grad_norm": 3.3622889576332122,
"learning_rate": 9.049802462716521e-06,
"loss": 0.2345,
"step": 156
},
{
"epoch": 0.40153452685422,
"grad_norm": 3.5682342482734684,
"learning_rate": 9.037989153908882e-06,
"loss": 0.2558,
"step": 157
},
{
"epoch": 0.4040920716112532,
"grad_norm": 2.541343662604646,
"learning_rate": 9.026110674638722e-06,
"loss": 0.2171,
"step": 158
},
{
"epoch": 0.40664961636828645,
"grad_norm": 2.7014269251115097,
"learning_rate": 9.0141672166168e-06,
"loss": 0.1616,
"step": 159
},
{
"epoch": 0.4092071611253197,
"grad_norm": 3.069690328482181,
"learning_rate": 9.002158972602599e-06,
"loss": 0.3043,
"step": 160
},
{
"epoch": 0.4117647058823529,
"grad_norm": 2.922623620430193,
"learning_rate": 8.990086136401199e-06,
"loss": 0.2376,
"step": 161
},
{
"epoch": 0.4143222506393862,
"grad_norm": 3.792442645821336,
"learning_rate": 8.977948902860154e-06,
"loss": 0.2899,
"step": 162
},
{
"epoch": 0.41687979539641945,
"grad_norm": 2.831662241450309,
"learning_rate": 8.965747467866355e-06,
"loss": 0.1795,
"step": 163
},
{
"epoch": 0.4194373401534527,
"grad_norm": 2.800496787119197,
"learning_rate": 8.953482028342853e-06,
"loss": 0.2936,
"step": 164
},
{
"epoch": 0.4219948849104859,
"grad_norm": 2.5546124203822327,
"learning_rate": 8.9411527822457e-06,
"loss": 0.2064,
"step": 165
},
{
"epoch": 0.42455242966751916,
"grad_norm": 3.0514576119776153,
"learning_rate": 8.92875992856073e-06,
"loss": 0.2617,
"step": 166
},
{
"epoch": 0.42710997442455245,
"grad_norm": 2.283609395848259,
"learning_rate": 8.916303667300373e-06,
"loss": 0.181,
"step": 167
},
{
"epoch": 0.4296675191815857,
"grad_norm": 3.1823314702001184,
"learning_rate": 8.903784199500412e-06,
"loss": 0.2184,
"step": 168
},
{
"epoch": 0.4322250639386189,
"grad_norm": 3.087529114047624,
"learning_rate": 8.89120172721674e-06,
"loss": 0.218,
"step": 169
},
{
"epoch": 0.43478260869565216,
"grad_norm": 3.2616256957739957,
"learning_rate": 8.8785564535221e-06,
"loss": 0.2138,
"step": 170
},
{
"epoch": 0.4373401534526854,
"grad_norm": 3.2597963602492928,
"learning_rate": 8.86584858250281e-06,
"loss": 0.2471,
"step": 171
},
{
"epoch": 0.4398976982097187,
"grad_norm": 2.892437025795672,
"learning_rate": 8.853078319255466e-06,
"loss": 0.2423,
"step": 172
},
{
"epoch": 0.4424552429667519,
"grad_norm": 2.7606976408563257,
"learning_rate": 8.840245869883635e-06,
"loss": 0.2559,
"step": 173
},
{
"epoch": 0.44501278772378516,
"grad_norm": 2.172044285789016,
"learning_rate": 8.827351441494525e-06,
"loss": 0.204,
"step": 174
},
{
"epoch": 0.4475703324808184,
"grad_norm": 3.0017261477691104,
"learning_rate": 8.814395242195642e-06,
"loss": 0.2776,
"step": 175
},
{
"epoch": 0.45012787723785164,
"grad_norm": 2.648838834500726,
"learning_rate": 8.80137748109144e-06,
"loss": 0.2085,
"step": 176
},
{
"epoch": 0.45268542199488493,
"grad_norm": 2.035533604532462,
"learning_rate": 8.78829836827993e-06,
"loss": 0.1803,
"step": 177
},
{
"epoch": 0.45524296675191817,
"grad_norm": 2.2008376049863148,
"learning_rate": 8.77515811484931e-06,
"loss": 0.2172,
"step": 178
},
{
"epoch": 0.4578005115089514,
"grad_norm": 2.151726038676321,
"learning_rate": 8.761956932874539e-06,
"loss": 0.1737,
"step": 179
},
{
"epoch": 0.46035805626598464,
"grad_norm": 2.8305977619166263,
"learning_rate": 8.748695035413925e-06,
"loss": 0.2258,
"step": 180
},
{
"epoch": 0.4629156010230179,
"grad_norm": 2.2823582846606705,
"learning_rate": 8.735372636505681e-06,
"loss": 0.2186,
"step": 181
},
{
"epoch": 0.46547314578005117,
"grad_norm": 2.423531563066667,
"learning_rate": 8.72198995116448e-06,
"loss": 0.2407,
"step": 182
},
{
"epoch": 0.4680306905370844,
"grad_norm": 2.534940058147962,
"learning_rate": 8.708547195377968e-06,
"loss": 0.2939,
"step": 183
},
{
"epoch": 0.47058823529411764,
"grad_norm": 2.8644782418220296,
"learning_rate": 8.695044586103297e-06,
"loss": 0.2317,
"step": 184
},
{
"epoch": 0.4731457800511509,
"grad_norm": 3.673887217855088,
"learning_rate": 8.68148234126361e-06,
"loss": 0.2833,
"step": 185
},
{
"epoch": 0.47570332480818417,
"grad_norm": 2.749104058931232,
"learning_rate": 8.667860679744529e-06,
"loss": 0.205,
"step": 186
},
{
"epoch": 0.4782608695652174,
"grad_norm": 2.881858956427076,
"learning_rate": 8.65417982139062e-06,
"loss": 0.2373,
"step": 187
},
{
"epoch": 0.48081841432225064,
"grad_norm": 2.441774845496025,
"learning_rate": 8.640439987001855e-06,
"loss": 0.1988,
"step": 188
},
{
"epoch": 0.4833759590792839,
"grad_norm": 2.680220627075973,
"learning_rate": 8.626641398330027e-06,
"loss": 0.2128,
"step": 189
},
{
"epoch": 0.4859335038363171,
"grad_norm": 2.1697481026879144,
"learning_rate": 8.612784278075195e-06,
"loss": 0.2085,
"step": 190
},
{
"epoch": 0.4884910485933504,
"grad_norm": 2.682355105066521,
"learning_rate": 8.598868849882074e-06,
"loss": 0.2354,
"step": 191
},
{
"epoch": 0.49104859335038364,
"grad_norm": 2.470410385763528,
"learning_rate": 8.58489533833643e-06,
"loss": 0.1969,
"step": 192
},
{
"epoch": 0.4936061381074169,
"grad_norm": 2.311090258410845,
"learning_rate": 8.570863968961456e-06,
"loss": 0.1628,
"step": 193
},
{
"epoch": 0.4961636828644501,
"grad_norm": 2.069853143088041,
"learning_rate": 8.556774968214134e-06,
"loss": 0.2108,
"step": 194
},
{
"epoch": 0.49872122762148335,
"grad_norm": 2.245346110916918,
"learning_rate": 8.542628563481577e-06,
"loss": 0.2197,
"step": 195
},
{
"epoch": 0.5012787723785166,
"grad_norm": 2.5537303598871426,
"learning_rate": 8.52842498307736e-06,
"loss": 0.2529,
"step": 196
},
{
"epoch": 0.5038363171355499,
"grad_norm": 2.7454118710249924,
"learning_rate": 8.514164456237835e-06,
"loss": 0.2372,
"step": 197
},
{
"epoch": 0.5063938618925832,
"grad_norm": 2.6858558224066176,
"learning_rate": 8.499847213118431e-06,
"loss": 0.256,
"step": 198
},
{
"epoch": 0.5089514066496164,
"grad_norm": 2.790784827198894,
"learning_rate": 8.485473484789944e-06,
"loss": 0.263,
"step": 199
},
{
"epoch": 0.5115089514066496,
"grad_norm": 2.243606795545935,
"learning_rate": 8.471043503234796e-06,
"loss": 0.2317,
"step": 200
},
{
"epoch": 0.5140664961636828,
"grad_norm": 3.2990716039658685,
"learning_rate": 8.45655750134331e-06,
"loss": 0.2392,
"step": 201
},
{
"epoch": 0.5166240409207161,
"grad_norm": 2.193537985146762,
"learning_rate": 8.442015712909926e-06,
"loss": 0.1975,
"step": 202
},
{
"epoch": 0.5191815856777494,
"grad_norm": 2.3337401949778496,
"learning_rate": 8.427418372629456e-06,
"loss": 0.199,
"step": 203
},
{
"epoch": 0.5217391304347826,
"grad_norm": 2.472600441004196,
"learning_rate": 8.412765716093273e-06,
"loss": 0.2034,
"step": 204
},
{
"epoch": 0.5242966751918159,
"grad_norm": 1.9955637536104787,
"learning_rate": 8.398057979785515e-06,
"loss": 0.1746,
"step": 205
},
{
"epoch": 0.5268542199488491,
"grad_norm": 2.4509873939651987,
"learning_rate": 8.383295401079284e-06,
"loss": 0.2152,
"step": 206
},
{
"epoch": 0.5294117647058824,
"grad_norm": 2.8716129683382348,
"learning_rate": 8.368478218232787e-06,
"loss": 0.2504,
"step": 207
},
{
"epoch": 0.5319693094629157,
"grad_norm": 2.243120617246336,
"learning_rate": 8.353606670385514e-06,
"loss": 0.1801,
"step": 208
},
{
"epoch": 0.5345268542199488,
"grad_norm": 2.215427775181383,
"learning_rate": 8.338680997554372e-06,
"loss": 0.2024,
"step": 209
},
{
"epoch": 0.5370843989769821,
"grad_norm": 2.2038368206477386,
"learning_rate": 8.3237014406298e-06,
"loss": 0.2008,
"step": 210
},
{
"epoch": 0.5396419437340153,
"grad_norm": 2.2158803371127354,
"learning_rate": 8.308668241371897e-06,
"loss": 0.1498,
"step": 211
},
{
"epoch": 0.5421994884910486,
"grad_norm": 2.1939303716398544,
"learning_rate": 8.293581642406517e-06,
"loss": 0.1791,
"step": 212
},
{
"epoch": 0.5447570332480819,
"grad_norm": 3.5427690009814103,
"learning_rate": 8.278441887221338e-06,
"loss": 0.2976,
"step": 213
},
{
"epoch": 0.5473145780051151,
"grad_norm": 2.2422966046960426,
"learning_rate": 8.263249220161957e-06,
"loss": 0.1768,
"step": 214
},
{
"epoch": 0.5498721227621484,
"grad_norm": 2.3319047512662,
"learning_rate": 8.248003886427927e-06,
"loss": 0.1948,
"step": 215
},
{
"epoch": 0.5524296675191815,
"grad_norm": 2.213024238241374,
"learning_rate": 8.232706132068806e-06,
"loss": 0.1195,
"step": 216
},
{
"epoch": 0.5549872122762148,
"grad_norm": 3.1675916876159262,
"learning_rate": 8.217356203980187e-06,
"loss": 0.193,
"step": 217
},
{
"epoch": 0.5575447570332481,
"grad_norm": 2.1931556698548826,
"learning_rate": 8.201954349899712e-06,
"loss": 0.2183,
"step": 218
},
{
"epoch": 0.5601023017902813,
"grad_norm": 2.1548375973876848,
"learning_rate": 8.186500818403076e-06,
"loss": 0.1331,
"step": 219
},
{
"epoch": 0.5626598465473146,
"grad_norm": 2.8937805193605017,
"learning_rate": 8.17099585890001e-06,
"loss": 0.2457,
"step": 220
},
{
"epoch": 0.5652173913043478,
"grad_norm": 2.1169776599974295,
"learning_rate": 8.155439721630265e-06,
"loss": 0.169,
"step": 221
},
{
"epoch": 0.5677749360613811,
"grad_norm": 2.4695153062225788,
"learning_rate": 8.139832657659557e-06,
"loss": 0.2079,
"step": 222
},
{
"epoch": 0.5703324808184144,
"grad_norm": 2.7432218121780942,
"learning_rate": 8.124174918875532e-06,
"loss": 0.2972,
"step": 223
},
{
"epoch": 0.5728900255754475,
"grad_norm": 2.8180099396258296,
"learning_rate": 8.108466757983695e-06,
"loss": 0.2098,
"step": 224
},
{
"epoch": 0.5754475703324808,
"grad_norm": 2.3696765683916508,
"learning_rate": 8.092708428503324e-06,
"loss": 0.1861,
"step": 225
},
{
"epoch": 0.578005115089514,
"grad_norm": 3.7161214440329964,
"learning_rate": 8.076900184763394e-06,
"loss": 0.2049,
"step": 226
},
{
"epoch": 0.5805626598465473,
"grad_norm": 2.353616152240531,
"learning_rate": 8.061042281898453e-06,
"loss": 0.2029,
"step": 227
},
{
"epoch": 0.5831202046035806,
"grad_norm": 2.6071768383498917,
"learning_rate": 8.04513497584452e-06,
"loss": 0.207,
"step": 228
},
{
"epoch": 0.5856777493606138,
"grad_norm": 2.871861928589528,
"learning_rate": 8.02917852333495e-06,
"loss": 0.274,
"step": 229
},
{
"epoch": 0.5882352941176471,
"grad_norm": 2.6285736054402036,
"learning_rate": 8.013173181896283e-06,
"loss": 0.2626,
"step": 230
},
{
"epoch": 0.5907928388746803,
"grad_norm": 2.4148081233137306,
"learning_rate": 7.9971192098441e-06,
"loss": 0.1275,
"step": 231
},
{
"epoch": 0.5933503836317136,
"grad_norm": 2.5465699530358643,
"learning_rate": 7.981016866278843e-06,
"loss": 0.1969,
"step": 232
},
{
"epoch": 0.5959079283887468,
"grad_norm": 2.45509535000344,
"learning_rate": 7.964866411081645e-06,
"loss": 0.2047,
"step": 233
},
{
"epoch": 0.59846547314578,
"grad_norm": 2.379257503910445,
"learning_rate": 7.94866810491012e-06,
"loss": 0.2049,
"step": 234
},
{
"epoch": 0.6010230179028133,
"grad_norm": 2.62551198927104,
"learning_rate": 7.93242220919417e-06,
"loss": 0.2194,
"step": 235
},
{
"epoch": 0.6035805626598465,
"grad_norm": 2.154755756722019,
"learning_rate": 7.916128986131761e-06,
"loss": 0.1858,
"step": 236
},
{
"epoch": 0.6061381074168798,
"grad_norm": 2.15917280811157,
"learning_rate": 7.899788698684687e-06,
"loss": 0.197,
"step": 237
},
{
"epoch": 0.6086956521739131,
"grad_norm": 2.9311613451724146,
"learning_rate": 7.883401610574338e-06,
"loss": 0.2576,
"step": 238
},
{
"epoch": 0.6112531969309463,
"grad_norm": 1.9858065195675783,
"learning_rate": 7.866967986277423e-06,
"loss": 0.1448,
"step": 239
},
{
"epoch": 0.6138107416879796,
"grad_norm": 2.4479250215085244,
"learning_rate": 7.850488091021726e-06,
"loss": 0.241,
"step": 240
},
{
"epoch": 0.6163682864450127,
"grad_norm": 2.9274230717517873,
"learning_rate": 7.833962190781809e-06,
"loss": 0.2245,
"step": 241
},
{
"epoch": 0.618925831202046,
"grad_norm": 2.0046579280922185,
"learning_rate": 7.817390552274721e-06,
"loss": 0.1377,
"step": 242
},
{
"epoch": 0.6214833759590793,
"grad_norm": 2.688691129256327,
"learning_rate": 7.800773442955703e-06,
"loss": 0.2236,
"step": 243
},
{
"epoch": 0.6240409207161125,
"grad_norm": 2.4420882136374384,
"learning_rate": 7.784111131013858e-06,
"loss": 0.1967,
"step": 244
},
{
"epoch": 0.6265984654731458,
"grad_norm": 2.395059320305546,
"learning_rate": 7.767403885367832e-06,
"loss": 0.1821,
"step": 245
},
{
"epoch": 0.629156010230179,
"grad_norm": 3.1294024434588468,
"learning_rate": 7.750651975661471e-06,
"loss": 0.2713,
"step": 246
},
{
"epoch": 0.6317135549872123,
"grad_norm": 2.4451446949008253,
"learning_rate": 7.733855672259472e-06,
"loss": 0.2237,
"step": 247
},
{
"epoch": 0.6342710997442456,
"grad_norm": 2.0190388654625173,
"learning_rate": 7.717015246243012e-06,
"loss": 0.1576,
"step": 248
},
{
"epoch": 0.6368286445012787,
"grad_norm": 2.8493759397799425,
"learning_rate": 7.700130969405377e-06,
"loss": 0.2196,
"step": 249
},
{
"epoch": 0.639386189258312,
"grad_norm": 2.9296895450026006,
"learning_rate": 7.683203114247587e-06,
"loss": 0.2143,
"step": 250
},
{
"epoch": 0.6419437340153452,
"grad_norm": 2.711043376771317,
"learning_rate": 7.66623195397397e-06,
"loss": 0.2276,
"step": 251
},
{
"epoch": 0.6445012787723785,
"grad_norm": 2.8063349249807827,
"learning_rate": 7.649217762487786e-06,
"loss": 0.243,
"step": 252
},
{
"epoch": 0.6470588235294118,
"grad_norm": 2.25381162153375,
"learning_rate": 7.63216081438678e-06,
"loss": 0.1902,
"step": 253
},
{
"epoch": 0.649616368286445,
"grad_norm": 2.0621493452179105,
"learning_rate": 7.615061384958764e-06,
"loss": 0.2048,
"step": 254
},
{
"epoch": 0.6521739130434783,
"grad_norm": 2.8241949462654294,
"learning_rate": 7.597919750177168e-06,
"loss": 0.2255,
"step": 255
},
{
"epoch": 0.6547314578005116,
"grad_norm": 2.283822865220664,
"learning_rate": 7.580736186696593e-06,
"loss": 0.1865,
"step": 256
},
{
"epoch": 0.6572890025575447,
"grad_norm": 2.9381754097047725,
"learning_rate": 7.563510971848339e-06,
"loss": 0.2054,
"step": 257
},
{
"epoch": 0.659846547314578,
"grad_norm": 2.411395623614636,
"learning_rate": 7.546244383635929e-06,
"loss": 0.2523,
"step": 258
},
{
"epoch": 0.6624040920716112,
"grad_norm": 2.3344681353933594,
"learning_rate": 7.528936700730627e-06,
"loss": 0.2203,
"step": 259
},
{
"epoch": 0.6649616368286445,
"grad_norm": 2.201988420125249,
"learning_rate": 7.5115882024669375e-06,
"loss": 0.1944,
"step": 260
},
{
"epoch": 0.6675191815856778,
"grad_norm": 2.560389049658497,
"learning_rate": 7.494199168838099e-06,
"loss": 0.2616,
"step": 261
},
{
"epoch": 0.670076726342711,
"grad_norm": 2.8818559971982523,
"learning_rate": 7.476769880491561e-06,
"loss": 0.2532,
"step": 262
},
{
"epoch": 0.6726342710997443,
"grad_norm": 2.3605699658219814,
"learning_rate": 7.459300618724462e-06,
"loss": 0.1874,
"step": 263
},
{
"epoch": 0.6751918158567775,
"grad_norm": 3.6724541267723065,
"learning_rate": 7.44179166547908e-06,
"loss": 0.2658,
"step": 264
},
{
"epoch": 0.6777493606138107,
"grad_norm": 2.6188857770286047,
"learning_rate": 7.42424330333829e-06,
"loss": 0.2566,
"step": 265
},
{
"epoch": 0.680306905370844,
"grad_norm": 2.379062702399823,
"learning_rate": 7.406655815520998e-06,
"loss": 0.1988,
"step": 266
},
{
"epoch": 0.6828644501278772,
"grad_norm": 2.623687689325637,
"learning_rate": 7.389029485877577e-06,
"loss": 0.2168,
"step": 267
},
{
"epoch": 0.6854219948849105,
"grad_norm": 2.0590068225537355,
"learning_rate": 7.371364598885276e-06,
"loss": 0.1858,
"step": 268
},
{
"epoch": 0.6879795396419437,
"grad_norm": 2.022043648648239,
"learning_rate": 7.353661439643638e-06,
"loss": 0.1643,
"step": 269
},
{
"epoch": 0.690537084398977,
"grad_norm": 2.1352711030685865,
"learning_rate": 7.335920293869891e-06,
"loss": 0.1904,
"step": 270
},
{
"epoch": 0.6930946291560103,
"grad_norm": 2.1395975567372973,
"learning_rate": 7.318141447894344e-06,
"loss": 0.1301,
"step": 271
},
{
"epoch": 0.6956521739130435,
"grad_norm": 2.358500093463299,
"learning_rate": 7.300325188655762e-06,
"loss": 0.233,
"step": 272
},
{
"epoch": 0.6982097186700768,
"grad_norm": 2.7648498326794475,
"learning_rate": 7.28247180369673e-06,
"loss": 0.2265,
"step": 273
},
{
"epoch": 0.7007672634271099,
"grad_norm": 3.2503327399881763,
"learning_rate": 7.264581581159024e-06,
"loss": 0.2357,
"step": 274
},
{
"epoch": 0.7033248081841432,
"grad_norm": 2.5417867541200874,
"learning_rate": 7.246654809778951e-06,
"loss": 0.2498,
"step": 275
},
{
"epoch": 0.7058823529411765,
"grad_norm": 2.3961400191083806,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.1844,
"step": 276
},
{
"epoch": 0.7084398976982097,
"grad_norm": 1.9891068084294126,
"learning_rate": 7.210692778381634e-06,
"loss": 0.193,
"step": 277
},
{
"epoch": 0.710997442455243,
"grad_norm": 3.2392544755285453,
"learning_rate": 7.192658098767686e-06,
"loss": 0.2539,
"step": 278
},
{
"epoch": 0.7135549872122762,
"grad_norm": 2.1786844692725773,
"learning_rate": 7.174588031108598e-06,
"loss": 0.1915,
"step": 279
},
{
"epoch": 0.7161125319693095,
"grad_norm": 2.6661413865093713,
"learning_rate": 7.1564828670432595e-06,
"loss": 0.2342,
"step": 280
},
{
"epoch": 0.7186700767263428,
"grad_norm": 2.9051881715043426,
"learning_rate": 7.138342898776989e-06,
"loss": 0.2147,
"step": 281
},
{
"epoch": 0.7212276214833759,
"grad_norm": 2.7853479622331436,
"learning_rate": 7.120168419076825e-06,
"loss": 0.2404,
"step": 282
},
{
"epoch": 0.7237851662404092,
"grad_norm": 2.62022869243389,
"learning_rate": 7.101959721266798e-06,
"loss": 0.2654,
"step": 283
},
{
"epoch": 0.7263427109974424,
"grad_norm": 2.8497861912336493,
"learning_rate": 7.083717099223192e-06,
"loss": 0.2363,
"step": 284
},
{
"epoch": 0.7289002557544757,
"grad_norm": 1.927888471947111,
"learning_rate": 7.0654408473698084e-06,
"loss": 0.1749,
"step": 285
},
{
"epoch": 0.731457800511509,
"grad_norm": 2.1417743769308846,
"learning_rate": 7.047131260673214e-06,
"loss": 0.1625,
"step": 286
},
{
"epoch": 0.7340153452685422,
"grad_norm": 2.7428996805182893,
"learning_rate": 7.0287886346379755e-06,
"loss": 0.2409,
"step": 287
},
{
"epoch": 0.7365728900255755,
"grad_norm": 2.882244454025018,
"learning_rate": 7.010413265301888e-06,
"loss": 0.2332,
"step": 288
},
{
"epoch": 0.7391304347826086,
"grad_norm": 2.289617076560806,
"learning_rate": 6.9920054492312086e-06,
"loss": 0.2432,
"step": 289
},
{
"epoch": 0.7416879795396419,
"grad_norm": 2.6660050033924674,
"learning_rate": 6.97356548351586e-06,
"loss": 0.2138,
"step": 290
},
{
"epoch": 0.7442455242966752,
"grad_norm": 2.2449127788273984,
"learning_rate": 6.9550936657646386e-06,
"loss": 0.1914,
"step": 291
},
{
"epoch": 0.7468030690537084,
"grad_norm": 2.578986965277836,
"learning_rate": 6.936590294100414e-06,
"loss": 0.1689,
"step": 292
},
{
"epoch": 0.7493606138107417,
"grad_norm": 2.678033357388885,
"learning_rate": 6.918055667155311e-06,
"loss": 0.1851,
"step": 293
},
{
"epoch": 0.7519181585677749,
"grad_norm": 2.2424142224493058,
"learning_rate": 6.899490084065897e-06,
"loss": 0.1656,
"step": 294
},
{
"epoch": 0.7544757033248082,
"grad_norm": 2.601517395679842,
"learning_rate": 6.8808938444683505e-06,
"loss": 0.1847,
"step": 295
},
{
"epoch": 0.7570332480818415,
"grad_norm": 2.6747405794799053,
"learning_rate": 6.862267248493624e-06,
"loss": 0.1879,
"step": 296
},
{
"epoch": 0.7595907928388747,
"grad_norm": 2.7534418874581807,
"learning_rate": 6.843610596762606e-06,
"loss": 0.2088,
"step": 297
},
{
"epoch": 0.7621483375959079,
"grad_norm": 2.4987719982507257,
"learning_rate": 6.824924190381257e-06,
"loss": 0.1564,
"step": 298
},
{
"epoch": 0.7647058823529411,
"grad_norm": 3.011560955755456,
"learning_rate": 6.806208330935766e-06,
"loss": 0.2279,
"step": 299
},
{
"epoch": 0.7672634271099744,
"grad_norm": 2.47182094190269,
"learning_rate": 6.7874633204876705e-06,
"loss": 0.1486,
"step": 300
},
{
"epoch": 0.7698209718670077,
"grad_norm": 2.325842317060422,
"learning_rate": 6.768689461568987e-06,
"loss": 0.188,
"step": 301
},
{
"epoch": 0.7723785166240409,
"grad_norm": 1.9846873076028098,
"learning_rate": 6.7498870571773275e-06,
"loss": 0.1817,
"step": 302
},
{
"epoch": 0.7749360613810742,
"grad_norm": 2.2567074601145594,
"learning_rate": 6.731056410771008e-06,
"loss": 0.1887,
"step": 303
},
{
"epoch": 0.7774936061381074,
"grad_norm": 2.6561654913648347,
"learning_rate": 6.712197826264154e-06,
"loss": 0.2012,
"step": 304
},
{
"epoch": 0.7800511508951407,
"grad_norm": 2.5575096912130033,
"learning_rate": 6.69331160802179e-06,
"loss": 0.193,
"step": 305
},
{
"epoch": 0.782608695652174,
"grad_norm": 2.3711184342063443,
"learning_rate": 6.674398060854931e-06,
"loss": 0.1485,
"step": 306
},
{
"epoch": 0.7851662404092071,
"grad_norm": 2.4200012568245115,
"learning_rate": 6.655457490015667e-06,
"loss": 0.2083,
"step": 307
},
{
"epoch": 0.7877237851662404,
"grad_norm": 1.9707518284091383,
"learning_rate": 6.636490201192229e-06,
"loss": 0.1813,
"step": 308
},
{
"epoch": 0.7902813299232737,
"grad_norm": 1.7181427371507578,
"learning_rate": 6.617496500504056e-06,
"loss": 0.0925,
"step": 309
},
{
"epoch": 0.7928388746803069,
"grad_norm": 2.3369129146849468,
"learning_rate": 6.5984766944968636e-06,
"loss": 0.1802,
"step": 310
},
{
"epoch": 0.7953964194373402,
"grad_norm": 2.631310750013353,
"learning_rate": 6.579431090137681e-06,
"loss": 0.195,
"step": 311
},
{
"epoch": 0.7979539641943734,
"grad_norm": 2.8165495006266985,
"learning_rate": 6.560359994809916e-06,
"loss": 0.3059,
"step": 312
},
{
"epoch": 0.8005115089514067,
"grad_norm": 2.5182286775381146,
"learning_rate": 6.541263716308375e-06,
"loss": 0.2114,
"step": 313
},
{
"epoch": 0.80306905370844,
"grad_norm": 3.1928195988480477,
"learning_rate": 6.522142562834307e-06,
"loss": 0.2564,
"step": 314
},
{
"epoch": 0.8056265984654731,
"grad_norm": 2.092609856779203,
"learning_rate": 6.502996842990431e-06,
"loss": 0.1982,
"step": 315
},
{
"epoch": 0.8081841432225064,
"grad_norm": 2.664912965399812,
"learning_rate": 6.483826865775941e-06,
"loss": 0.2371,
"step": 316
},
{
"epoch": 0.8107416879795396,
"grad_norm": 2.375675995524453,
"learning_rate": 6.46463294058154e-06,
"loss": 0.1792,
"step": 317
},
{
"epoch": 0.8132992327365729,
"grad_norm": 2.6330711752053557,
"learning_rate": 6.445415377184427e-06,
"loss": 0.2179,
"step": 318
},
{
"epoch": 0.8158567774936062,
"grad_norm": 2.377318815972245,
"learning_rate": 6.426174485743309e-06,
"loss": 0.1779,
"step": 319
},
{
"epoch": 0.8184143222506394,
"grad_norm": 2.047002161592419,
"learning_rate": 6.4069105767933944e-06,
"loss": 0.1914,
"step": 320
},
{
"epoch": 0.8209718670076727,
"grad_norm": 3.6604469943318816,
"learning_rate": 6.387623961241375e-06,
"loss": 0.269,
"step": 321
},
{
"epoch": 0.8235294117647058,
"grad_norm": 2.292929507372217,
"learning_rate": 6.368314950360416e-06,
"loss": 0.1641,
"step": 322
},
{
"epoch": 0.8260869565217391,
"grad_norm": 2.396691252809917,
"learning_rate": 6.348983855785122e-06,
"loss": 0.1481,
"step": 323
},
{
"epoch": 0.8286445012787724,
"grad_norm": 2.223495270908185,
"learning_rate": 6.3296309895065215e-06,
"loss": 0.1846,
"step": 324
},
{
"epoch": 0.8312020460358056,
"grad_norm": 2.77555766966037,
"learning_rate": 6.310256663867019e-06,
"loss": 0.1814,
"step": 325
},
{
"epoch": 0.8337595907928389,
"grad_norm": 3.0829837194013336,
"learning_rate": 6.290861191555359e-06,
"loss": 0.1887,
"step": 326
},
{
"epoch": 0.8363171355498721,
"grad_norm": 3.0603758797950085,
"learning_rate": 6.271444885601583e-06,
"loss": 0.2475,
"step": 327
},
{
"epoch": 0.8388746803069054,
"grad_norm": 2.8775211385673667,
"learning_rate": 6.252008059371968e-06,
"loss": 0.2194,
"step": 328
},
{
"epoch": 0.8414322250639387,
"grad_norm": 1.8794766662306952,
"learning_rate": 6.2325510265639785e-06,
"loss": 0.1518,
"step": 329
},
{
"epoch": 0.8439897698209718,
"grad_norm": 2.7832036572553225,
"learning_rate": 6.213074101201202e-06,
"loss": 0.1865,
"step": 330
},
{
"epoch": 0.8465473145780051,
"grad_norm": 2.3334923351434154,
"learning_rate": 6.193577597628268e-06,
"loss": 0.1868,
"step": 331
},
{
"epoch": 0.8491048593350383,
"grad_norm": 2.02193771649545,
"learning_rate": 6.174061830505801e-06,
"loss": 0.1716,
"step": 332
},
{
"epoch": 0.8516624040920716,
"grad_norm": 2.863073171004623,
"learning_rate": 6.154527114805312e-06,
"loss": 0.2744,
"step": 333
},
{
"epoch": 0.8542199488491049,
"grad_norm": 2.265853554867697,
"learning_rate": 6.1349737658041385e-06,
"loss": 0.2205,
"step": 334
},
{
"epoch": 0.8567774936061381,
"grad_norm": 2.3251872190009006,
"learning_rate": 6.115402099080345e-06,
"loss": 0.1638,
"step": 335
},
{
"epoch": 0.8593350383631714,
"grad_norm": 2.203257515903664,
"learning_rate": 6.095812430507627e-06,
"loss": 0.1257,
"step": 336
},
{
"epoch": 0.8618925831202046,
"grad_norm": 2.7857913202564677,
"learning_rate": 6.076205076250227e-06,
"loss": 0.2098,
"step": 337
},
{
"epoch": 0.8644501278772379,
"grad_norm": 2.6357008969113305,
"learning_rate": 6.056580352757813e-06,
"loss": 0.2217,
"step": 338
},
{
"epoch": 0.8670076726342711,
"grad_norm": 2.211934388483927,
"learning_rate": 6.036938576760388e-06,
"loss": 0.1638,
"step": 339
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.8993360773251713,
"learning_rate": 6.0172800652631706e-06,
"loss": 0.1501,
"step": 340
},
{
"epoch": 0.8721227621483376,
"grad_norm": 2.6231453652485888,
"learning_rate": 5.997605135541472e-06,
"loss": 0.1895,
"step": 341
},
{
"epoch": 0.8746803069053708,
"grad_norm": 2.5439694565298065,
"learning_rate": 5.977914105135594e-06,
"loss": 0.2258,
"step": 342
},
{
"epoch": 0.8772378516624041,
"grad_norm": 1.9699119486573233,
"learning_rate": 5.9582072918456805e-06,
"loss": 0.1393,
"step": 343
},
{
"epoch": 0.8797953964194374,
"grad_norm": 2.350677864064173,
"learning_rate": 5.938485013726612e-06,
"loss": 0.2093,
"step": 344
},
{
"epoch": 0.8823529411764706,
"grad_norm": 2.134194035147575,
"learning_rate": 5.918747589082853e-06,
"loss": 0.1793,
"step": 345
},
{
"epoch": 0.8849104859335039,
"grad_norm": 1.8936481603379223,
"learning_rate": 5.898995336463326e-06,
"loss": 0.1789,
"step": 346
},
{
"epoch": 0.887468030690537,
"grad_norm": 2.1007267584688045,
"learning_rate": 5.879228574656269e-06,
"loss": 0.1288,
"step": 347
},
{
"epoch": 0.8900255754475703,
"grad_norm": 3.051111060833368,
"learning_rate": 5.859447622684084e-06,
"loss": 0.2582,
"step": 348
},
{
"epoch": 0.8925831202046036,
"grad_norm": 2.236462377355688,
"learning_rate": 5.839652799798197e-06,
"loss": 0.1863,
"step": 349
},
{
"epoch": 0.8951406649616368,
"grad_norm": 3.1932643905276294,
"learning_rate": 5.819844425473899e-06,
"loss": 0.2649,
"step": 350
},
{
"epoch": 0.8976982097186701,
"grad_norm": 2.335957164933005,
"learning_rate": 5.800022819405194e-06,
"loss": 0.1919,
"step": 351
},
{
"epoch": 0.9002557544757033,
"grad_norm": 2.393627657617242,
"learning_rate": 5.780188301499636e-06,
"loss": 0.2248,
"step": 352
},
{
"epoch": 0.9028132992327366,
"grad_norm": 3.0419193632396593,
"learning_rate": 5.760341191873167e-06,
"loss": 0.2492,
"step": 353
},
{
"epoch": 0.9053708439897699,
"grad_norm": 2.789673666006855,
"learning_rate": 5.740481810844952e-06,
"loss": 0.2148,
"step": 354
},
{
"epoch": 0.907928388746803,
"grad_norm": 2.177256387807687,
"learning_rate": 5.720610478932211e-06,
"loss": 0.1572,
"step": 355
},
{
"epoch": 0.9104859335038363,
"grad_norm": 2.5575844668599004,
"learning_rate": 5.700727516845038e-06,
"loss": 0.1791,
"step": 356
},
{
"epoch": 0.9130434782608695,
"grad_norm": 2.159477204677751,
"learning_rate": 5.680833245481234e-06,
"loss": 0.2066,
"step": 357
},
{
"epoch": 0.9156010230179028,
"grad_norm": 2.620579576653764,
"learning_rate": 5.660927985921122e-06,
"loss": 0.2152,
"step": 358
},
{
"epoch": 0.9181585677749361,
"grad_norm": 2.4186482484527465,
"learning_rate": 5.641012059422369e-06,
"loss": 0.2289,
"step": 359
},
{
"epoch": 0.9207161125319693,
"grad_norm": 2.47409420791087,
"learning_rate": 5.621085787414799e-06,
"loss": 0.2251,
"step": 360
},
{
"epoch": 0.9232736572890026,
"grad_norm": 2.4391802732543675,
"learning_rate": 5.601149491495206e-06,
"loss": 0.2096,
"step": 361
},
{
"epoch": 0.9258312020460358,
"grad_norm": 2.1522496402929763,
"learning_rate": 5.581203493422161e-06,
"loss": 0.2098,
"step": 362
},
{
"epoch": 0.928388746803069,
"grad_norm": 2.051258819209543,
"learning_rate": 5.561248115110822e-06,
"loss": 0.1607,
"step": 363
},
{
"epoch": 0.9309462915601023,
"grad_norm": 2.826498031720456,
"learning_rate": 5.541283678627742e-06,
"loss": 0.1787,
"step": 364
},
{
"epoch": 0.9335038363171355,
"grad_norm": 1.9969316599555271,
"learning_rate": 5.521310506185661e-06,
"loss": 0.1489,
"step": 365
},
{
"epoch": 0.9360613810741688,
"grad_norm": 2.718834305160498,
"learning_rate": 5.501328920138314e-06,
"loss": 0.2199,
"step": 366
},
{
"epoch": 0.9386189258312021,
"grad_norm": 2.5390949603608517,
"learning_rate": 5.481339242975227e-06,
"loss": 0.1642,
"step": 367
},
{
"epoch": 0.9411764705882353,
"grad_norm": 2.5326981562462043,
"learning_rate": 5.46134179731651e-06,
"loss": 0.197,
"step": 368
},
{
"epoch": 0.9437340153452686,
"grad_norm": 2.3314348462475745,
"learning_rate": 5.441336905907653e-06,
"loss": 0.1948,
"step": 369
},
{
"epoch": 0.9462915601023018,
"grad_norm": 2.953899718678035,
"learning_rate": 5.421324891614312e-06,
"loss": 0.2213,
"step": 370
},
{
"epoch": 0.948849104859335,
"grad_norm": 1.8019263635252813,
"learning_rate": 5.4013060774171055e-06,
"loss": 0.1613,
"step": 371
},
{
"epoch": 0.9514066496163683,
"grad_norm": 2.6407174472584263,
"learning_rate": 5.3812807864063946e-06,
"loss": 0.2338,
"step": 372
},
{
"epoch": 0.9539641943734015,
"grad_norm": 2.222112415501572,
"learning_rate": 5.361249341777075e-06,
"loss": 0.1829,
"step": 373
},
{
"epoch": 0.9565217391304348,
"grad_norm": 2.2070653031081724,
"learning_rate": 5.341212066823356e-06,
"loss": 0.1999,
"step": 374
},
{
"epoch": 0.959079283887468,
"grad_norm": 2.7743426313075625,
"learning_rate": 5.321169284933543e-06,
"loss": 0.2185,
"step": 375
},
{
"epoch": 0.9616368286445013,
"grad_norm": 2.248190406589497,
"learning_rate": 5.3011213195848245e-06,
"loss": 0.2421,
"step": 376
},
{
"epoch": 0.9641943734015346,
"grad_norm": 2.7068752652927213,
"learning_rate": 5.281068494338039e-06,
"loss": 0.1734,
"step": 377
},
{
"epoch": 0.9667519181585678,
"grad_norm": 2.192826902317446,
"learning_rate": 5.26101113283247e-06,
"loss": 0.1515,
"step": 378
},
{
"epoch": 0.969309462915601,
"grad_norm": 2.7221466363290046,
"learning_rate": 5.240949558780605e-06,
"loss": 0.1996,
"step": 379
},
{
"epoch": 0.9718670076726342,
"grad_norm": 2.5816644472067947,
"learning_rate": 5.220884095962924e-06,
"loss": 0.2146,
"step": 380
},
{
"epoch": 0.9744245524296675,
"grad_norm": 2.6603299397494014,
"learning_rate": 5.200815068222666e-06,
"loss": 0.1963,
"step": 381
},
{
"epoch": 0.9769820971867008,
"grad_norm": 2.043440881296807,
"learning_rate": 5.1807427994606065e-06,
"loss": 0.1299,
"step": 382
},
{
"epoch": 0.979539641943734,
"grad_norm": 3.09946402567714,
"learning_rate": 5.1606676136298305e-06,
"loss": 0.2043,
"step": 383
},
{
"epoch": 0.9820971867007673,
"grad_norm": 2.0882076768035605,
"learning_rate": 5.140589834730503e-06,
"loss": 0.1652,
"step": 384
},
{
"epoch": 0.9846547314578005,
"grad_norm": 2.2528010881334164,
"learning_rate": 5.120509786804635e-06,
"loss": 0.2062,
"step": 385
},
{
"epoch": 0.9872122762148338,
"grad_norm": 2.555764169894337,
"learning_rate": 5.100427793930862e-06,
"loss": 0.2093,
"step": 386
},
{
"epoch": 0.989769820971867,
"grad_norm": 2.8157882383013435,
"learning_rate": 5.08034418021921e-06,
"loss": 0.2087,
"step": 387
},
{
"epoch": 0.9923273657289002,
"grad_norm": 1.9976132604345895,
"learning_rate": 5.06025926980586e-06,
"loss": 0.1367,
"step": 388
},
{
"epoch": 0.9948849104859335,
"grad_norm": 1.919401790892785,
"learning_rate": 5.040173386847926e-06,
"loss": 0.1615,
"step": 389
},
{
"epoch": 0.9974424552429667,
"grad_norm": 2.262839581690866,
"learning_rate": 5.0200868555182155e-06,
"loss": 0.2022,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 2.153092207417964,
"learning_rate": 5e-06,
"loss": 0.172,
"step": 391
},
{
"epoch": 1.0025575447570332,
"grad_norm": 1.5698547864039567,
"learning_rate": 4.979913144481785e-06,
"loss": 0.0923,
"step": 392
},
{
"epoch": 1.0051150895140666,
"grad_norm": 1.7196677143098391,
"learning_rate": 4.959826613152074e-06,
"loss": 0.0905,
"step": 393
},
{
"epoch": 1.0076726342710998,
"grad_norm": 1.6337325297432321,
"learning_rate": 4.939740730194141e-06,
"loss": 0.0937,
"step": 394
},
{
"epoch": 1.010230179028133,
"grad_norm": 1.546693698592758,
"learning_rate": 4.919655819780792e-06,
"loss": 0.0719,
"step": 395
},
{
"epoch": 1.0127877237851663,
"grad_norm": 1.210961794524791,
"learning_rate": 4.899572206069138e-06,
"loss": 0.0735,
"step": 396
},
{
"epoch": 1.0153452685421995,
"grad_norm": 1.4546180756017038,
"learning_rate": 4.879490213195366e-06,
"loss": 0.0834,
"step": 397
},
{
"epoch": 1.0179028132992327,
"grad_norm": 2.134512736663491,
"learning_rate": 4.8594101652694996e-06,
"loss": 0.1081,
"step": 398
},
{
"epoch": 1.020460358056266,
"grad_norm": 1.7947252889220031,
"learning_rate": 4.839332386370171e-06,
"loss": 0.0797,
"step": 399
},
{
"epoch": 1.0230179028132993,
"grad_norm": 1.4906221815114593,
"learning_rate": 4.819257200539394e-06,
"loss": 0.0809,
"step": 400
},
{
"epoch": 1.0255754475703325,
"grad_norm": 1.4752311089192027,
"learning_rate": 4.799184931777337e-06,
"loss": 0.0915,
"step": 401
},
{
"epoch": 1.0281329923273657,
"grad_norm": 1.750489629718268,
"learning_rate": 4.779115904037079e-06,
"loss": 0.0646,
"step": 402
},
{
"epoch": 1.030690537084399,
"grad_norm": 1.6512373102135207,
"learning_rate": 4.759050441219395e-06,
"loss": 0.0628,
"step": 403
},
{
"epoch": 1.0332480818414322,
"grad_norm": 1.691044169536105,
"learning_rate": 4.738988867167531e-06,
"loss": 0.063,
"step": 404
},
{
"epoch": 1.0358056265984654,
"grad_norm": 2.0322017046197156,
"learning_rate": 4.718931505661961e-06,
"loss": 0.0774,
"step": 405
},
{
"epoch": 1.0383631713554988,
"grad_norm": 1.9548414770728504,
"learning_rate": 4.698878680415176e-06,
"loss": 0.0761,
"step": 406
},
{
"epoch": 1.040920716112532,
"grad_norm": 1.8154231893157877,
"learning_rate": 4.678830715066458e-06,
"loss": 0.0663,
"step": 407
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.908769325692329,
"learning_rate": 4.6587879331766465e-06,
"loss": 0.0797,
"step": 408
},
{
"epoch": 1.0460358056265984,
"grad_norm": 1.8323259285844344,
"learning_rate": 4.638750658222927e-06,
"loss": 0.0924,
"step": 409
},
{
"epoch": 1.0485933503836318,
"grad_norm": 1.5414803826730832,
"learning_rate": 4.618719213593605e-06,
"loss": 0.0634,
"step": 410
},
{
"epoch": 1.051150895140665,
"grad_norm": 1.6613587791845974,
"learning_rate": 4.598693922582896e-06,
"loss": 0.0746,
"step": 411
},
{
"epoch": 1.0537084398976981,
"grad_norm": 2.2412707215008387,
"learning_rate": 4.5786751083856895e-06,
"loss": 0.0867,
"step": 412
},
{
"epoch": 1.0562659846547315,
"grad_norm": 1.9062228570199347,
"learning_rate": 4.558663094092348e-06,
"loss": 0.0911,
"step": 413
},
{
"epoch": 1.0588235294117647,
"grad_norm": 2.0731838014599084,
"learning_rate": 4.53865820268349e-06,
"loss": 0.101,
"step": 414
},
{
"epoch": 1.061381074168798,
"grad_norm": 2.269864643199699,
"learning_rate": 4.518660757024774e-06,
"loss": 0.0712,
"step": 415
},
{
"epoch": 1.0639386189258313,
"grad_norm": 1.980016755904863,
"learning_rate": 4.498671079861686e-06,
"loss": 0.0925,
"step": 416
},
{
"epoch": 1.0664961636828645,
"grad_norm": 1.8104325090018185,
"learning_rate": 4.478689493814341e-06,
"loss": 0.0938,
"step": 417
},
{
"epoch": 1.0690537084398977,
"grad_norm": 2.122828708966008,
"learning_rate": 4.4587163213722595e-06,
"loss": 0.098,
"step": 418
},
{
"epoch": 1.0716112531969308,
"grad_norm": 1.7514752364051867,
"learning_rate": 4.438751884889179e-06,
"loss": 0.0807,
"step": 419
},
{
"epoch": 1.0741687979539642,
"grad_norm": 1.704079437046455,
"learning_rate": 4.41879650657784e-06,
"loss": 0.063,
"step": 420
},
{
"epoch": 1.0767263427109974,
"grad_norm": 1.8407856408139254,
"learning_rate": 4.398850508504795e-06,
"loss": 0.0635,
"step": 421
},
{
"epoch": 1.0792838874680306,
"grad_norm": 1.3786332639992318,
"learning_rate": 4.3789142125852015e-06,
"loss": 0.0627,
"step": 422
},
{
"epoch": 1.081841432225064,
"grad_norm": 1.9012435964224539,
"learning_rate": 4.358987940577631e-06,
"loss": 0.0842,
"step": 423
},
{
"epoch": 1.0843989769820972,
"grad_norm": 1.8321649236974789,
"learning_rate": 4.339072014078879e-06,
"loss": 0.0716,
"step": 424
},
{
"epoch": 1.0869565217391304,
"grad_norm": 2.4106609123907665,
"learning_rate": 4.319166754518768e-06,
"loss": 0.0713,
"step": 425
},
{
"epoch": 1.0895140664961638,
"grad_norm": 2.439541112929831,
"learning_rate": 4.299272483154963e-06,
"loss": 0.0908,
"step": 426
},
{
"epoch": 1.092071611253197,
"grad_norm": 2.2334712493856554,
"learning_rate": 4.27938952106779e-06,
"loss": 0.0882,
"step": 427
},
{
"epoch": 1.0946291560102301,
"grad_norm": 1.2884249231332807,
"learning_rate": 4.259518189155049e-06,
"loss": 0.0575,
"step": 428
},
{
"epoch": 1.0971867007672633,
"grad_norm": 1.9023989436254016,
"learning_rate": 4.2396588081268355e-06,
"loss": 0.0661,
"step": 429
},
{
"epoch": 1.0997442455242967,
"grad_norm": 2.3807159026641345,
"learning_rate": 4.219811698500365e-06,
"loss": 0.1393,
"step": 430
},
{
"epoch": 1.10230179028133,
"grad_norm": 1.511245707372203,
"learning_rate": 4.199977180594807e-06,
"loss": 0.0557,
"step": 431
},
{
"epoch": 1.104859335038363,
"grad_norm": 2.213364286263777,
"learning_rate": 4.1801555745261025e-06,
"loss": 0.0703,
"step": 432
},
{
"epoch": 1.1074168797953965,
"grad_norm": 1.5318293474866007,
"learning_rate": 4.160347200201804e-06,
"loss": 0.0897,
"step": 433
},
{
"epoch": 1.1099744245524297,
"grad_norm": 1.6444289494653057,
"learning_rate": 4.140552377315918e-06,
"loss": 0.0839,
"step": 434
},
{
"epoch": 1.1125319693094629,
"grad_norm": 1.8090163189909276,
"learning_rate": 4.120771425343733e-06,
"loss": 0.0597,
"step": 435
},
{
"epoch": 1.1150895140664963,
"grad_norm": 1.7790317490199443,
"learning_rate": 4.101004663536675e-06,
"loss": 0.0776,
"step": 436
},
{
"epoch": 1.1176470588235294,
"grad_norm": 1.597885560258774,
"learning_rate": 4.081252410917148e-06,
"loss": 0.0779,
"step": 437
},
{
"epoch": 1.1202046035805626,
"grad_norm": 1.8656620983935601,
"learning_rate": 4.061514986273391e-06,
"loss": 0.0744,
"step": 438
},
{
"epoch": 1.1227621483375958,
"grad_norm": 2.3111425176888902,
"learning_rate": 4.041792708154321e-06,
"loss": 0.1111,
"step": 439
},
{
"epoch": 1.1253196930946292,
"grad_norm": 1.3808204885398507,
"learning_rate": 4.022085894864408e-06,
"loss": 0.0607,
"step": 440
},
{
"epoch": 1.1278772378516624,
"grad_norm": 1.540584019757327,
"learning_rate": 4.0023948644585294e-06,
"loss": 0.0685,
"step": 441
},
{
"epoch": 1.1304347826086956,
"grad_norm": 2.5606144468666137,
"learning_rate": 3.982719934736832e-06,
"loss": 0.0948,
"step": 442
},
{
"epoch": 1.132992327365729,
"grad_norm": 1.4721842128661413,
"learning_rate": 3.963061423239612e-06,
"loss": 0.0813,
"step": 443
},
{
"epoch": 1.1355498721227621,
"grad_norm": 2.0215762888887565,
"learning_rate": 3.943419647242189e-06,
"loss": 0.0865,
"step": 444
},
{
"epoch": 1.1381074168797953,
"grad_norm": 1.698982836870589,
"learning_rate": 3.923794923749775e-06,
"loss": 0.0886,
"step": 445
},
{
"epoch": 1.1406649616368287,
"grad_norm": 1.4422178530936527,
"learning_rate": 3.904187569492373e-06,
"loss": 0.061,
"step": 446
},
{
"epoch": 1.143222506393862,
"grad_norm": 1.6568446058876403,
"learning_rate": 3.884597900919656e-06,
"loss": 0.072,
"step": 447
},
{
"epoch": 1.145780051150895,
"grad_norm": 1.6692733646389666,
"learning_rate": 3.865026234195863e-06,
"loss": 0.064,
"step": 448
},
{
"epoch": 1.1483375959079285,
"grad_norm": 1.9178605584493544,
"learning_rate": 3.8454728851946885e-06,
"loss": 0.0797,
"step": 449
},
{
"epoch": 1.1508951406649617,
"grad_norm": 1.624572665749194,
"learning_rate": 3.8259381694942e-06,
"loss": 0.0769,
"step": 450
},
{
"epoch": 1.1534526854219949,
"grad_norm": 1.4633601037928454,
"learning_rate": 3.806422402371733e-06,
"loss": 0.0596,
"step": 451
},
{
"epoch": 1.156010230179028,
"grad_norm": 1.6961811237423687,
"learning_rate": 3.786925898798801e-06,
"loss": 0.0779,
"step": 452
},
{
"epoch": 1.1585677749360614,
"grad_norm": 1.3744500648620386,
"learning_rate": 3.767448973436021e-06,
"loss": 0.0666,
"step": 453
},
{
"epoch": 1.1611253196930946,
"grad_norm": 2.179426339865248,
"learning_rate": 3.7479919406280334e-06,
"loss": 0.0841,
"step": 454
},
{
"epoch": 1.1636828644501278,
"grad_norm": 1.816045503492125,
"learning_rate": 3.728555114398419e-06,
"loss": 0.0715,
"step": 455
},
{
"epoch": 1.1662404092071612,
"grad_norm": 1.9881894419562507,
"learning_rate": 3.709138808444641e-06,
"loss": 0.0799,
"step": 456
},
{
"epoch": 1.1687979539641944,
"grad_norm": 1.49466287653342,
"learning_rate": 3.689743336132982e-06,
"loss": 0.0516,
"step": 457
},
{
"epoch": 1.1713554987212276,
"grad_norm": 1.3842976468768091,
"learning_rate": 3.6703690104934806e-06,
"loss": 0.0655,
"step": 458
},
{
"epoch": 1.1739130434782608,
"grad_norm": 1.754609078905008,
"learning_rate": 3.6510161442148783e-06,
"loss": 0.0685,
"step": 459
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.4943900089831539,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.0726,
"step": 460
},
{
"epoch": 1.1790281329923273,
"grad_norm": 1.5559428054805928,
"learning_rate": 3.6123760387586265e-06,
"loss": 0.0627,
"step": 461
},
{
"epoch": 1.1815856777493605,
"grad_norm": 1.6718429584640253,
"learning_rate": 3.5930894232066072e-06,
"loss": 0.0601,
"step": 462
},
{
"epoch": 1.184143222506394,
"grad_norm": 1.8714176452111198,
"learning_rate": 3.5738255142566912e-06,
"loss": 0.092,
"step": 463
},
{
"epoch": 1.186700767263427,
"grad_norm": 1.6615141671047535,
"learning_rate": 3.5545846228155743e-06,
"loss": 0.0608,
"step": 464
},
{
"epoch": 1.1892583120204603,
"grad_norm": 1.5566192447860165,
"learning_rate": 3.5353670594184623e-06,
"loss": 0.0578,
"step": 465
},
{
"epoch": 1.1918158567774937,
"grad_norm": 2.0925827246711686,
"learning_rate": 3.516173134224059e-06,
"loss": 0.0864,
"step": 466
},
{
"epoch": 1.1943734015345269,
"grad_norm": 1.5682869130335337,
"learning_rate": 3.4970031570095707e-06,
"loss": 0.0563,
"step": 467
},
{
"epoch": 1.19693094629156,
"grad_norm": 1.57609443913993,
"learning_rate": 3.477857437165694e-06,
"loss": 0.0711,
"step": 468
},
{
"epoch": 1.1994884910485935,
"grad_norm": 1.7895753498943323,
"learning_rate": 3.458736283691626e-06,
"loss": 0.0712,
"step": 469
},
{
"epoch": 1.2020460358056266,
"grad_norm": 1.6643459994448309,
"learning_rate": 3.4396400051900846e-06,
"loss": 0.059,
"step": 470
},
{
"epoch": 1.2046035805626598,
"grad_norm": 1.3441611085327887,
"learning_rate": 3.4205689098623195e-06,
"loss": 0.0488,
"step": 471
},
{
"epoch": 1.207161125319693,
"grad_norm": 1.1211357993124325,
"learning_rate": 3.401523305503139e-06,
"loss": 0.0371,
"step": 472
},
{
"epoch": 1.2097186700767264,
"grad_norm": 2.0660355539669393,
"learning_rate": 3.3825034994959445e-06,
"loss": 0.0947,
"step": 473
},
{
"epoch": 1.2122762148337596,
"grad_norm": 1.608787114297835,
"learning_rate": 3.3635097988077724e-06,
"loss": 0.074,
"step": 474
},
{
"epoch": 1.2148337595907928,
"grad_norm": 1.7486815265862035,
"learning_rate": 3.3445425099843343e-06,
"loss": 0.0742,
"step": 475
},
{
"epoch": 1.2173913043478262,
"grad_norm": 1.5604908568521394,
"learning_rate": 3.3256019391450696e-06,
"loss": 0.0514,
"step": 476
},
{
"epoch": 1.2199488491048593,
"grad_norm": 2.2122019533355983,
"learning_rate": 3.3066883919782116e-06,
"loss": 0.1137,
"step": 477
},
{
"epoch": 1.2225063938618925,
"grad_norm": 1.8080383259958446,
"learning_rate": 3.287802173735848e-06,
"loss": 0.0709,
"step": 478
},
{
"epoch": 1.2250639386189257,
"grad_norm": 1.6502758829199498,
"learning_rate": 3.268943589228992e-06,
"loss": 0.0648,
"step": 479
},
{
"epoch": 1.227621483375959,
"grad_norm": 2.365736744384701,
"learning_rate": 3.250112942822673e-06,
"loss": 0.0759,
"step": 480
},
{
"epoch": 1.2301790281329923,
"grad_norm": 2.097762143569881,
"learning_rate": 3.231310538431015e-06,
"loss": 0.0966,
"step": 481
},
{
"epoch": 1.2327365728900257,
"grad_norm": 2.112263985795022,
"learning_rate": 3.212536679512332e-06,
"loss": 0.0801,
"step": 482
},
{
"epoch": 1.2352941176470589,
"grad_norm": 1.6316550742720446,
"learning_rate": 3.1937916690642356e-06,
"loss": 0.0585,
"step": 483
},
{
"epoch": 1.237851662404092,
"grad_norm": 2.051437980934914,
"learning_rate": 3.1750758096187446e-06,
"loss": 0.0954,
"step": 484
},
{
"epoch": 1.2404092071611252,
"grad_norm": 1.6248985041395814,
"learning_rate": 3.1563894032373977e-06,
"loss": 0.0705,
"step": 485
},
{
"epoch": 1.2429667519181586,
"grad_norm": 1.8808984876425323,
"learning_rate": 3.137732751506376e-06,
"loss": 0.0699,
"step": 486
},
{
"epoch": 1.2455242966751918,
"grad_norm": 1.6657687470775868,
"learning_rate": 3.1191061555316503e-06,
"loss": 0.0725,
"step": 487
},
{
"epoch": 1.248081841432225,
"grad_norm": 1.6204503721283818,
"learning_rate": 3.1005099159341044e-06,
"loss": 0.0705,
"step": 488
},
{
"epoch": 1.2506393861892584,
"grad_norm": 1.7946799973401157,
"learning_rate": 3.08194433284469e-06,
"loss": 0.0939,
"step": 489
},
{
"epoch": 1.2531969309462916,
"grad_norm": 1.5436173873028685,
"learning_rate": 3.0634097058995877e-06,
"loss": 0.0642,
"step": 490
},
{
"epoch": 1.2557544757033248,
"grad_norm": 2.5025377910535602,
"learning_rate": 3.0449063342353635e-06,
"loss": 0.089,
"step": 491
},
{
"epoch": 1.258312020460358,
"grad_norm": 1.846350151404866,
"learning_rate": 3.0264345164841426e-06,
"loss": 0.0797,
"step": 492
},
{
"epoch": 1.2608695652173914,
"grad_norm": 1.4014902867742467,
"learning_rate": 3.007994550768793e-06,
"loss": 0.0529,
"step": 493
},
{
"epoch": 1.2634271099744245,
"grad_norm": 1.749293050973978,
"learning_rate": 2.989586734698113e-06,
"loss": 0.066,
"step": 494
},
{
"epoch": 1.265984654731458,
"grad_norm": 1.677143321270154,
"learning_rate": 2.971211365362028e-06,
"loss": 0.0717,
"step": 495
},
{
"epoch": 1.2685421994884911,
"grad_norm": 1.4865811657524735,
"learning_rate": 2.9528687393267865e-06,
"loss": 0.0746,
"step": 496
},
{
"epoch": 1.2710997442455243,
"grad_norm": 1.581847266608915,
"learning_rate": 2.934559152630192e-06,
"loss": 0.0732,
"step": 497
},
{
"epoch": 1.2736572890025575,
"grad_norm": 1.7806387171278764,
"learning_rate": 2.9162829007768103e-06,
"loss": 0.0919,
"step": 498
},
{
"epoch": 1.2762148337595907,
"grad_norm": 1.5745970726671847,
"learning_rate": 2.898040278733203e-06,
"loss": 0.0719,
"step": 499
},
{
"epoch": 1.278772378516624,
"grad_norm": 1.6462836730413777,
"learning_rate": 2.879831580923176e-06,
"loss": 0.0652,
"step": 500
},
{
"epoch": 1.278772378516624,
"eval_loss": 0.21273775398731232,
"eval_runtime": 4.5782,
"eval_samples_per_second": 6.99,
"eval_steps_per_second": 1.747,
"step": 500
},
{
"epoch": 1.2813299232736572,
"grad_norm": 1.943689809670517,
"learning_rate": 2.8616571012230134e-06,
"loss": 0.0896,
"step": 501
},
{
"epoch": 1.2838874680306906,
"grad_norm": 1.7108775240717138,
"learning_rate": 2.843517132956742e-06,
"loss": 0.0829,
"step": 502
},
{
"epoch": 1.2864450127877238,
"grad_norm": 1.5745418008984486,
"learning_rate": 2.8254119688914017e-06,
"loss": 0.0761,
"step": 503
},
{
"epoch": 1.289002557544757,
"grad_norm": 1.5315465963046555,
"learning_rate": 2.8073419012323154e-06,
"loss": 0.0798,
"step": 504
},
{
"epoch": 1.2915601023017902,
"grad_norm": 1.5085834635172504,
"learning_rate": 2.789307221618369e-06,
"loss": 0.0696,
"step": 505
},
{
"epoch": 1.2941176470588236,
"grad_norm": 2.0317808899575933,
"learning_rate": 2.771308221117309e-06,
"loss": 0.1059,
"step": 506
},
{
"epoch": 1.2966751918158568,
"grad_norm": 1.4613005775169157,
"learning_rate": 2.7533451902210512e-06,
"loss": 0.0603,
"step": 507
},
{
"epoch": 1.29923273657289,
"grad_norm": 1.8605111151110105,
"learning_rate": 2.7354184188409773e-06,
"loss": 0.0888,
"step": 508
},
{
"epoch": 1.3017902813299234,
"grad_norm": 1.617241995750445,
"learning_rate": 2.71752819630327e-06,
"loss": 0.0737,
"step": 509
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.8548161748439362,
"learning_rate": 2.6996748113442397e-06,
"loss": 0.0663,
"step": 510
},
{
"epoch": 1.3069053708439897,
"grad_norm": 1.8033275920362444,
"learning_rate": 2.6818585521056573e-06,
"loss": 0.0704,
"step": 511
},
{
"epoch": 1.309462915601023,
"grad_norm": 1.8578383533525582,
"learning_rate": 2.66407970613011e-06,
"loss": 0.0754,
"step": 512
},
{
"epoch": 1.3120204603580563,
"grad_norm": 1.6455284536390185,
"learning_rate": 2.646338560356363e-06,
"loss": 0.0545,
"step": 513
},
{
"epoch": 1.3145780051150895,
"grad_norm": 1.6433378276314705,
"learning_rate": 2.6286354011147252e-06,
"loss": 0.063,
"step": 514
},
{
"epoch": 1.317135549872123,
"grad_norm": 2.305871703667012,
"learning_rate": 2.6109705141224255e-06,
"loss": 0.1028,
"step": 515
},
{
"epoch": 1.319693094629156,
"grad_norm": 1.5367647731836724,
"learning_rate": 2.593344184479003e-06,
"loss": 0.066,
"step": 516
},
{
"epoch": 1.3222506393861893,
"grad_norm": 1.6097087447086145,
"learning_rate": 2.575756696661713e-06,
"loss": 0.0582,
"step": 517
},
{
"epoch": 1.3248081841432224,
"grad_norm": 1.214868677490465,
"learning_rate": 2.5582083345209217e-06,
"loss": 0.057,
"step": 518
},
{
"epoch": 1.3273657289002558,
"grad_norm": 1.7532789333330783,
"learning_rate": 2.540699381275539e-06,
"loss": 0.0784,
"step": 519
},
{
"epoch": 1.329923273657289,
"grad_norm": 1.535396007238416,
"learning_rate": 2.5232301195084395e-06,
"loss": 0.061,
"step": 520
},
{
"epoch": 1.3324808184143222,
"grad_norm": 1.961040557523754,
"learning_rate": 2.5058008311619035e-06,
"loss": 0.0774,
"step": 521
},
{
"epoch": 1.3350383631713556,
"grad_norm": 1.6548886187642289,
"learning_rate": 2.488411797533064e-06,
"loss": 0.0497,
"step": 522
},
{
"epoch": 1.3375959079283888,
"grad_norm": 1.8616891226892944,
"learning_rate": 2.4710632992693737e-06,
"loss": 0.0664,
"step": 523
},
{
"epoch": 1.340153452685422,
"grad_norm": 1.522818582149643,
"learning_rate": 2.4537556163640726e-06,
"loss": 0.0642,
"step": 524
},
{
"epoch": 1.3427109974424551,
"grad_norm": 1.9564770020434565,
"learning_rate": 2.4364890281516633e-06,
"loss": 0.0815,
"step": 525
},
{
"epoch": 1.3452685421994885,
"grad_norm": 1.1715877168427582,
"learning_rate": 2.4192638133034074e-06,
"loss": 0.0435,
"step": 526
},
{
"epoch": 1.3478260869565217,
"grad_norm": 1.3873917483817966,
"learning_rate": 2.4020802498228333e-06,
"loss": 0.0573,
"step": 527
},
{
"epoch": 1.350383631713555,
"grad_norm": 1.0503749788006405,
"learning_rate": 2.384938615041238e-06,
"loss": 0.0497,
"step": 528
},
{
"epoch": 1.3529411764705883,
"grad_norm": 1.7078053655214958,
"learning_rate": 2.3678391856132203e-06,
"loss": 0.0658,
"step": 529
},
{
"epoch": 1.3554987212276215,
"grad_norm": 1.233203753875264,
"learning_rate": 2.350782237512215e-06,
"loss": 0.0471,
"step": 530
},
{
"epoch": 1.3580562659846547,
"grad_norm": 1.5492283920850878,
"learning_rate": 2.3337680460260314e-06,
"loss": 0.0606,
"step": 531
},
{
"epoch": 1.3606138107416879,
"grad_norm": 2.42405809855206,
"learning_rate": 2.316796885752415e-06,
"loss": 0.108,
"step": 532
},
{
"epoch": 1.3631713554987213,
"grad_norm": 2.0245811204612516,
"learning_rate": 2.299869030594622e-06,
"loss": 0.1002,
"step": 533
},
{
"epoch": 1.3657289002557544,
"grad_norm": 1.0435760500681248,
"learning_rate": 2.2829847537569904e-06,
"loss": 0.0351,
"step": 534
},
{
"epoch": 1.3682864450127878,
"grad_norm": 2.2546431033682985,
"learning_rate": 2.266144327740531e-06,
"loss": 0.0926,
"step": 535
},
{
"epoch": 1.370843989769821,
"grad_norm": 2.155804482532426,
"learning_rate": 2.2493480243385298e-06,
"loss": 0.0832,
"step": 536
},
{
"epoch": 1.3734015345268542,
"grad_norm": 1.7587243734253648,
"learning_rate": 2.2325961146321683e-06,
"loss": 0.0744,
"step": 537
},
{
"epoch": 1.3759590792838874,
"grad_norm": 1.9603939445396372,
"learning_rate": 2.2158888689861434e-06,
"loss": 0.0742,
"step": 538
},
{
"epoch": 1.3785166240409208,
"grad_norm": 1.473096805809632,
"learning_rate": 2.1992265570442974e-06,
"loss": 0.0706,
"step": 539
},
{
"epoch": 1.381074168797954,
"grad_norm": 1.8442896619919424,
"learning_rate": 2.182609447725279e-06,
"loss": 0.0885,
"step": 540
},
{
"epoch": 1.3836317135549872,
"grad_norm": 1.8191363677733596,
"learning_rate": 2.1660378092181935e-06,
"loss": 0.074,
"step": 541
},
{
"epoch": 1.3861892583120206,
"grad_norm": 1.9074363059029638,
"learning_rate": 2.149511908978275e-06,
"loss": 0.0746,
"step": 542
},
{
"epoch": 1.3887468030690537,
"grad_norm": 1.910812757836058,
"learning_rate": 2.1330320137225773e-06,
"loss": 0.0956,
"step": 543
},
{
"epoch": 1.391304347826087,
"grad_norm": 1.8198118504923657,
"learning_rate": 2.1165983894256647e-06,
"loss": 0.091,
"step": 544
},
{
"epoch": 1.39386189258312,
"grad_norm": 1.771534594356425,
"learning_rate": 2.100211301315315e-06,
"loss": 0.0624,
"step": 545
},
{
"epoch": 1.3964194373401535,
"grad_norm": 1.5960742782667032,
"learning_rate": 2.0838710138682412e-06,
"loss": 0.0649,
"step": 546
},
{
"epoch": 1.3989769820971867,
"grad_norm": 1.4352565996218818,
"learning_rate": 2.0675777908058307e-06,
"loss": 0.0651,
"step": 547
},
{
"epoch": 1.40153452685422,
"grad_norm": 1.8453025336140658,
"learning_rate": 2.051331895089882e-06,
"loss": 0.0856,
"step": 548
},
{
"epoch": 1.4040920716112533,
"grad_norm": 1.7879301257977294,
"learning_rate": 2.035133588918356e-06,
"loss": 0.0564,
"step": 549
},
{
"epoch": 1.4066496163682864,
"grad_norm": 1.055074940010279,
"learning_rate": 2.0189831337211573e-06,
"loss": 0.0349,
"step": 550
},
{
"epoch": 1.4092071611253196,
"grad_norm": 1.7438163522982681,
"learning_rate": 2.0028807901559027e-06,
"loss": 0.0776,
"step": 551
},
{
"epoch": 1.4117647058823528,
"grad_norm": 1.5139546840336329,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.0754,
"step": 552
},
{
"epoch": 1.4143222506393862,
"grad_norm": 1.79660968193301,
"learning_rate": 1.970821476665051e-06,
"loss": 0.0693,
"step": 553
},
{
"epoch": 1.4168797953964194,
"grad_norm": 1.396783128713414,
"learning_rate": 1.9548650241554812e-06,
"loss": 0.0559,
"step": 554
},
{
"epoch": 1.4194373401534528,
"grad_norm": 1.6817944787104224,
"learning_rate": 1.9389577181015496e-06,
"loss": 0.0532,
"step": 555
},
{
"epoch": 1.421994884910486,
"grad_norm": 1.3959759855353526,
"learning_rate": 1.923099815236608e-06,
"loss": 0.053,
"step": 556
},
{
"epoch": 1.4245524296675192,
"grad_norm": 1.266582979888415,
"learning_rate": 1.9072915714966761e-06,
"loss": 0.048,
"step": 557
},
{
"epoch": 1.4271099744245523,
"grad_norm": 1.8765331165368473,
"learning_rate": 1.8915332420163074e-06,
"loss": 0.0633,
"step": 558
},
{
"epoch": 1.4296675191815857,
"grad_norm": 1.2977035796306693,
"learning_rate": 1.8758250811244682e-06,
"loss": 0.0509,
"step": 559
},
{
"epoch": 1.432225063938619,
"grad_norm": 1.4167349873272226,
"learning_rate": 1.8601673423404449e-06,
"loss": 0.0645,
"step": 560
},
{
"epoch": 1.434782608695652,
"grad_norm": 1.8235368661900502,
"learning_rate": 1.8445602783697375e-06,
"loss": 0.0668,
"step": 561
},
{
"epoch": 1.4373401534526855,
"grad_norm": 1.5509277107583779,
"learning_rate": 1.8290041410999893e-06,
"loss": 0.077,
"step": 562
},
{
"epoch": 1.4398976982097187,
"grad_norm": 1.5375878611055105,
"learning_rate": 1.8134991815969238e-06,
"loss": 0.0673,
"step": 563
},
{
"epoch": 1.4424552429667519,
"grad_norm": 1.9051540404989096,
"learning_rate": 1.798045650100289e-06,
"loss": 0.0709,
"step": 564
},
{
"epoch": 1.445012787723785,
"grad_norm": 2.141394685521614,
"learning_rate": 1.782643796019814e-06,
"loss": 0.072,
"step": 565
},
{
"epoch": 1.4475703324808185,
"grad_norm": 1.7999776365108109,
"learning_rate": 1.7672938679311957e-06,
"loss": 0.0764,
"step": 566
},
{
"epoch": 1.4501278772378516,
"grad_norm": 1.8863953126395598,
"learning_rate": 1.7519961135720737e-06,
"loss": 0.0754,
"step": 567
},
{
"epoch": 1.452685421994885,
"grad_norm": 1.7560519814030673,
"learning_rate": 1.736750779838044e-06,
"loss": 0.0712,
"step": 568
},
{
"epoch": 1.4552429667519182,
"grad_norm": 1.5334124499018023,
"learning_rate": 1.7215581127786624e-06,
"loss": 0.0639,
"step": 569
},
{
"epoch": 1.4578005115089514,
"grad_norm": 1.364571623755127,
"learning_rate": 1.7064183575934856e-06,
"loss": 0.0598,
"step": 570
},
{
"epoch": 1.4603580562659846,
"grad_norm": 1.358357570743382,
"learning_rate": 1.6913317586281048e-06,
"loss": 0.0587,
"step": 571
},
{
"epoch": 1.4629156010230178,
"grad_norm": 2.2006102667445413,
"learning_rate": 1.676298559370202e-06,
"loss": 0.098,
"step": 572
},
{
"epoch": 1.4654731457800512,
"grad_norm": 1.885231575108773,
"learning_rate": 1.6613190024456293e-06,
"loss": 0.0756,
"step": 573
},
{
"epoch": 1.4680306905370843,
"grad_norm": 1.8937798233386698,
"learning_rate": 1.6463933296144863e-06,
"loss": 0.0645,
"step": 574
},
{
"epoch": 1.4705882352941178,
"grad_norm": 2.079267842097328,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.0794,
"step": 575
},
{
"epoch": 1.473145780051151,
"grad_norm": 2.0372550296739087,
"learning_rate": 1.6167045989207185e-06,
"loss": 0.0877,
"step": 576
},
{
"epoch": 1.4757033248081841,
"grad_norm": 1.6766306067310754,
"learning_rate": 1.6019420202144853e-06,
"loss": 0.0403,
"step": 577
},
{
"epoch": 1.4782608695652173,
"grad_norm": 1.6282511096736187,
"learning_rate": 1.5872342839067305e-06,
"loss": 0.0533,
"step": 578
},
{
"epoch": 1.4808184143222507,
"grad_norm": 1.3859280744992961,
"learning_rate": 1.5725816273705453e-06,
"loss": 0.0578,
"step": 579
},
{
"epoch": 1.4833759590792839,
"grad_norm": 1.1143022475391577,
"learning_rate": 1.5579842870900746e-06,
"loss": 0.0376,
"step": 580
},
{
"epoch": 1.485933503836317,
"grad_norm": 1.529711981151928,
"learning_rate": 1.5434424986566938e-06,
"loss": 0.0953,
"step": 581
},
{
"epoch": 1.4884910485933505,
"grad_norm": 1.27023062886774,
"learning_rate": 1.5289564967652033e-06,
"loss": 0.0456,
"step": 582
},
{
"epoch": 1.4910485933503836,
"grad_norm": 1.6016588716041684,
"learning_rate": 1.5145265152100574e-06,
"loss": 0.0695,
"step": 583
},
{
"epoch": 1.4936061381074168,
"grad_norm": 1.7027552802851214,
"learning_rate": 1.5001527868815702e-06,
"loss": 0.075,
"step": 584
},
{
"epoch": 1.49616368286445,
"grad_norm": 2.087105079353017,
"learning_rate": 1.4858355437621663e-06,
"loss": 0.0843,
"step": 585
},
{
"epoch": 1.4987212276214834,
"grad_norm": 1.5515159664184583,
"learning_rate": 1.4715750169226417e-06,
"loss": 0.0746,
"step": 586
},
{
"epoch": 1.5012787723785166,
"grad_norm": 1.6199284724546499,
"learning_rate": 1.457371436518424e-06,
"loss": 0.06,
"step": 587
},
{
"epoch": 1.50383631713555,
"grad_norm": 1.5277517404992678,
"learning_rate": 1.4432250317858675e-06,
"loss": 0.0729,
"step": 588
},
{
"epoch": 1.5063938618925832,
"grad_norm": 2.3550500012049733,
"learning_rate": 1.4291360310385455e-06,
"loss": 0.0633,
"step": 589
},
{
"epoch": 1.5089514066496164,
"grad_norm": 1.7295044529207286,
"learning_rate": 1.4151046616635727e-06,
"loss": 0.0671,
"step": 590
},
{
"epoch": 1.5115089514066495,
"grad_norm": 1.894949015332874,
"learning_rate": 1.4011311501179287e-06,
"loss": 0.0804,
"step": 591
},
{
"epoch": 1.5140664961636827,
"grad_norm": 2.121021767643575,
"learning_rate": 1.3872157219248045e-06,
"loss": 0.0717,
"step": 592
},
{
"epoch": 1.5166240409207161,
"grad_norm": 1.6430284408399045,
"learning_rate": 1.373358601669973e-06,
"loss": 0.053,
"step": 593
},
{
"epoch": 1.5191815856777495,
"grad_norm": 1.526803613192238,
"learning_rate": 1.3595600129981469e-06,
"loss": 0.0638,
"step": 594
},
{
"epoch": 1.5217391304347827,
"grad_norm": 1.0212528309323259,
"learning_rate": 1.3458201786093795e-06,
"loss": 0.0416,
"step": 595
},
{
"epoch": 1.5242966751918159,
"grad_norm": 1.9219415286349144,
"learning_rate": 1.3321393202554739e-06,
"loss": 0.0834,
"step": 596
},
{
"epoch": 1.526854219948849,
"grad_norm": 2.080278099746331,
"learning_rate": 1.3185176587363919e-06,
"loss": 0.0882,
"step": 597
},
{
"epoch": 1.5294117647058822,
"grad_norm": 1.7816231414266257,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.0582,
"step": 598
},
{
"epoch": 1.5319693094629157,
"grad_norm": 1.889256583497343,
"learning_rate": 1.2914528046220332e-06,
"loss": 0.0695,
"step": 599
},
{
"epoch": 1.5345268542199488,
"grad_norm": 1.1550477281276221,
"learning_rate": 1.278010048835523e-06,
"loss": 0.0333,
"step": 600
},
{
"epoch": 1.5370843989769822,
"grad_norm": 1.4939337260656969,
"learning_rate": 1.2646273634943195e-06,
"loss": 0.0725,
"step": 601
},
{
"epoch": 1.5396419437340154,
"grad_norm": 1.4169987914491964,
"learning_rate": 1.2513049645860759e-06,
"loss": 0.0412,
"step": 602
},
{
"epoch": 1.5421994884910486,
"grad_norm": 1.7381273646988493,
"learning_rate": 1.2380430671254618e-06,
"loss": 0.0604,
"step": 603
},
{
"epoch": 1.5447570332480818,
"grad_norm": 1.606050132274542,
"learning_rate": 1.224841885150691e-06,
"loss": 0.0654,
"step": 604
},
{
"epoch": 1.547314578005115,
"grad_norm": 1.9353777885376064,
"learning_rate": 1.2117016317200702e-06,
"loss": 0.0999,
"step": 605
},
{
"epoch": 1.5498721227621484,
"grad_norm": 1.4204697537593365,
"learning_rate": 1.1986225189085627e-06,
"loss": 0.0503,
"step": 606
},
{
"epoch": 1.5524296675191815,
"grad_norm": 1.7779994619812158,
"learning_rate": 1.185604757804359e-06,
"loss": 0.0648,
"step": 607
},
{
"epoch": 1.554987212276215,
"grad_norm": 1.8729532913873508,
"learning_rate": 1.172648558505477e-06,
"loss": 0.0621,
"step": 608
},
{
"epoch": 1.5575447570332481,
"grad_norm": 1.5939978689407244,
"learning_rate": 1.1597541301163655e-06,
"loss": 0.0616,
"step": 609
},
{
"epoch": 1.5601023017902813,
"grad_norm": 1.6805775276124237,
"learning_rate": 1.1469216807445348e-06,
"loss": 0.0584,
"step": 610
},
{
"epoch": 1.5626598465473145,
"grad_norm": 1.8194848984443412,
"learning_rate": 1.1341514174971907e-06,
"loss": 0.0632,
"step": 611
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.6220328579883132,
"learning_rate": 1.1214435464779006e-06,
"loss": 0.0706,
"step": 612
},
{
"epoch": 1.567774936061381,
"grad_norm": 1.9488536146970024,
"learning_rate": 1.1087982727832613e-06,
"loss": 0.1119,
"step": 613
},
{
"epoch": 1.5703324808184145,
"grad_norm": 2.2215510632895907,
"learning_rate": 1.0962158004995893e-06,
"loss": 0.097,
"step": 614
},
{
"epoch": 1.5728900255754477,
"grad_norm": 1.3539194267902872,
"learning_rate": 1.083696332699628e-06,
"loss": 0.0474,
"step": 615
},
{
"epoch": 1.5754475703324808,
"grad_norm": 1.0447972180103597,
"learning_rate": 1.0712400714392723e-06,
"loss": 0.0349,
"step": 616
},
{
"epoch": 1.578005115089514,
"grad_norm": 1.7820783836872298,
"learning_rate": 1.058847217754303e-06,
"loss": 0.0574,
"step": 617
},
{
"epoch": 1.5805626598465472,
"grad_norm": 1.2726641899295212,
"learning_rate": 1.0465179716571467e-06,
"loss": 0.0391,
"step": 618
},
{
"epoch": 1.5831202046035806,
"grad_norm": 2.1713194520674297,
"learning_rate": 1.034252532133646e-06,
"loss": 0.0564,
"step": 619
},
{
"epoch": 1.5856777493606138,
"grad_norm": 1.3422351008724465,
"learning_rate": 1.0220510971398473e-06,
"loss": 0.0582,
"step": 620
},
{
"epoch": 1.5882352941176472,
"grad_norm": 1.9326774960679987,
"learning_rate": 1.0099138635988026e-06,
"loss": 0.0679,
"step": 621
},
{
"epoch": 1.5907928388746804,
"grad_norm": 1.4277862321292778,
"learning_rate": 9.978410273974015e-07,
"loss": 0.0578,
"step": 622
},
{
"epoch": 1.5933503836317136,
"grad_norm": 1.84261949025278,
"learning_rate": 9.858327833832004e-07,
"loss": 0.0638,
"step": 623
},
{
"epoch": 1.5959079283887467,
"grad_norm": 1.5750246594688215,
"learning_rate": 9.738893253612808e-07,
"loss": 0.0646,
"step": 624
},
{
"epoch": 1.59846547314578,
"grad_norm": 1.7918813630008548,
"learning_rate": 9.620108460911181e-07,
"loss": 0.0575,
"step": 625
},
{
"epoch": 1.6010230179028133,
"grad_norm": 1.582528596444316,
"learning_rate": 9.50197537283481e-07,
"loss": 0.0643,
"step": 626
},
{
"epoch": 1.6035805626598465,
"grad_norm": 1.858978097610581,
"learning_rate": 9.384495895973227e-07,
"loss": 0.0775,
"step": 627
},
{
"epoch": 1.60613810741688,
"grad_norm": 1.9376171131571154,
"learning_rate": 9.267671926367166e-07,
"loss": 0.0682,
"step": 628
},
{
"epoch": 1.608695652173913,
"grad_norm": 1.6529180966681518,
"learning_rate": 9.151505349477901e-07,
"loss": 0.0749,
"step": 629
},
{
"epoch": 1.6112531969309463,
"grad_norm": 1.545032973496891,
"learning_rate": 9.035998040156801e-07,
"loss": 0.0653,
"step": 630
},
{
"epoch": 1.6138107416879794,
"grad_norm": 1.4873821155805664,
"learning_rate": 8.921151862615091e-07,
"loss": 0.0522,
"step": 631
},
{
"epoch": 1.6163682864450126,
"grad_norm": 1.9828594885163102,
"learning_rate": 8.806968670393801e-07,
"loss": 0.0881,
"step": 632
},
{
"epoch": 1.618925831202046,
"grad_norm": 1.5428881405958734,
"learning_rate": 8.693450306333818e-07,
"loss": 0.0596,
"step": 633
},
{
"epoch": 1.6214833759590794,
"grad_norm": 1.9526142345186388,
"learning_rate": 8.580598602546109e-07,
"loss": 0.0713,
"step": 634
},
{
"epoch": 1.6240409207161126,
"grad_norm": 1.2159178790910155,
"learning_rate": 8.4684153803822e-07,
"loss": 0.0437,
"step": 635
},
{
"epoch": 1.6265984654731458,
"grad_norm": 1.5525955368175755,
"learning_rate": 8.356902450404792e-07,
"loss": 0.0553,
"step": 636
},
{
"epoch": 1.629156010230179,
"grad_norm": 1.4526232993831398,
"learning_rate": 8.246061612358475e-07,
"loss": 0.0607,
"step": 637
},
{
"epoch": 1.6317135549872122,
"grad_norm": 1.623054202144306,
"learning_rate": 8.135894655140758e-07,
"loss": 0.0614,
"step": 638
},
{
"epoch": 1.6342710997442456,
"grad_norm": 2.020789659063834,
"learning_rate": 8.026403356773161e-07,
"loss": 0.0662,
"step": 639
},
{
"epoch": 1.6368286445012787,
"grad_norm": 2.0131654709879756,
"learning_rate": 7.91758948437249e-07,
"loss": 0.0641,
"step": 640
},
{
"epoch": 1.6393861892583121,
"grad_norm": 1.5241049366793433,
"learning_rate": 7.809454794122346e-07,
"loss": 0.0676,
"step": 641
},
{
"epoch": 1.6419437340153453,
"grad_norm": 1.5072894320854298,
"learning_rate": 7.702001031244816e-07,
"loss": 0.0613,
"step": 642
},
{
"epoch": 1.6445012787723785,
"grad_norm": 1.3149510012931023,
"learning_rate": 7.595229929972253e-07,
"loss": 0.066,
"step": 643
},
{
"epoch": 1.6470588235294117,
"grad_norm": 1.354258203231293,
"learning_rate": 7.489143213519301e-07,
"loss": 0.0409,
"step": 644
},
{
"epoch": 1.6496163682864449,
"grad_norm": 1.394303666622696,
"learning_rate": 7.383742594055077e-07,
"loss": 0.0616,
"step": 645
},
{
"epoch": 1.6521739130434783,
"grad_norm": 1.9374972519378002,
"learning_rate": 7.279029772675572e-07,
"loss": 0.082,
"step": 646
},
{
"epoch": 1.6547314578005117,
"grad_norm": 1.6243040582047277,
"learning_rate": 7.17500643937617e-07,
"loss": 0.0498,
"step": 647
},
{
"epoch": 1.6572890025575449,
"grad_norm": 1.6392303232363044,
"learning_rate": 7.071674273024353e-07,
"loss": 0.0634,
"step": 648
},
{
"epoch": 1.659846547314578,
"grad_norm": 2.139421270837991,
"learning_rate": 6.969034941332664e-07,
"loss": 0.1057,
"step": 649
},
{
"epoch": 1.6624040920716112,
"grad_norm": 1.3581405578159422,
"learning_rate": 6.86709010083172e-07,
"loss": 0.0392,
"step": 650
},
{
"epoch": 1.6649616368286444,
"grad_norm": 1.675037519291502,
"learning_rate": 6.765841396843514e-07,
"loss": 0.0526,
"step": 651
},
{
"epoch": 1.6675191815856778,
"grad_norm": 2.0592421439000135,
"learning_rate": 6.665290463454882e-07,
"loss": 0.0733,
"step": 652
},
{
"epoch": 1.670076726342711,
"grad_norm": 2.3725190689778834,
"learning_rate": 6.565438923491102e-07,
"loss": 0.085,
"step": 653
},
{
"epoch": 1.6726342710997444,
"grad_norm": 1.5665070937066485,
"learning_rate": 6.466288388489689e-07,
"loss": 0.0741,
"step": 654
},
{
"epoch": 1.6751918158567776,
"grad_norm": 1.5211024774946007,
"learning_rate": 6.367840458674401e-07,
"loss": 0.0662,
"step": 655
},
{
"epoch": 1.6777493606138107,
"grad_norm": 1.4224852306974154,
"learning_rate": 6.270096722929442e-07,
"loss": 0.0465,
"step": 656
},
{
"epoch": 1.680306905370844,
"grad_norm": 1.6096574471315697,
"learning_rate": 6.173058758773775e-07,
"loss": 0.0635,
"step": 657
},
{
"epoch": 1.682864450127877,
"grad_norm": 1.7328118872108653,
"learning_rate": 6.076728132335669e-07,
"loss": 0.0536,
"step": 658
},
{
"epoch": 1.6854219948849105,
"grad_norm": 2.0081885289199004,
"learning_rate": 5.981106398327463e-07,
"loss": 0.0762,
"step": 659
},
{
"epoch": 1.6879795396419437,
"grad_norm": 1.7928585732400393,
"learning_rate": 5.886195100020408e-07,
"loss": 0.0775,
"step": 660
},
{
"epoch": 1.690537084398977,
"grad_norm": 1.7653788413306644,
"learning_rate": 5.7919957692198e-07,
"loss": 0.0553,
"step": 661
},
{
"epoch": 1.6930946291560103,
"grad_norm": 1.4481488430178222,
"learning_rate": 5.698509926240275e-07,
"loss": 0.0647,
"step": 662
},
{
"epoch": 1.6956521739130435,
"grad_norm": 1.7839607218624998,
"learning_rate": 5.60573907988124e-07,
"loss": 0.0784,
"step": 663
},
{
"epoch": 1.6982097186700766,
"grad_norm": 1.3127529412098409,
"learning_rate": 5.513684727402529e-07,
"loss": 0.0444,
"step": 664
},
{
"epoch": 1.7007672634271098,
"grad_norm": 1.3869543812328162,
"learning_rate": 5.422348354500217e-07,
"loss": 0.0516,
"step": 665
},
{
"epoch": 1.7033248081841432,
"grad_norm": 1.6005455194774372,
"learning_rate": 5.331731435282705e-07,
"loss": 0.0488,
"step": 666
},
{
"epoch": 1.7058823529411766,
"grad_norm": 1.8517881383130126,
"learning_rate": 5.241835432246888e-07,
"loss": 0.0851,
"step": 667
},
{
"epoch": 1.7084398976982098,
"grad_norm": 1.4850618619837235,
"learning_rate": 5.152661796254505e-07,
"loss": 0.0447,
"step": 668
},
{
"epoch": 1.710997442455243,
"grad_norm": 2.1553630250764884,
"learning_rate": 5.064211966508837e-07,
"loss": 0.0595,
"step": 669
},
{
"epoch": 1.7135549872122762,
"grad_norm": 1.5588219606375748,
"learning_rate": 4.976487370531352e-07,
"loss": 0.0621,
"step": 670
},
{
"epoch": 1.7161125319693094,
"grad_norm": 2.089144802560461,
"learning_rate": 4.88948942413876e-07,
"loss": 0.0858,
"step": 671
},
{
"epoch": 1.7186700767263428,
"grad_norm": 1.7907366729470824,
"learning_rate": 4.803219531420128e-07,
"loss": 0.0585,
"step": 672
},
{
"epoch": 1.721227621483376,
"grad_norm": 1.6065952810269382,
"learning_rate": 4.717679084714222e-07,
"loss": 0.0469,
"step": 673
},
{
"epoch": 1.7237851662404093,
"grad_norm": 1.391081341252637,
"learning_rate": 4.6328694645870254e-07,
"loss": 0.0621,
"step": 674
},
{
"epoch": 1.7263427109974425,
"grad_norm": 1.7591705981665215,
"learning_rate": 4.5487920398094465e-07,
"loss": 0.0688,
"step": 675
},
{
"epoch": 1.7289002557544757,
"grad_norm": 1.3688117618719597,
"learning_rate": 4.46544816733529e-07,
"loss": 0.0516,
"step": 676
},
{
"epoch": 1.7314578005115089,
"grad_norm": 1.5369125245891289,
"learning_rate": 4.382839192279303e-07,
"loss": 0.068,
"step": 677
},
{
"epoch": 1.734015345268542,
"grad_norm": 1.4485510331681306,
"learning_rate": 4.3009664478954384e-07,
"loss": 0.0438,
"step": 678
},
{
"epoch": 1.7365728900255755,
"grad_norm": 1.9885788988160304,
"learning_rate": 4.219831255555423e-07,
"loss": 0.0673,
"step": 679
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.3795869342736704,
"learning_rate": 4.139434924727359e-07,
"loss": 0.0442,
"step": 680
},
{
"epoch": 1.741687979539642,
"grad_norm": 1.8321446740987333,
"learning_rate": 4.059778752954607e-07,
"loss": 0.064,
"step": 681
},
{
"epoch": 1.7442455242966752,
"grad_norm": 1.6108426141916128,
"learning_rate": 3.9808640258348686e-07,
"loss": 0.0561,
"step": 682
},
{
"epoch": 1.7468030690537084,
"grad_norm": 1.5359072620188055,
"learning_rate": 3.9026920169994374e-07,
"loss": 0.0675,
"step": 683
},
{
"epoch": 1.7493606138107416,
"grad_norm": 1.7008119887557362,
"learning_rate": 3.825263988092587e-07,
"loss": 0.0671,
"step": 684
},
{
"epoch": 1.7519181585677748,
"grad_norm": 1.471165980148968,
"learning_rate": 3.7485811887512714e-07,
"loss": 0.0607,
"step": 685
},
{
"epoch": 1.7544757033248082,
"grad_norm": 1.4713436991190307,
"learning_rate": 3.672644856584928e-07,
"loss": 0.0593,
"step": 686
},
{
"epoch": 1.7570332480818416,
"grad_norm": 1.8321987965717021,
"learning_rate": 3.597456217155526e-07,
"loss": 0.0503,
"step": 687
},
{
"epoch": 1.7595907928388748,
"grad_norm": 1.4654951355310737,
"learning_rate": 3.523016483957742e-07,
"loss": 0.0661,
"step": 688
},
{
"epoch": 1.762148337595908,
"grad_norm": 1.6318489446677316,
"learning_rate": 3.4493268583994434e-07,
"loss": 0.0595,
"step": 689
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.2394295437052634,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.0282,
"step": 690
},
{
"epoch": 1.7672634271099743,
"grad_norm": 1.7143312871829341,
"learning_rate": 3.3042026752822254e-07,
"loss": 0.0728,
"step": 691
},
{
"epoch": 1.7698209718670077,
"grad_norm": 1.8174618944743401,
"learning_rate": 3.2327704599312283e-07,
"loss": 0.0644,
"step": 692
},
{
"epoch": 1.772378516624041,
"grad_norm": 1.6070849800659126,
"learning_rate": 3.16209303659773e-07,
"loss": 0.0597,
"step": 693
},
{
"epoch": 1.7749360613810743,
"grad_norm": 1.9286508631962667,
"learning_rate": 3.0921715459683753e-07,
"loss": 0.0502,
"step": 694
},
{
"epoch": 1.7774936061381075,
"grad_norm": 1.5349318887338477,
"learning_rate": 3.0230071165295804e-07,
"loss": 0.0456,
"step": 695
},
{
"epoch": 1.7800511508951407,
"grad_norm": 1.4901091372165034,
"learning_rate": 2.95460086454929e-07,
"loss": 0.0528,
"step": 696
},
{
"epoch": 1.7826086956521738,
"grad_norm": 1.918618234904437,
"learning_rate": 2.88695389405898e-07,
"loss": 0.0784,
"step": 697
},
{
"epoch": 1.785166240409207,
"grad_norm": 1.1882113006459234,
"learning_rate": 2.820067296835799e-07,
"loss": 0.0445,
"step": 698
},
{
"epoch": 1.7877237851662404,
"grad_norm": 1.9596226833895882,
"learning_rate": 2.753942152385014e-07,
"loss": 0.0688,
"step": 699
},
{
"epoch": 1.7902813299232738,
"grad_norm": 1.4807925998658464,
"learning_rate": 2.688579527922514e-07,
"loss": 0.0402,
"step": 700
},
{
"epoch": 1.792838874680307,
"grad_norm": 1.5494471130294227,
"learning_rate": 2.6239804783576294e-07,
"loss": 0.0543,
"step": 701
},
{
"epoch": 1.7953964194373402,
"grad_norm": 1.710830595848272,
"learning_rate": 2.560146046276135e-07,
"loss": 0.078,
"step": 702
},
{
"epoch": 1.7979539641943734,
"grad_norm": 1.6103330225310404,
"learning_rate": 2.4970772619233475e-07,
"loss": 0.0609,
"step": 703
},
{
"epoch": 1.8005115089514065,
"grad_norm": 2.1596511568230294,
"learning_rate": 2.4347751431875453e-07,
"loss": 0.0849,
"step": 704
},
{
"epoch": 1.80306905370844,
"grad_norm": 1.6435842864771086,
"learning_rate": 2.373240695583534e-07,
"loss": 0.0789,
"step": 705
},
{
"epoch": 1.8056265984654731,
"grad_norm": 1.5677270813395774,
"learning_rate": 2.3124749122364286e-07,
"loss": 0.0664,
"step": 706
},
{
"epoch": 1.8081841432225065,
"grad_norm": 1.710908609250228,
"learning_rate": 2.2524787738656073e-07,
"loss": 0.0554,
"step": 707
},
{
"epoch": 1.8107416879795397,
"grad_norm": 1.6048364103546928,
"learning_rate": 2.1932532487688784e-07,
"loss": 0.0532,
"step": 708
},
{
"epoch": 1.813299232736573,
"grad_norm": 1.4644455792556497,
"learning_rate": 2.1347992928068884e-07,
"loss": 0.0603,
"step": 709
},
{
"epoch": 1.815856777493606,
"grad_norm": 1.0091446695350044,
"learning_rate": 2.0771178493876387e-07,
"loss": 0.0313,
"step": 710
},
{
"epoch": 1.8184143222506393,
"grad_norm": 1.987324391957735,
"learning_rate": 2.0202098494513157e-07,
"loss": 0.0747,
"step": 711
},
{
"epoch": 1.8209718670076727,
"grad_norm": 1.5012232578184002,
"learning_rate": 1.964076211455246e-07,
"loss": 0.0491,
"step": 712
},
{
"epoch": 1.8235294117647058,
"grad_norm": 1.6839252722578555,
"learning_rate": 1.908717841359048e-07,
"loss": 0.065,
"step": 713
},
{
"epoch": 1.8260869565217392,
"grad_norm": 1.764351815514701,
"learning_rate": 1.8541356326100436e-07,
"loss": 0.049,
"step": 714
},
{
"epoch": 1.8286445012787724,
"grad_norm": 1.9346780735975497,
"learning_rate": 1.800330466128808e-07,
"loss": 0.0635,
"step": 715
},
{
"epoch": 1.8312020460358056,
"grad_norm": 1.1857733970794198,
"learning_rate": 1.7473032102949983e-07,
"loss": 0.0427,
"step": 716
},
{
"epoch": 1.8337595907928388,
"grad_norm": 2.126736518256365,
"learning_rate": 1.695054720933309e-07,
"loss": 0.0884,
"step": 717
},
{
"epoch": 1.836317135549872,
"grad_norm": 1.5012079730873682,
"learning_rate": 1.6435858412996275e-07,
"loss": 0.0525,
"step": 718
},
{
"epoch": 1.8388746803069054,
"grad_norm": 1.9920935892905687,
"learning_rate": 1.5928974020674947e-07,
"loss": 0.0633,
"step": 719
},
{
"epoch": 1.8414322250639388,
"grad_norm": 1.4935126462129222,
"learning_rate": 1.542990221314644e-07,
"loss": 0.0416,
"step": 720
},
{
"epoch": 1.843989769820972,
"grad_norm": 1.5667962682302419,
"learning_rate": 1.4938651045098174e-07,
"loss": 0.0401,
"step": 721
},
{
"epoch": 1.8465473145780051,
"grad_norm": 2.3165346207133957,
"learning_rate": 1.445522844499775e-07,
"loss": 0.0555,
"step": 722
},
{
"epoch": 1.8491048593350383,
"grad_norm": 1.9816583062122102,
"learning_rate": 1.3979642214964728e-07,
"loss": 0.0648,
"step": 723
},
{
"epoch": 1.8516624040920715,
"grad_norm": 2.4975848569979355,
"learning_rate": 1.3511900030644954e-07,
"loss": 0.0852,
"step": 724
},
{
"epoch": 1.854219948849105,
"grad_norm": 2.2176696568855374,
"learning_rate": 1.3052009441086533e-07,
"loss": 0.0691,
"step": 725
},
{
"epoch": 1.856777493606138,
"grad_norm": 1.9148609494936542,
"learning_rate": 1.2599977868618052e-07,
"loss": 0.0518,
"step": 726
},
{
"epoch": 1.8593350383631715,
"grad_norm": 2.056805786998454,
"learning_rate": 1.215581260872889e-07,
"loss": 0.0703,
"step": 727
},
{
"epoch": 1.8618925831202047,
"grad_norm": 1.156766803459366,
"learning_rate": 1.1719520829951203e-07,
"loss": 0.0393,
"step": 728
},
{
"epoch": 1.8644501278772379,
"grad_norm": 1.1820320080282383,
"learning_rate": 1.1291109573744574e-07,
"loss": 0.0336,
"step": 729
},
{
"epoch": 1.867007672634271,
"grad_norm": 1.7806788218166225,
"learning_rate": 1.087058575438199e-07,
"loss": 0.0737,
"step": 730
},
{
"epoch": 1.8695652173913042,
"grad_norm": 1.5667661177865002,
"learning_rate": 1.0457956158838545e-07,
"loss": 0.0625,
"step": 731
},
{
"epoch": 1.8721227621483376,
"grad_norm": 1.186958803626775,
"learning_rate": 1.0053227446681912e-07,
"loss": 0.0496,
"step": 732
},
{
"epoch": 1.8746803069053708,
"grad_norm": 1.8521104628392622,
"learning_rate": 9.656406149964548e-08,
"loss": 0.0756,
"step": 733
},
{
"epoch": 1.8772378516624042,
"grad_norm": 1.5291622162529863,
"learning_rate": 9.267498673118547e-08,
"loss": 0.0609,
"step": 734
},
{
"epoch": 1.8797953964194374,
"grad_norm": 1.5425572563924586,
"learning_rate": 8.886511292852395e-08,
"loss": 0.0629,
"step": 735
},
{
"epoch": 1.8823529411764706,
"grad_norm": 1.823405533973878,
"learning_rate": 8.513450158049109e-08,
"loss": 0.0627,
"step": 736
},
{
"epoch": 1.8849104859335037,
"grad_norm": 1.7161639545427452,
"learning_rate": 8.148321289667749e-08,
"loss": 0.0572,
"step": 737
},
{
"epoch": 1.887468030690537,
"grad_norm": 1.2571507117703948,
"learning_rate": 7.791130580645623e-08,
"loss": 0.0485,
"step": 738
},
{
"epoch": 1.8900255754475703,
"grad_norm": 1.575736071352483,
"learning_rate": 7.441883795803462e-08,
"loss": 0.0572,
"step": 739
},
{
"epoch": 1.8925831202046037,
"grad_norm": 1.187272056902486,
"learning_rate": 7.100586571752444e-08,
"loss": 0.0392,
"step": 740
},
{
"epoch": 1.895140664961637,
"grad_norm": 1.2982831651687614,
"learning_rate": 6.767244416802988e-08,
"loss": 0.0482,
"step": 741
},
{
"epoch": 1.89769820971867,
"grad_norm": 1.5142498294577111,
"learning_rate": 6.441862710876102e-08,
"loss": 0.0589,
"step": 742
},
{
"epoch": 1.9002557544757033,
"grad_norm": 1.5225022048090735,
"learning_rate": 6.124446705416343e-08,
"loss": 0.0385,
"step": 743
},
{
"epoch": 1.9028132992327365,
"grad_norm": 1.485678760379258,
"learning_rate": 5.815001523307162e-08,
"loss": 0.0559,
"step": 744
},
{
"epoch": 1.9053708439897699,
"grad_norm": 1.5322533890589751,
"learning_rate": 5.513532158788193e-08,
"loss": 0.0627,
"step": 745
},
{
"epoch": 1.907928388746803,
"grad_norm": 1.9793699147122528,
"learning_rate": 5.220043477374759e-08,
"loss": 0.0895,
"step": 746
},
{
"epoch": 1.9104859335038364,
"grad_norm": 1.8767345948228846,
"learning_rate": 4.934540215779271e-08,
"loss": 0.062,
"step": 747
},
{
"epoch": 1.9130434782608696,
"grad_norm": 1.8530710891566307,
"learning_rate": 4.657026981834623e-08,
"loss": 0.0725,
"step": 748
},
{
"epoch": 1.9156010230179028,
"grad_norm": 1.5959856775583183,
"learning_rate": 4.3875082544201364e-08,
"loss": 0.066,
"step": 749
},
{
"epoch": 1.918158567774936,
"grad_norm": 1.6751349325104308,
"learning_rate": 4.125988383388957e-08,
"loss": 0.0601,
"step": 750
},
{
"epoch": 1.9207161125319692,
"grad_norm": 1.4734314475739716,
"learning_rate": 3.87247158949805e-08,
"loss": 0.0542,
"step": 751
},
{
"epoch": 1.9232736572890026,
"grad_norm": 1.5924404071141742,
"learning_rate": 3.626961964340203e-08,
"loss": 0.0724,
"step": 752
},
{
"epoch": 1.9258312020460358,
"grad_norm": 1.7826522089646637,
"learning_rate": 3.389463470277576e-08,
"loss": 0.0768,
"step": 753
},
{
"epoch": 1.9283887468030692,
"grad_norm": 1.6983017020209106,
"learning_rate": 3.159979940378088e-08,
"loss": 0.0592,
"step": 754
},
{
"epoch": 1.9309462915601023,
"grad_norm": 2.3762549911017707,
"learning_rate": 2.938515078353521e-08,
"loss": 0.0738,
"step": 755
},
{
"epoch": 1.9335038363171355,
"grad_norm": 1.5837970119495974,
"learning_rate": 2.725072458499567e-08,
"loss": 0.0721,
"step": 756
},
{
"epoch": 1.9360613810741687,
"grad_norm": 1.8572244239313698,
"learning_rate": 2.519655525638376e-08,
"loss": 0.0892,
"step": 757
},
{
"epoch": 1.938618925831202,
"grad_norm": 1.9352353669922568,
"learning_rate": 2.3222675950627106e-08,
"loss": 0.0569,
"step": 758
},
{
"epoch": 1.9411764705882353,
"grad_norm": 1.3105846383254622,
"learning_rate": 2.1329118524827662e-08,
"loss": 0.0489,
"step": 759
},
{
"epoch": 1.9437340153452687,
"grad_norm": 1.6708577410266003,
"learning_rate": 1.9515913539743247e-08,
"loss": 0.063,
"step": 760
},
{
"epoch": 1.9462915601023019,
"grad_norm": 1.4618266441401186,
"learning_rate": 1.7783090259297918e-08,
"loss": 0.0609,
"step": 761
},
{
"epoch": 1.948849104859335,
"grad_norm": 1.4615574349262082,
"learning_rate": 1.613067665010959e-08,
"loss": 0.0449,
"step": 762
},
{
"epoch": 1.9514066496163682,
"grad_norm": 2.07290740502851,
"learning_rate": 1.4558699381034825e-08,
"loss": 0.0781,
"step": 763
},
{
"epoch": 1.9539641943734014,
"grad_norm": 1.6805936964351897,
"learning_rate": 1.3067183822742525e-08,
"loss": 0.0529,
"step": 764
},
{
"epoch": 1.9565217391304348,
"grad_norm": 1.7159228063505934,
"learning_rate": 1.1656154047303691e-08,
"loss": 0.0497,
"step": 765
},
{
"epoch": 1.959079283887468,
"grad_norm": 2.8268253984668634,
"learning_rate": 1.0325632827801745e-08,
"loss": 0.0748,
"step": 766
},
{
"epoch": 1.9616368286445014,
"grad_norm": 1.9193840541256346,
"learning_rate": 9.075641637964483e-09,
"loss": 0.0739,
"step": 767
},
{
"epoch": 1.9641943734015346,
"grad_norm": 1.7559700086749113,
"learning_rate": 7.906200651819907e-09,
"loss": 0.0459,
"step": 768
},
{
"epoch": 1.9667519181585678,
"grad_norm": 1.5806477845906548,
"learning_rate": 6.817328743368712e-09,
"loss": 0.0535,
"step": 769
},
{
"epoch": 1.969309462915601,
"grad_norm": 1.9343097264191595,
"learning_rate": 5.809043486279531e-09,
"loss": 0.0894,
"step": 770
},
{
"epoch": 1.9718670076726341,
"grad_norm": 1.5764067486137285,
"learning_rate": 4.881361153606934e-09,
"loss": 0.0498,
"step": 771
},
{
"epoch": 1.9744245524296675,
"grad_norm": 1.4007499728745463,
"learning_rate": 4.034296717527752e-09,
"loss": 0.0572,
"step": 772
},
{
"epoch": 1.976982097186701,
"grad_norm": 1.615233023165604,
"learning_rate": 3.2678638490996064e-09,
"loss": 0.0536,
"step": 773
},
{
"epoch": 1.979539641943734,
"grad_norm": 1.782038931727096,
"learning_rate": 2.5820749180388573e-09,
"loss": 0.0712,
"step": 774
},
{
"epoch": 1.9820971867007673,
"grad_norm": 1.6623499709903002,
"learning_rate": 1.976940992523546e-09,
"loss": 0.0584,
"step": 775
},
{
"epoch": 1.9846547314578005,
"grad_norm": 2.255461841110182,
"learning_rate": 1.4524718390140913e-09,
"loss": 0.096,
"step": 776
},
{
"epoch": 1.9872122762148337,
"grad_norm": 1.5422911151085525,
"learning_rate": 1.0086759220934162e-09,
"loss": 0.0624,
"step": 777
},
{
"epoch": 1.989769820971867,
"grad_norm": 1.5624451885477848,
"learning_rate": 6.455604043331676e-10,
"loss": 0.0611,
"step": 778
},
{
"epoch": 1.9923273657289002,
"grad_norm": 1.1618378160708118,
"learning_rate": 3.631311461765874e-10,
"loss": 0.0338,
"step": 779
},
{
"epoch": 1.9948849104859336,
"grad_norm": 2.0229168127939583,
"learning_rate": 1.6139270584358823e-10,
"loss": 0.0759,
"step": 780
},
{
"epoch": 1.9974424552429668,
"grad_norm": 1.6493382864517687,
"learning_rate": 4.034833925969928e-11,
"loss": 0.0716,
"step": 781
},
{
"epoch": 2.0,
"grad_norm": 1.867208361253687,
"learning_rate": 0.0,
"loss": 0.0635,
"step": 782
},
{
"epoch": 2.0,
"step": 782,
"total_flos": 4414597447680.0,
"train_loss": 0.1475916886583085,
"train_runtime": 1693.0919,
"train_samples_per_second": 3.695,
"train_steps_per_second": 0.462
}
],
"logging_steps": 1,
"max_steps": 782,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4414597447680.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}