diff --git "a/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-2000/trainer_state.json" "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-2000/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.42003570303475796, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021001785151737897, + "grad_norm": 0.5090809464454651, + "learning_rate": 4.19287211740042e-07, + "loss": 1.1155, + "step": 1 + }, + { + "epoch": 0.00042003570303475793, + "grad_norm": 0.6277585625648499, + "learning_rate": 8.38574423480084e-07, + "loss": 1.3855, + "step": 2 + }, + { + "epoch": 0.000630053554552137, + "grad_norm": 0.7032777070999146, + "learning_rate": 1.257861635220126e-06, + "loss": 1.3905, + "step": 3 + }, + { + "epoch": 0.0008400714060695159, + "grad_norm": 0.745212197303772, + "learning_rate": 1.677148846960168e-06, + "loss": 1.4668, + "step": 4 + }, + { + "epoch": 0.0010500892575868949, + "grad_norm": 0.7577304840087891, + "learning_rate": 2.09643605870021e-06, + "loss": 1.473, + "step": 5 + }, + { + "epoch": 0.001260107109104274, + "grad_norm": 0.7788395881652832, + "learning_rate": 2.515723270440252e-06, + "loss": 1.485, + "step": 6 + }, + { + "epoch": 0.001470124960621653, + "grad_norm": 0.7430889010429382, + "learning_rate": 2.935010482180294e-06, + "loss": 1.4338, + "step": 7 + }, + { + "epoch": 0.0016801428121390317, + "grad_norm": 0.8291558623313904, + "learning_rate": 3.354297693920336e-06, + "loss": 1.4768, + "step": 8 + }, + { + "epoch": 0.0018901606636564107, + "grad_norm": 0.7731107473373413, + "learning_rate": 3.7735849056603773e-06, + "loss": 1.5853, + "step": 9 + }, + { + "epoch": 0.0021001785151737898, + "grad_norm": 0.8241227269172668, + "learning_rate": 4.19287211740042e-06, + "loss": 1.7314, + "step": 10 + }, + { + "epoch": 0.002310196366691169, + "grad_norm": 0.8158630728721619, + "learning_rate": 4.612159329140462e-06, + "loss": 1.5916, + "step": 11 + }, + { + "epoch": 0.002520214218208548, + "grad_norm": 0.8860861659049988, + "learning_rate": 5.031446540880504e-06, + "loss": 1.5115, + "step": 12 + }, + { + "epoch": 0.002730232069725927, + "grad_norm": 0.868651270866394, + "learning_rate": 5.4507337526205454e-06, + "loss": 1.7514, + "step": 13 + }, + { + "epoch": 0.002940249921243306, + "grad_norm": 0.9116117358207703, + "learning_rate": 5.870020964360588e-06, + "loss": 1.5645, + "step": 14 + }, + { + "epoch": 0.003150267772760685, + "grad_norm": 0.8694919347763062, + "learning_rate": 6.289308176100629e-06, + "loss": 1.6326, + "step": 15 + }, + { + "epoch": 0.0033602856242780635, + "grad_norm": 0.8614499568939209, + "learning_rate": 6.708595387840672e-06, + "loss": 1.6224, + "step": 16 + }, + { + "epoch": 0.0035703034757954425, + "grad_norm": 0.8713967800140381, + "learning_rate": 7.127882599580712e-06, + "loss": 1.5923, + "step": 17 + }, + { + "epoch": 0.0037803213273128215, + "grad_norm": 0.8446964025497437, + "learning_rate": 7.547169811320755e-06, + "loss": 1.5843, + "step": 18 + }, + { + "epoch": 0.0039903391788302005, + "grad_norm": 0.8920742869377136, + "learning_rate": 7.966457023060797e-06, + "loss": 1.5485, + "step": 19 + }, + { + "epoch": 0.0042003570303475795, + "grad_norm": 0.9501891136169434, + "learning_rate": 8.38574423480084e-06, + "loss": 1.663, + "step": 20 + }, + { + "epoch": 0.0044103748818649586, + "grad_norm": 0.9179856181144714, + "learning_rate": 8.80503144654088e-06, + "loss": 1.6163, + "step": 21 + }, + { + "epoch": 0.004620392733382338, + "grad_norm": 0.8716169595718384, + "learning_rate": 9.224318658280923e-06, + "loss": 1.5311, + "step": 22 + }, + { + "epoch": 0.004830410584899717, + "grad_norm": 0.9034018516540527, + "learning_rate": 9.643605870020965e-06, + "loss": 1.5315, + "step": 23 + }, + { + "epoch": 0.005040428436417096, + "grad_norm": 0.9811834692955017, + "learning_rate": 1.0062893081761008e-05, + "loss": 1.5356, + "step": 24 + }, + { + "epoch": 0.005250446287934475, + "grad_norm": 0.8846603035926819, + "learning_rate": 1.0482180293501048e-05, + "loss": 1.5815, + "step": 25 + }, + { + "epoch": 0.005460464139451854, + "grad_norm": 0.8842517137527466, + "learning_rate": 1.0901467505241091e-05, + "loss": 1.5628, + "step": 26 + }, + { + "epoch": 0.005670481990969233, + "grad_norm": 0.9207525253295898, + "learning_rate": 1.1320754716981132e-05, + "loss": 1.5623, + "step": 27 + }, + { + "epoch": 0.005880499842486612, + "grad_norm": 0.9082942605018616, + "learning_rate": 1.1740041928721176e-05, + "loss": 1.4599, + "step": 28 + }, + { + "epoch": 0.006090517694003991, + "grad_norm": 0.8724138736724854, + "learning_rate": 1.2159329140461215e-05, + "loss": 1.5282, + "step": 29 + }, + { + "epoch": 0.00630053554552137, + "grad_norm": 0.8738006353378296, + "learning_rate": 1.2578616352201259e-05, + "loss": 1.4782, + "step": 30 + }, + { + "epoch": 0.006510553397038748, + "grad_norm": 0.9410291910171509, + "learning_rate": 1.29979035639413e-05, + "loss": 1.3856, + "step": 31 + }, + { + "epoch": 0.006720571248556127, + "grad_norm": 0.9309423565864563, + "learning_rate": 1.3417190775681343e-05, + "loss": 1.4267, + "step": 32 + }, + { + "epoch": 0.006930589100073506, + "grad_norm": 0.9442999362945557, + "learning_rate": 1.3836477987421385e-05, + "loss": 1.3706, + "step": 33 + }, + { + "epoch": 0.007140606951590885, + "grad_norm": 0.9511269927024841, + "learning_rate": 1.4255765199161425e-05, + "loss": 1.26, + "step": 34 + }, + { + "epoch": 0.007350624803108264, + "grad_norm": 1.0389297008514404, + "learning_rate": 1.467505241090147e-05, + "loss": 1.235, + "step": 35 + }, + { + "epoch": 0.007560642654625643, + "grad_norm": 1.0033001899719238, + "learning_rate": 1.509433962264151e-05, + "loss": 1.2687, + "step": 36 + }, + { + "epoch": 0.007770660506143022, + "grad_norm": 1.075852632522583, + "learning_rate": 1.5513626834381552e-05, + "loss": 1.2762, + "step": 37 + }, + { + "epoch": 0.007980678357660401, + "grad_norm": 1.0721476078033447, + "learning_rate": 1.5932914046121594e-05, + "loss": 1.1935, + "step": 38 + }, + { + "epoch": 0.00819069620917778, + "grad_norm": 1.0784581899642944, + "learning_rate": 1.6352201257861635e-05, + "loss": 1.1119, + "step": 39 + }, + { + "epoch": 0.008400714060695159, + "grad_norm": 1.1390137672424316, + "learning_rate": 1.677148846960168e-05, + "loss": 1.0373, + "step": 40 + }, + { + "epoch": 0.008610731912212538, + "grad_norm": 1.3073922395706177, + "learning_rate": 1.719077568134172e-05, + "loss": 1.1536, + "step": 41 + }, + { + "epoch": 0.008820749763729917, + "grad_norm": 1.3248019218444824, + "learning_rate": 1.761006289308176e-05, + "loss": 0.9316, + "step": 42 + }, + { + "epoch": 0.009030767615247296, + "grad_norm": 1.3569798469543457, + "learning_rate": 1.8029350104821805e-05, + "loss": 0.8881, + "step": 43 + }, + { + "epoch": 0.009240785466764675, + "grad_norm": 1.3192838430404663, + "learning_rate": 1.8448637316561846e-05, + "loss": 0.8825, + "step": 44 + }, + { + "epoch": 0.009450803318282054, + "grad_norm": 1.1947859525680542, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.9415, + "step": 45 + }, + { + "epoch": 0.009660821169799433, + "grad_norm": 1.1684753894805908, + "learning_rate": 1.928721174004193e-05, + "loss": 0.83, + "step": 46 + }, + { + "epoch": 0.009870839021316812, + "grad_norm": 1.1097474098205566, + "learning_rate": 1.970649895178197e-05, + "loss": 0.7248, + "step": 47 + }, + { + "epoch": 0.010080856872834191, + "grad_norm": 1.0564842224121094, + "learning_rate": 2.0125786163522016e-05, + "loss": 0.7427, + "step": 48 + }, + { + "epoch": 0.01029087472435157, + "grad_norm": 0.9865881204605103, + "learning_rate": 2.0545073375262054e-05, + "loss": 0.877, + "step": 49 + }, + { + "epoch": 0.01050089257586895, + "grad_norm": 1.072039246559143, + "learning_rate": 2.0964360587002095e-05, + "loss": 0.8473, + "step": 50 + }, + { + "epoch": 0.010710910427386328, + "grad_norm": 0.5596430897712708, + "learning_rate": 2.138364779874214e-05, + "loss": 0.4599, + "step": 51 + }, + { + "epoch": 0.010920928278903707, + "grad_norm": 0.6180581450462341, + "learning_rate": 2.1802935010482182e-05, + "loss": 0.5215, + "step": 52 + }, + { + "epoch": 0.011130946130421086, + "grad_norm": 0.6805194616317749, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.6148, + "step": 53 + }, + { + "epoch": 0.011340963981938465, + "grad_norm": 0.7125585079193115, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.5615, + "step": 54 + }, + { + "epoch": 0.011550981833455844, + "grad_norm": 0.6816964745521545, + "learning_rate": 2.3060796645702306e-05, + "loss": 0.478, + "step": 55 + }, + { + "epoch": 0.011760999684973223, + "grad_norm": 0.5821985602378845, + "learning_rate": 2.348008385744235e-05, + "loss": 0.5604, + "step": 56 + }, + { + "epoch": 0.011971017536490602, + "grad_norm": 0.642721951007843, + "learning_rate": 2.3899371069182393e-05, + "loss": 0.4068, + "step": 57 + }, + { + "epoch": 0.012181035388007981, + "grad_norm": 0.5806999206542969, + "learning_rate": 2.431865828092243e-05, + "loss": 0.4711, + "step": 58 + }, + { + "epoch": 0.01239105323952536, + "grad_norm": 0.6702911257743835, + "learning_rate": 2.4737945492662476e-05, + "loss": 0.4601, + "step": 59 + }, + { + "epoch": 0.01260107109104274, + "grad_norm": 0.6345894932746887, + "learning_rate": 2.5157232704402517e-05, + "loss": 0.4172, + "step": 60 + }, + { + "epoch": 0.012811088942560117, + "grad_norm": 0.6444422602653503, + "learning_rate": 2.5576519916142562e-05, + "loss": 0.4635, + "step": 61 + }, + { + "epoch": 0.013021106794077496, + "grad_norm": 0.6568206548690796, + "learning_rate": 2.59958071278826e-05, + "loss": 0.4327, + "step": 62 + }, + { + "epoch": 0.013231124645594875, + "grad_norm": 0.6627638936042786, + "learning_rate": 2.641509433962264e-05, + "loss": 0.4935, + "step": 63 + }, + { + "epoch": 0.013441142497112254, + "grad_norm": 0.6746403574943542, + "learning_rate": 2.6834381551362687e-05, + "loss": 0.4013, + "step": 64 + }, + { + "epoch": 0.013651160348629633, + "grad_norm": 0.7141286134719849, + "learning_rate": 2.7253668763102725e-05, + "loss": 0.5162, + "step": 65 + }, + { + "epoch": 0.013861178200147012, + "grad_norm": 0.779960572719574, + "learning_rate": 2.767295597484277e-05, + "loss": 0.4126, + "step": 66 + }, + { + "epoch": 0.014071196051664391, + "grad_norm": 0.6626395583152771, + "learning_rate": 2.809224318658281e-05, + "loss": 0.3925, + "step": 67 + }, + { + "epoch": 0.01428121390318177, + "grad_norm": 0.6545393466949463, + "learning_rate": 2.851153039832285e-05, + "loss": 0.3152, + "step": 68 + }, + { + "epoch": 0.014491231754699149, + "grad_norm": 0.7004114389419556, + "learning_rate": 2.8930817610062894e-05, + "loss": 0.4223, + "step": 69 + }, + { + "epoch": 0.014701249606216528, + "grad_norm": 0.6912452578544617, + "learning_rate": 2.935010482180294e-05, + "loss": 0.3089, + "step": 70 + }, + { + "epoch": 0.014911267457733907, + "grad_norm": 0.7729060649871826, + "learning_rate": 2.976939203354298e-05, + "loss": 0.4045, + "step": 71 + }, + { + "epoch": 0.015121285309251286, + "grad_norm": 0.7606898546218872, + "learning_rate": 3.018867924528302e-05, + "loss": 0.3079, + "step": 72 + }, + { + "epoch": 0.015331303160768665, + "grad_norm": 0.6202028393745422, + "learning_rate": 3.060796645702306e-05, + "loss": 0.3833, + "step": 73 + }, + { + "epoch": 0.015541321012286044, + "grad_norm": 0.6014758348464966, + "learning_rate": 3.1027253668763105e-05, + "loss": 0.3815, + "step": 74 + }, + { + "epoch": 0.015751338863803425, + "grad_norm": 0.6792122721672058, + "learning_rate": 3.144654088050314e-05, + "loss": 0.3383, + "step": 75 + }, + { + "epoch": 0.015961356715320802, + "grad_norm": 0.7135879993438721, + "learning_rate": 3.186582809224319e-05, + "loss": 0.3744, + "step": 76 + }, + { + "epoch": 0.016171374566838183, + "grad_norm": 0.6972818374633789, + "learning_rate": 3.228511530398323e-05, + "loss": 0.3256, + "step": 77 + }, + { + "epoch": 0.01638139241835556, + "grad_norm": 0.5925168395042419, + "learning_rate": 3.270440251572327e-05, + "loss": 0.3309, + "step": 78 + }, + { + "epoch": 0.01659141026987294, + "grad_norm": 0.7750416994094849, + "learning_rate": 3.3123689727463316e-05, + "loss": 0.4142, + "step": 79 + }, + { + "epoch": 0.016801428121390318, + "grad_norm": 0.7466484904289246, + "learning_rate": 3.354297693920336e-05, + "loss": 0.2527, + "step": 80 + }, + { + "epoch": 0.0170114459729077, + "grad_norm": 0.7709718942642212, + "learning_rate": 3.39622641509434e-05, + "loss": 0.3717, + "step": 81 + }, + { + "epoch": 0.017221463824425076, + "grad_norm": 0.6134454607963562, + "learning_rate": 3.438155136268344e-05, + "loss": 0.2969, + "step": 82 + }, + { + "epoch": 0.017431481675942453, + "grad_norm": 0.6442283391952515, + "learning_rate": 3.480083857442348e-05, + "loss": 0.3009, + "step": 83 + }, + { + "epoch": 0.017641499527459834, + "grad_norm": 0.6788150072097778, + "learning_rate": 3.522012578616352e-05, + "loss": 0.309, + "step": 84 + }, + { + "epoch": 0.01785151737897721, + "grad_norm": 0.7172322869300842, + "learning_rate": 3.5639412997903565e-05, + "loss": 0.3602, + "step": 85 + }, + { + "epoch": 0.018061535230494592, + "grad_norm": 0.7475742697715759, + "learning_rate": 3.605870020964361e-05, + "loss": 0.1889, + "step": 86 + }, + { + "epoch": 0.01827155308201197, + "grad_norm": 0.7164073586463928, + "learning_rate": 3.647798742138365e-05, + "loss": 0.2062, + "step": 87 + }, + { + "epoch": 0.01848157093352935, + "grad_norm": 0.7514247298240662, + "learning_rate": 3.689727463312369e-05, + "loss": 0.2426, + "step": 88 + }, + { + "epoch": 0.018691588785046728, + "grad_norm": 0.8898234963417053, + "learning_rate": 3.731656184486374e-05, + "loss": 0.3759, + "step": 89 + }, + { + "epoch": 0.01890160663656411, + "grad_norm": 0.8034729361534119, + "learning_rate": 3.7735849056603776e-05, + "loss": 0.2547, + "step": 90 + }, + { + "epoch": 0.019111624488081486, + "grad_norm": 0.771716296672821, + "learning_rate": 3.8155136268343814e-05, + "loss": 0.2125, + "step": 91 + }, + { + "epoch": 0.019321642339598866, + "grad_norm": 0.811174213886261, + "learning_rate": 3.857442348008386e-05, + "loss": 0.3535, + "step": 92 + }, + { + "epoch": 0.019531660191116244, + "grad_norm": 1.0474952459335327, + "learning_rate": 3.8993710691823904e-05, + "loss": 0.3278, + "step": 93 + }, + { + "epoch": 0.019741678042633624, + "grad_norm": 0.752088725566864, + "learning_rate": 3.941299790356394e-05, + "loss": 0.2574, + "step": 94 + }, + { + "epoch": 0.019951695894151, + "grad_norm": 0.9202740788459778, + "learning_rate": 3.983228511530399e-05, + "loss": 0.2618, + "step": 95 + }, + { + "epoch": 0.020161713745668382, + "grad_norm": 0.663686990737915, + "learning_rate": 4.025157232704403e-05, + "loss": 0.1981, + "step": 96 + }, + { + "epoch": 0.02037173159718576, + "grad_norm": 0.7075244784355164, + "learning_rate": 4.067085953878407e-05, + "loss": 0.195, + "step": 97 + }, + { + "epoch": 0.02058174944870314, + "grad_norm": 0.8226995468139648, + "learning_rate": 4.109014675052411e-05, + "loss": 0.3464, + "step": 98 + }, + { + "epoch": 0.020791767300220518, + "grad_norm": 0.826926589012146, + "learning_rate": 4.150943396226415e-05, + "loss": 0.241, + "step": 99 + }, + { + "epoch": 0.0210017851517379, + "grad_norm": 0.8767513632774353, + "learning_rate": 4.192872117400419e-05, + "loss": 0.33, + "step": 100 + }, + { + "epoch": 0.021211803003255276, + "grad_norm": 0.9166819453239441, + "learning_rate": 4.2348008385744236e-05, + "loss": 0.3528, + "step": 101 + }, + { + "epoch": 0.021421820854772657, + "grad_norm": 0.6607112288475037, + "learning_rate": 4.276729559748428e-05, + "loss": 0.3294, + "step": 102 + }, + { + "epoch": 0.021631838706290034, + "grad_norm": 0.5891725420951843, + "learning_rate": 4.318658280922432e-05, + "loss": 0.2523, + "step": 103 + }, + { + "epoch": 0.021841856557807415, + "grad_norm": 0.5484351515769958, + "learning_rate": 4.3605870020964364e-05, + "loss": 0.3563, + "step": 104 + }, + { + "epoch": 0.022051874409324792, + "grad_norm": 0.6384206414222717, + "learning_rate": 4.402515723270441e-05, + "loss": 0.5014, + "step": 105 + }, + { + "epoch": 0.022261892260842173, + "grad_norm": 0.6228074431419373, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.297, + "step": 106 + }, + { + "epoch": 0.02247191011235955, + "grad_norm": 0.6993734240531921, + "learning_rate": 4.4863731656184485e-05, + "loss": 0.3416, + "step": 107 + }, + { + "epoch": 0.02268192796387693, + "grad_norm": 0.5191211104393005, + "learning_rate": 4.528301886792453e-05, + "loss": 0.2403, + "step": 108 + }, + { + "epoch": 0.022891945815394308, + "grad_norm": 0.5719013214111328, + "learning_rate": 4.570230607966457e-05, + "loss": 0.2226, + "step": 109 + }, + { + "epoch": 0.02310196366691169, + "grad_norm": 0.5222904682159424, + "learning_rate": 4.612159329140461e-05, + "loss": 0.2119, + "step": 110 + }, + { + "epoch": 0.023311981518429066, + "grad_norm": 0.4741697609424591, + "learning_rate": 4.654088050314466e-05, + "loss": 0.2076, + "step": 111 + }, + { + "epoch": 0.023521999369946447, + "grad_norm": 0.5350250005722046, + "learning_rate": 4.69601677148847e-05, + "loss": 0.2342, + "step": 112 + }, + { + "epoch": 0.023732017221463824, + "grad_norm": 0.6532084345817566, + "learning_rate": 4.737945492662474e-05, + "loss": 0.3541, + "step": 113 + }, + { + "epoch": 0.023942035072981205, + "grad_norm": 0.6158542633056641, + "learning_rate": 4.7798742138364785e-05, + "loss": 0.2586, + "step": 114 + }, + { + "epoch": 0.024152052924498582, + "grad_norm": 0.7820281982421875, + "learning_rate": 4.8218029350104823e-05, + "loss": 0.472, + "step": 115 + }, + { + "epoch": 0.024362070776015963, + "grad_norm": 0.6096176505088806, + "learning_rate": 4.863731656184486e-05, + "loss": 0.3014, + "step": 116 + }, + { + "epoch": 0.02457208862753334, + "grad_norm": 0.5152641534805298, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.2032, + "step": 117 + }, + { + "epoch": 0.02478210647905072, + "grad_norm": 0.6049755215644836, + "learning_rate": 4.947589098532495e-05, + "loss": 0.2932, + "step": 118 + }, + { + "epoch": 0.024992124330568098, + "grad_norm": 0.5203216075897217, + "learning_rate": 4.989517819706499e-05, + "loss": 0.1978, + "step": 119 + }, + { + "epoch": 0.02520214218208548, + "grad_norm": 0.6881438493728638, + "learning_rate": 5.0314465408805034e-05, + "loss": 0.3873, + "step": 120 + }, + { + "epoch": 0.025412160033602856, + "grad_norm": 0.602206289768219, + "learning_rate": 5.073375262054507e-05, + "loss": 0.2508, + "step": 121 + }, + { + "epoch": 0.025622177885120234, + "grad_norm": 0.7059246897697449, + "learning_rate": 5.1153039832285124e-05, + "loss": 0.2661, + "step": 122 + }, + { + "epoch": 0.025832195736637614, + "grad_norm": 0.44054412841796875, + "learning_rate": 5.157232704402516e-05, + "loss": 0.177, + "step": 123 + }, + { + "epoch": 0.02604221358815499, + "grad_norm": 0.6321287155151367, + "learning_rate": 5.19916142557652e-05, + "loss": 0.2849, + "step": 124 + }, + { + "epoch": 0.026252231439672372, + "grad_norm": 0.7430282235145569, + "learning_rate": 5.2410901467505245e-05, + "loss": 0.3353, + "step": 125 + }, + { + "epoch": 0.02646224929118975, + "grad_norm": 0.6884610056877136, + "learning_rate": 5.283018867924528e-05, + "loss": 0.2935, + "step": 126 + }, + { + "epoch": 0.02667226714270713, + "grad_norm": 0.8021960854530334, + "learning_rate": 5.324947589098532e-05, + "loss": 0.2609, + "step": 127 + }, + { + "epoch": 0.026882284994224508, + "grad_norm": 0.5545848608016968, + "learning_rate": 5.366876310272537e-05, + "loss": 0.1972, + "step": 128 + }, + { + "epoch": 0.02709230284574189, + "grad_norm": 0.6628248691558838, + "learning_rate": 5.408805031446541e-05, + "loss": 0.2184, + "step": 129 + }, + { + "epoch": 0.027302320697259266, + "grad_norm": 0.5908805131912231, + "learning_rate": 5.450733752620545e-05, + "loss": 0.2029, + "step": 130 + }, + { + "epoch": 0.027512338548776646, + "grad_norm": 0.6377450823783875, + "learning_rate": 5.49266247379455e-05, + "loss": 0.2382, + "step": 131 + }, + { + "epoch": 0.027722356400294024, + "grad_norm": 0.7006211876869202, + "learning_rate": 5.534591194968554e-05, + "loss": 0.1936, + "step": 132 + }, + { + "epoch": 0.027932374251811404, + "grad_norm": 0.5962005257606506, + "learning_rate": 5.576519916142558e-05, + "loss": 0.2922, + "step": 133 + }, + { + "epoch": 0.028142392103328782, + "grad_norm": 0.6030206084251404, + "learning_rate": 5.618448637316562e-05, + "loss": 0.1629, + "step": 134 + }, + { + "epoch": 0.028352409954846162, + "grad_norm": 0.7888013124465942, + "learning_rate": 5.660377358490566e-05, + "loss": 0.2866, + "step": 135 + }, + { + "epoch": 0.02856242780636354, + "grad_norm": 0.5116386413574219, + "learning_rate": 5.70230607966457e-05, + "loss": 0.1963, + "step": 136 + }, + { + "epoch": 0.02877244565788092, + "grad_norm": 0.6759427785873413, + "learning_rate": 5.744234800838575e-05, + "loss": 0.2412, + "step": 137 + }, + { + "epoch": 0.028982463509398298, + "grad_norm": 0.8643584847450256, + "learning_rate": 5.786163522012579e-05, + "loss": 0.2277, + "step": 138 + }, + { + "epoch": 0.02919248136091568, + "grad_norm": 0.639639139175415, + "learning_rate": 5.8280922431865826e-05, + "loss": 0.2286, + "step": 139 + }, + { + "epoch": 0.029402499212433056, + "grad_norm": 0.6094908714294434, + "learning_rate": 5.870020964360588e-05, + "loss": 0.1656, + "step": 140 + }, + { + "epoch": 0.029612517063950437, + "grad_norm": 0.7927185297012329, + "learning_rate": 5.9119496855345916e-05, + "loss": 0.2436, + "step": 141 + }, + { + "epoch": 0.029822534915467814, + "grad_norm": 0.8780869841575623, + "learning_rate": 5.953878406708596e-05, + "loss": 0.2614, + "step": 142 + }, + { + "epoch": 0.030032552766985195, + "grad_norm": 0.5985304117202759, + "learning_rate": 5.9958071278826e-05, + "loss": 0.2268, + "step": 143 + }, + { + "epoch": 0.030242570618502572, + "grad_norm": 0.6452706456184387, + "learning_rate": 6.037735849056604e-05, + "loss": 0.211, + "step": 144 + }, + { + "epoch": 0.030452588470019953, + "grad_norm": 0.8015931844711304, + "learning_rate": 6.079664570230609e-05, + "loss": 0.3532, + "step": 145 + }, + { + "epoch": 0.03066260632153733, + "grad_norm": 0.667226254940033, + "learning_rate": 6.121593291404612e-05, + "loss": 0.2051, + "step": 146 + }, + { + "epoch": 0.03087262417305471, + "grad_norm": 0.6942270398139954, + "learning_rate": 6.163522012578616e-05, + "loss": 0.2516, + "step": 147 + }, + { + "epoch": 0.031082642024572088, + "grad_norm": 0.845588743686676, + "learning_rate": 6.205450733752621e-05, + "loss": 0.257, + "step": 148 + }, + { + "epoch": 0.031292659876089465, + "grad_norm": 0.6104562878608704, + "learning_rate": 6.247379454926625e-05, + "loss": 0.246, + "step": 149 + }, + { + "epoch": 0.03150267772760685, + "grad_norm": 0.7243993282318115, + "learning_rate": 6.289308176100629e-05, + "loss": 0.2623, + "step": 150 + }, + { + "epoch": 0.03171269557912423, + "grad_norm": 0.6479102373123169, + "learning_rate": 6.331236897274634e-05, + "loss": 0.3154, + "step": 151 + }, + { + "epoch": 0.031922713430641604, + "grad_norm": 0.6088507175445557, + "learning_rate": 6.373165618448638e-05, + "loss": 0.3639, + "step": 152 + }, + { + "epoch": 0.03213273128215898, + "grad_norm": 0.5590083599090576, + "learning_rate": 6.415094339622641e-05, + "loss": 0.2753, + "step": 153 + }, + { + "epoch": 0.032342749133676366, + "grad_norm": 0.6644802093505859, + "learning_rate": 6.457023060796647e-05, + "loss": 0.2722, + "step": 154 + }, + { + "epoch": 0.03255276698519374, + "grad_norm": 0.6034846901893616, + "learning_rate": 6.49895178197065e-05, + "loss": 0.3084, + "step": 155 + }, + { + "epoch": 0.03276278483671112, + "grad_norm": 0.897366464138031, + "learning_rate": 6.540880503144654e-05, + "loss": 0.2834, + "step": 156 + }, + { + "epoch": 0.0329728026882285, + "grad_norm": 0.7516223788261414, + "learning_rate": 6.58280922431866e-05, + "loss": 0.2515, + "step": 157 + }, + { + "epoch": 0.03318282053974588, + "grad_norm": 0.712957501411438, + "learning_rate": 6.624737945492663e-05, + "loss": 0.217, + "step": 158 + }, + { + "epoch": 0.03339283839126326, + "grad_norm": 0.6373322010040283, + "learning_rate": 6.666666666666667e-05, + "loss": 0.3091, + "step": 159 + }, + { + "epoch": 0.033602856242780636, + "grad_norm": 0.6305301189422607, + "learning_rate": 6.708595387840672e-05, + "loss": 0.1965, + "step": 160 + }, + { + "epoch": 0.033812874094298014, + "grad_norm": 0.6340491771697998, + "learning_rate": 6.750524109014676e-05, + "loss": 0.2316, + "step": 161 + }, + { + "epoch": 0.0340228919458154, + "grad_norm": 0.6992335915565491, + "learning_rate": 6.79245283018868e-05, + "loss": 0.321, + "step": 162 + }, + { + "epoch": 0.034232909797332775, + "grad_norm": 0.723899245262146, + "learning_rate": 6.834381551362684e-05, + "loss": 0.2057, + "step": 163 + }, + { + "epoch": 0.03444292764885015, + "grad_norm": 0.6245738863945007, + "learning_rate": 6.876310272536687e-05, + "loss": 0.2367, + "step": 164 + }, + { + "epoch": 0.03465294550036753, + "grad_norm": 0.716299295425415, + "learning_rate": 6.918238993710691e-05, + "loss": 0.3107, + "step": 165 + }, + { + "epoch": 0.03486296335188491, + "grad_norm": 0.8374738097190857, + "learning_rate": 6.960167714884696e-05, + "loss": 0.4097, + "step": 166 + }, + { + "epoch": 0.03507298120340229, + "grad_norm": 0.7812545299530029, + "learning_rate": 7.0020964360587e-05, + "loss": 0.389, + "step": 167 + }, + { + "epoch": 0.03528299905491967, + "grad_norm": 0.516504168510437, + "learning_rate": 7.044025157232704e-05, + "loss": 0.2321, + "step": 168 + }, + { + "epoch": 0.035493016906437046, + "grad_norm": 0.5948511958122253, + "learning_rate": 7.085953878406709e-05, + "loss": 0.2075, + "step": 169 + }, + { + "epoch": 0.03570303475795442, + "grad_norm": 0.5658239126205444, + "learning_rate": 7.127882599580713e-05, + "loss": 0.2366, + "step": 170 + }, + { + "epoch": 0.03591305260947181, + "grad_norm": 0.44888898730278015, + "learning_rate": 7.169811320754717e-05, + "loss": 0.226, + "step": 171 + }, + { + "epoch": 0.036123070460989185, + "grad_norm": 0.5403774380683899, + "learning_rate": 7.211740041928722e-05, + "loss": 0.2887, + "step": 172 + }, + { + "epoch": 0.03633308831250656, + "grad_norm": 0.5742720365524292, + "learning_rate": 7.253668763102726e-05, + "loss": 0.1841, + "step": 173 + }, + { + "epoch": 0.03654310616402394, + "grad_norm": 0.7217287421226501, + "learning_rate": 7.29559748427673e-05, + "loss": 0.283, + "step": 174 + }, + { + "epoch": 0.03675312401554132, + "grad_norm": 0.6517660021781921, + "learning_rate": 7.337526205450735e-05, + "loss": 0.277, + "step": 175 + }, + { + "epoch": 0.0369631418670587, + "grad_norm": 0.5237565040588379, + "learning_rate": 7.379454926624739e-05, + "loss": 0.2764, + "step": 176 + }, + { + "epoch": 0.03717315971857608, + "grad_norm": 0.5715314745903015, + "learning_rate": 7.421383647798742e-05, + "loss": 0.2947, + "step": 177 + }, + { + "epoch": 0.037383177570093455, + "grad_norm": 0.39689743518829346, + "learning_rate": 7.463312368972748e-05, + "loss": 0.1478, + "step": 178 + }, + { + "epoch": 0.03759319542161084, + "grad_norm": 0.62773197889328, + "learning_rate": 7.505241090146751e-05, + "loss": 0.2688, + "step": 179 + }, + { + "epoch": 0.03780321327312822, + "grad_norm": 0.5422549247741699, + "learning_rate": 7.547169811320755e-05, + "loss": 0.2852, + "step": 180 + }, + { + "epoch": 0.038013231124645594, + "grad_norm": 0.7973243594169617, + "learning_rate": 7.589098532494759e-05, + "loss": 0.2414, + "step": 181 + }, + { + "epoch": 0.03822324897616297, + "grad_norm": 0.596788227558136, + "learning_rate": 7.631027253668763e-05, + "loss": 0.2979, + "step": 182 + }, + { + "epoch": 0.038433266827680355, + "grad_norm": 0.7164194583892822, + "learning_rate": 7.672955974842768e-05, + "loss": 0.3195, + "step": 183 + }, + { + "epoch": 0.03864328467919773, + "grad_norm": 0.6374505758285522, + "learning_rate": 7.714884696016772e-05, + "loss": 0.2244, + "step": 184 + }, + { + "epoch": 0.03885330253071511, + "grad_norm": 0.7066443562507629, + "learning_rate": 7.756813417190776e-05, + "loss": 0.3328, + "step": 185 + }, + { + "epoch": 0.03906332038223249, + "grad_norm": 0.5930470824241638, + "learning_rate": 7.798742138364781e-05, + "loss": 0.208, + "step": 186 + }, + { + "epoch": 0.03927333823374987, + "grad_norm": 0.7578011155128479, + "learning_rate": 7.840670859538785e-05, + "loss": 0.3468, + "step": 187 + }, + { + "epoch": 0.03948335608526725, + "grad_norm": 0.5424745678901672, + "learning_rate": 7.882599580712788e-05, + "loss": 0.159, + "step": 188 + }, + { + "epoch": 0.039693373936784626, + "grad_norm": 0.6554968953132629, + "learning_rate": 7.924528301886794e-05, + "loss": 0.1718, + "step": 189 + }, + { + "epoch": 0.039903391788302, + "grad_norm": 0.596862256526947, + "learning_rate": 7.966457023060797e-05, + "loss": 0.2238, + "step": 190 + }, + { + "epoch": 0.04011340963981939, + "grad_norm": 0.7238299250602722, + "learning_rate": 8.008385744234801e-05, + "loss": 0.2322, + "step": 191 + }, + { + "epoch": 0.040323427491336765, + "grad_norm": 0.6230559349060059, + "learning_rate": 8.050314465408806e-05, + "loss": 0.166, + "step": 192 + }, + { + "epoch": 0.04053344534285414, + "grad_norm": 0.63409823179245, + "learning_rate": 8.09224318658281e-05, + "loss": 0.2431, + "step": 193 + }, + { + "epoch": 0.04074346319437152, + "grad_norm": 0.43581536412239075, + "learning_rate": 8.134171907756814e-05, + "loss": 0.1758, + "step": 194 + }, + { + "epoch": 0.040953481045888904, + "grad_norm": 0.5425090789794922, + "learning_rate": 8.176100628930818e-05, + "loss": 0.1214, + "step": 195 + }, + { + "epoch": 0.04116349889740628, + "grad_norm": 0.44201138615608215, + "learning_rate": 8.218029350104822e-05, + "loss": 0.148, + "step": 196 + }, + { + "epoch": 0.04137351674892366, + "grad_norm": 0.8185025453567505, + "learning_rate": 8.259958071278825e-05, + "loss": 0.2788, + "step": 197 + }, + { + "epoch": 0.041583534600441036, + "grad_norm": 0.5838762521743774, + "learning_rate": 8.30188679245283e-05, + "loss": 0.2547, + "step": 198 + }, + { + "epoch": 0.04179355245195842, + "grad_norm": 0.6128750443458557, + "learning_rate": 8.343815513626834e-05, + "loss": 0.2107, + "step": 199 + }, + { + "epoch": 0.0420035703034758, + "grad_norm": 0.6906862854957581, + "learning_rate": 8.385744234800838e-05, + "loss": 0.2066, + "step": 200 + }, + { + "epoch": 0.042213588154993174, + "grad_norm": 0.46028971672058105, + "learning_rate": 8.427672955974843e-05, + "loss": 0.3119, + "step": 201 + }, + { + "epoch": 0.04242360600651055, + "grad_norm": 0.45515576004981995, + "learning_rate": 8.469601677148847e-05, + "loss": 0.3191, + "step": 202 + }, + { + "epoch": 0.04263362385802793, + "grad_norm": 0.5129204392433167, + "learning_rate": 8.511530398322851e-05, + "loss": 0.2978, + "step": 203 + }, + { + "epoch": 0.04284364170954531, + "grad_norm": 0.5413036942481995, + "learning_rate": 8.553459119496856e-05, + "loss": 0.2489, + "step": 204 + }, + { + "epoch": 0.04305365956106269, + "grad_norm": 0.5111532211303711, + "learning_rate": 8.59538784067086e-05, + "loss": 0.2322, + "step": 205 + }, + { + "epoch": 0.04326367741258007, + "grad_norm": 0.50229412317276, + "learning_rate": 8.637316561844864e-05, + "loss": 0.2858, + "step": 206 + }, + { + "epoch": 0.043473695264097445, + "grad_norm": 0.5821024179458618, + "learning_rate": 8.679245283018869e-05, + "loss": 0.3118, + "step": 207 + }, + { + "epoch": 0.04368371311561483, + "grad_norm": 0.5523480176925659, + "learning_rate": 8.721174004192873e-05, + "loss": 0.3757, + "step": 208 + }, + { + "epoch": 0.043893730967132207, + "grad_norm": 0.721248209476471, + "learning_rate": 8.763102725366877e-05, + "loss": 0.3196, + "step": 209 + }, + { + "epoch": 0.044103748818649584, + "grad_norm": 0.47591283917427063, + "learning_rate": 8.805031446540882e-05, + "loss": 0.233, + "step": 210 + }, + { + "epoch": 0.04431376667016696, + "grad_norm": 0.5817727446556091, + "learning_rate": 8.846960167714886e-05, + "loss": 0.2924, + "step": 211 + }, + { + "epoch": 0.044523784521684345, + "grad_norm": 0.5370981097221375, + "learning_rate": 8.888888888888889e-05, + "loss": 0.2487, + "step": 212 + }, + { + "epoch": 0.04473380237320172, + "grad_norm": 0.47605571150779724, + "learning_rate": 8.930817610062893e-05, + "loss": 0.2249, + "step": 213 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 0.6315035223960876, + "learning_rate": 8.972746331236897e-05, + "loss": 0.2805, + "step": 214 + }, + { + "epoch": 0.04515383807623648, + "grad_norm": 0.7511045932769775, + "learning_rate": 9.014675052410901e-05, + "loss": 0.2235, + "step": 215 + }, + { + "epoch": 0.04536385592775386, + "grad_norm": 0.6990196704864502, + "learning_rate": 9.056603773584906e-05, + "loss": 0.2548, + "step": 216 + }, + { + "epoch": 0.04557387377927124, + "grad_norm": 0.6954676508903503, + "learning_rate": 9.09853249475891e-05, + "loss": 0.2232, + "step": 217 + }, + { + "epoch": 0.045783891630788616, + "grad_norm": 0.6898313164710999, + "learning_rate": 9.140461215932914e-05, + "loss": 0.2241, + "step": 218 + }, + { + "epoch": 0.04599390948230599, + "grad_norm": 0.649360716342926, + "learning_rate": 9.182389937106919e-05, + "loss": 0.3236, + "step": 219 + }, + { + "epoch": 0.04620392733382338, + "grad_norm": 0.6722248792648315, + "learning_rate": 9.224318658280923e-05, + "loss": 0.4025, + "step": 220 + }, + { + "epoch": 0.046413945185340755, + "grad_norm": 0.8752652406692505, + "learning_rate": 9.266247379454928e-05, + "loss": 0.2937, + "step": 221 + }, + { + "epoch": 0.04662396303685813, + "grad_norm": 0.6925809979438782, + "learning_rate": 9.308176100628931e-05, + "loss": 0.2719, + "step": 222 + }, + { + "epoch": 0.04683398088837551, + "grad_norm": 0.6006962656974792, + "learning_rate": 9.350104821802935e-05, + "loss": 0.3025, + "step": 223 + }, + { + "epoch": 0.047043998739892894, + "grad_norm": 0.7841734290122986, + "learning_rate": 9.39203354297694e-05, + "loss": 0.267, + "step": 224 + }, + { + "epoch": 0.04725401659141027, + "grad_norm": 0.6895375847816467, + "learning_rate": 9.433962264150944e-05, + "loss": 0.185, + "step": 225 + }, + { + "epoch": 0.04746403444292765, + "grad_norm": 0.4874520003795624, + "learning_rate": 9.475890985324948e-05, + "loss": 0.1967, + "step": 226 + }, + { + "epoch": 0.047674052294445025, + "grad_norm": 0.5807353258132935, + "learning_rate": 9.517819706498953e-05, + "loss": 0.2649, + "step": 227 + }, + { + "epoch": 0.04788407014596241, + "grad_norm": 0.5833479166030884, + "learning_rate": 9.559748427672957e-05, + "loss": 0.2506, + "step": 228 + }, + { + "epoch": 0.04809408799747979, + "grad_norm": 0.5458546280860901, + "learning_rate": 9.601677148846961e-05, + "loss": 0.2338, + "step": 229 + }, + { + "epoch": 0.048304105848997164, + "grad_norm": 0.543692409992218, + "learning_rate": 9.643605870020965e-05, + "loss": 0.2058, + "step": 230 + }, + { + "epoch": 0.04851412370051454, + "grad_norm": 0.8071588277816772, + "learning_rate": 9.685534591194969e-05, + "loss": 0.2658, + "step": 231 + }, + { + "epoch": 0.048724141552031926, + "grad_norm": 0.6677277088165283, + "learning_rate": 9.727463312368972e-05, + "loss": 0.3603, + "step": 232 + }, + { + "epoch": 0.0489341594035493, + "grad_norm": 0.6699347496032715, + "learning_rate": 9.769392033542977e-05, + "loss": 0.2474, + "step": 233 + }, + { + "epoch": 0.04914417725506668, + "grad_norm": 0.47732773423194885, + "learning_rate": 9.811320754716981e-05, + "loss": 0.1856, + "step": 234 + }, + { + "epoch": 0.04935419510658406, + "grad_norm": 0.738476037979126, + "learning_rate": 9.853249475890985e-05, + "loss": 0.2443, + "step": 235 + }, + { + "epoch": 0.04956421295810144, + "grad_norm": 0.6604114174842834, + "learning_rate": 9.89517819706499e-05, + "loss": 0.2656, + "step": 236 + }, + { + "epoch": 0.04977423080961882, + "grad_norm": 0.6403035521507263, + "learning_rate": 9.937106918238994e-05, + "loss": 0.2248, + "step": 237 + }, + { + "epoch": 0.049984248661136196, + "grad_norm": 0.7960561513900757, + "learning_rate": 9.979035639412998e-05, + "loss": 0.5721, + "step": 238 + }, + { + "epoch": 0.050194266512653574, + "grad_norm": 0.7372507452964783, + "learning_rate": 0.00010020964360587002, + "loss": 0.263, + "step": 239 + }, + { + "epoch": 0.05040428436417096, + "grad_norm": 0.5040899515151978, + "learning_rate": 0.00010062893081761007, + "loss": 0.2005, + "step": 240 + }, + { + "epoch": 0.050614302215688335, + "grad_norm": 0.5214698314666748, + "learning_rate": 0.00010104821802935012, + "loss": 0.1686, + "step": 241 + }, + { + "epoch": 0.05082432006720571, + "grad_norm": 0.5759347677230835, + "learning_rate": 0.00010146750524109014, + "loss": 0.213, + "step": 242 + }, + { + "epoch": 0.05103433791872309, + "grad_norm": 0.5980076789855957, + "learning_rate": 0.0001018867924528302, + "loss": 0.2073, + "step": 243 + }, + { + "epoch": 0.05124435577024047, + "grad_norm": 0.4733896851539612, + "learning_rate": 0.00010230607966457025, + "loss": 0.2275, + "step": 244 + }, + { + "epoch": 0.05145437362175785, + "grad_norm": 0.597186803817749, + "learning_rate": 0.00010272536687631027, + "loss": 0.2205, + "step": 245 + }, + { + "epoch": 0.05166439147327523, + "grad_norm": 0.7414665818214417, + "learning_rate": 0.00010314465408805032, + "loss": 0.2796, + "step": 246 + }, + { + "epoch": 0.051874409324792606, + "grad_norm": 0.5712472200393677, + "learning_rate": 0.00010356394129979036, + "loss": 0.2343, + "step": 247 + }, + { + "epoch": 0.05208442717630998, + "grad_norm": 0.7146592140197754, + "learning_rate": 0.0001039832285115304, + "loss": 0.2339, + "step": 248 + }, + { + "epoch": 0.05229444502782737, + "grad_norm": 0.5384387373924255, + "learning_rate": 0.00010440251572327044, + "loss": 0.3013, + "step": 249 + }, + { + "epoch": 0.052504462879344745, + "grad_norm": 0.6523765921592712, + "learning_rate": 0.00010482180293501049, + "loss": 0.2295, + "step": 250 + }, + { + "epoch": 0.05271448073086212, + "grad_norm": 0.5877966284751892, + "learning_rate": 0.00010524109014675052, + "loss": 0.3045, + "step": 251 + }, + { + "epoch": 0.0529244985823795, + "grad_norm": 0.5458659529685974, + "learning_rate": 0.00010566037735849057, + "loss": 0.3458, + "step": 252 + }, + { + "epoch": 0.05313451643389688, + "grad_norm": 0.6964541673660278, + "learning_rate": 0.00010607966457023062, + "loss": 0.3715, + "step": 253 + }, + { + "epoch": 0.05334453428541426, + "grad_norm": 0.4040902853012085, + "learning_rate": 0.00010649895178197064, + "loss": 0.1554, + "step": 254 + }, + { + "epoch": 0.05355455213693164, + "grad_norm": 0.7013939023017883, + "learning_rate": 0.0001069182389937107, + "loss": 0.4111, + "step": 255 + }, + { + "epoch": 0.053764569988449015, + "grad_norm": 0.622279942035675, + "learning_rate": 0.00010733752620545075, + "loss": 0.2715, + "step": 256 + }, + { + "epoch": 0.0539745878399664, + "grad_norm": 0.6869280934333801, + "learning_rate": 0.00010775681341719077, + "loss": 0.3444, + "step": 257 + }, + { + "epoch": 0.05418460569148378, + "grad_norm": 0.6003340482711792, + "learning_rate": 0.00010817610062893082, + "loss": 0.261, + "step": 258 + }, + { + "epoch": 0.054394623543001154, + "grad_norm": 0.4238702356815338, + "learning_rate": 0.00010859538784067087, + "loss": 0.2615, + "step": 259 + }, + { + "epoch": 0.05460464139451853, + "grad_norm": 0.5799490809440613, + "learning_rate": 0.0001090146750524109, + "loss": 0.3293, + "step": 260 + }, + { + "epoch": 0.054814659246035916, + "grad_norm": 0.6378821134567261, + "learning_rate": 0.00010943396226415095, + "loss": 0.3184, + "step": 261 + }, + { + "epoch": 0.05502467709755329, + "grad_norm": 0.540050745010376, + "learning_rate": 0.000109853249475891, + "loss": 0.262, + "step": 262 + }, + { + "epoch": 0.05523469494907067, + "grad_norm": 0.551968514919281, + "learning_rate": 0.00011027253668763103, + "loss": 0.3183, + "step": 263 + }, + { + "epoch": 0.05544471280058805, + "grad_norm": 0.6024414896965027, + "learning_rate": 0.00011069182389937108, + "loss": 0.2883, + "step": 264 + }, + { + "epoch": 0.05565473065210543, + "grad_norm": 0.6821377277374268, + "learning_rate": 0.00011111111111111112, + "loss": 0.3178, + "step": 265 + }, + { + "epoch": 0.05586474850362281, + "grad_norm": 0.6535030603408813, + "learning_rate": 0.00011153039832285115, + "loss": 0.2091, + "step": 266 + }, + { + "epoch": 0.056074766355140186, + "grad_norm": 0.5255767703056335, + "learning_rate": 0.00011194968553459119, + "loss": 0.1802, + "step": 267 + }, + { + "epoch": 0.056284784206657563, + "grad_norm": 0.59894859790802, + "learning_rate": 0.00011236897274633124, + "loss": 0.2526, + "step": 268 + }, + { + "epoch": 0.05649480205817495, + "grad_norm": 0.6401522159576416, + "learning_rate": 0.00011278825995807127, + "loss": 0.2462, + "step": 269 + }, + { + "epoch": 0.056704819909692325, + "grad_norm": 0.6755663752555847, + "learning_rate": 0.00011320754716981132, + "loss": 0.1973, + "step": 270 + }, + { + "epoch": 0.0569148377612097, + "grad_norm": 0.5885902643203735, + "learning_rate": 0.00011362683438155137, + "loss": 0.2111, + "step": 271 + }, + { + "epoch": 0.05712485561272708, + "grad_norm": 0.48771098256111145, + "learning_rate": 0.0001140461215932914, + "loss": 0.2409, + "step": 272 + }, + { + "epoch": 0.057334873464244464, + "grad_norm": 0.5513054132461548, + "learning_rate": 0.00011446540880503145, + "loss": 0.1883, + "step": 273 + }, + { + "epoch": 0.05754489131576184, + "grad_norm": 0.43761372566223145, + "learning_rate": 0.0001148846960167715, + "loss": 0.228, + "step": 274 + }, + { + "epoch": 0.05775490916727922, + "grad_norm": 0.5232881307601929, + "learning_rate": 0.00011530398322851152, + "loss": 0.1592, + "step": 275 + }, + { + "epoch": 0.057964927018796596, + "grad_norm": 0.5873312950134277, + "learning_rate": 0.00011572327044025158, + "loss": 0.1972, + "step": 276 + }, + { + "epoch": 0.05817494487031398, + "grad_norm": 0.5464483499526978, + "learning_rate": 0.00011614255765199163, + "loss": 0.1502, + "step": 277 + }, + { + "epoch": 0.05838496272183136, + "grad_norm": 0.6480989456176758, + "learning_rate": 0.00011656184486373165, + "loss": 0.2349, + "step": 278 + }, + { + "epoch": 0.058594980573348734, + "grad_norm": 0.41417571902275085, + "learning_rate": 0.0001169811320754717, + "loss": 0.2097, + "step": 279 + }, + { + "epoch": 0.05880499842486611, + "grad_norm": 0.8272523880004883, + "learning_rate": 0.00011740041928721176, + "loss": 0.2676, + "step": 280 + }, + { + "epoch": 0.059015016276383496, + "grad_norm": 0.6363915205001831, + "learning_rate": 0.0001178197064989518, + "loss": 0.3084, + "step": 281 + }, + { + "epoch": 0.05922503412790087, + "grad_norm": 0.6411394476890564, + "learning_rate": 0.00011823899371069183, + "loss": 0.244, + "step": 282 + }, + { + "epoch": 0.05943505197941825, + "grad_norm": 0.9145995378494263, + "learning_rate": 0.00011865828092243187, + "loss": 0.298, + "step": 283 + }, + { + "epoch": 0.05964506983093563, + "grad_norm": 0.7248232960700989, + "learning_rate": 0.00011907756813417192, + "loss": 0.2438, + "step": 284 + }, + { + "epoch": 0.059855087682453005, + "grad_norm": 0.4901827573776245, + "learning_rate": 0.00011949685534591195, + "loss": 0.1903, + "step": 285 + }, + { + "epoch": 0.06006510553397039, + "grad_norm": 0.5104687809944153, + "learning_rate": 0.000119916142557652, + "loss": 0.2014, + "step": 286 + }, + { + "epoch": 0.06027512338548777, + "grad_norm": 0.5063393712043762, + "learning_rate": 0.00012033542976939205, + "loss": 0.212, + "step": 287 + }, + { + "epoch": 0.060485141237005144, + "grad_norm": 0.6044209599494934, + "learning_rate": 0.00012075471698113207, + "loss": 0.3138, + "step": 288 + }, + { + "epoch": 0.06069515908852252, + "grad_norm": 0.5843082666397095, + "learning_rate": 0.00012117400419287213, + "loss": 0.2199, + "step": 289 + }, + { + "epoch": 0.060905176940039905, + "grad_norm": 0.4589983820915222, + "learning_rate": 0.00012159329140461218, + "loss": 0.2222, + "step": 290 + }, + { + "epoch": 0.06111519479155728, + "grad_norm": 0.4094448983669281, + "learning_rate": 0.0001220125786163522, + "loss": 0.1286, + "step": 291 + }, + { + "epoch": 0.06132521264307466, + "grad_norm": 0.42624855041503906, + "learning_rate": 0.00012243186582809224, + "loss": 0.2719, + "step": 292 + }, + { + "epoch": 0.06153523049459204, + "grad_norm": 0.5488569736480713, + "learning_rate": 0.0001228511530398323, + "loss": 0.2588, + "step": 293 + }, + { + "epoch": 0.06174524834610942, + "grad_norm": 0.6029438972473145, + "learning_rate": 0.00012327044025157232, + "loss": 0.3182, + "step": 294 + }, + { + "epoch": 0.0619552661976268, + "grad_norm": 0.49090123176574707, + "learning_rate": 0.00012368972746331237, + "loss": 0.2192, + "step": 295 + }, + { + "epoch": 0.062165284049144176, + "grad_norm": 0.7553131580352783, + "learning_rate": 0.00012410901467505242, + "loss": 0.2932, + "step": 296 + }, + { + "epoch": 0.06237530190066155, + "grad_norm": 0.6839373707771301, + "learning_rate": 0.00012452830188679244, + "loss": 0.1896, + "step": 297 + }, + { + "epoch": 0.06258531975217893, + "grad_norm": 0.5805861949920654, + "learning_rate": 0.0001249475890985325, + "loss": 0.2613, + "step": 298 + }, + { + "epoch": 0.06279533760369631, + "grad_norm": 0.4247298836708069, + "learning_rate": 0.00012536687631027255, + "loss": 0.1701, + "step": 299 + }, + { + "epoch": 0.0630053554552137, + "grad_norm": 0.6167422533035278, + "learning_rate": 0.00012578616352201257, + "loss": 0.2919, + "step": 300 + }, + { + "epoch": 0.06321537330673108, + "grad_norm": 0.5140472054481506, + "learning_rate": 0.00012620545073375262, + "loss": 0.2204, + "step": 301 + }, + { + "epoch": 0.06342539115824845, + "grad_norm": 0.48360675573349, + "learning_rate": 0.00012662473794549268, + "loss": 0.2625, + "step": 302 + }, + { + "epoch": 0.06363540900976583, + "grad_norm": 0.5805841684341431, + "learning_rate": 0.0001270440251572327, + "loss": 0.2659, + "step": 303 + }, + { + "epoch": 0.06384542686128321, + "grad_norm": 0.4108704924583435, + "learning_rate": 0.00012746331236897275, + "loss": 0.1757, + "step": 304 + }, + { + "epoch": 0.06405544471280059, + "grad_norm": 0.4739980697631836, + "learning_rate": 0.0001278825995807128, + "loss": 0.2413, + "step": 305 + }, + { + "epoch": 0.06426546256431796, + "grad_norm": 0.6421864032745361, + "learning_rate": 0.00012830188679245283, + "loss": 0.3373, + "step": 306 + }, + { + "epoch": 0.06447548041583534, + "grad_norm": 0.6035056114196777, + "learning_rate": 0.00012872117400419288, + "loss": 0.1632, + "step": 307 + }, + { + "epoch": 0.06468549826735273, + "grad_norm": 0.5946957468986511, + "learning_rate": 0.00012914046121593293, + "loss": 0.2797, + "step": 308 + }, + { + "epoch": 0.06489551611887011, + "grad_norm": 0.5636250972747803, + "learning_rate": 0.00012955974842767296, + "loss": 0.3484, + "step": 309 + }, + { + "epoch": 0.06510553397038749, + "grad_norm": 0.5175902843475342, + "learning_rate": 0.000129979035639413, + "loss": 0.2306, + "step": 310 + }, + { + "epoch": 0.06531555182190486, + "grad_norm": 0.39933711290359497, + "learning_rate": 0.00013039832285115306, + "loss": 0.2018, + "step": 311 + }, + { + "epoch": 0.06552556967342224, + "grad_norm": 0.6203914284706116, + "learning_rate": 0.00013081761006289308, + "loss": 0.2519, + "step": 312 + }, + { + "epoch": 0.06573558752493962, + "grad_norm": 0.6847423911094666, + "learning_rate": 0.00013123689727463314, + "loss": 0.2125, + "step": 313 + }, + { + "epoch": 0.065945605376457, + "grad_norm": 0.5958030223846436, + "learning_rate": 0.0001316561844863732, + "loss": 0.2019, + "step": 314 + }, + { + "epoch": 0.06615562322797437, + "grad_norm": 0.4878827631473541, + "learning_rate": 0.0001320754716981132, + "loss": 0.2286, + "step": 315 + }, + { + "epoch": 0.06636564107949176, + "grad_norm": 0.5386853814125061, + "learning_rate": 0.00013249475890985326, + "loss": 0.2349, + "step": 316 + }, + { + "epoch": 0.06657565893100914, + "grad_norm": 0.5583687424659729, + "learning_rate": 0.00013291404612159332, + "loss": 0.273, + "step": 317 + }, + { + "epoch": 0.06678567678252652, + "grad_norm": 0.503384530544281, + "learning_rate": 0.00013333333333333334, + "loss": 0.2718, + "step": 318 + }, + { + "epoch": 0.0669956946340439, + "grad_norm": 0.6256868839263916, + "learning_rate": 0.0001337526205450734, + "loss": 0.2176, + "step": 319 + }, + { + "epoch": 0.06720571248556127, + "grad_norm": 0.4585525095462799, + "learning_rate": 0.00013417190775681344, + "loss": 0.1671, + "step": 320 + }, + { + "epoch": 0.06741573033707865, + "grad_norm": 0.52493816614151, + "learning_rate": 0.00013459119496855347, + "loss": 0.2129, + "step": 321 + }, + { + "epoch": 0.06762574818859603, + "grad_norm": 0.7206648588180542, + "learning_rate": 0.00013501048218029352, + "loss": 0.1872, + "step": 322 + }, + { + "epoch": 0.0678357660401134, + "grad_norm": 0.5732535123825073, + "learning_rate": 0.00013542976939203354, + "loss": 0.2131, + "step": 323 + }, + { + "epoch": 0.0680457838916308, + "grad_norm": 0.5404482483863831, + "learning_rate": 0.0001358490566037736, + "loss": 0.1472, + "step": 324 + }, + { + "epoch": 0.06825580174314817, + "grad_norm": 0.7235817313194275, + "learning_rate": 0.00013626834381551362, + "loss": 0.2404, + "step": 325 + }, + { + "epoch": 0.06846581959466555, + "grad_norm": 0.4254133999347687, + "learning_rate": 0.00013668763102725367, + "loss": 0.2133, + "step": 326 + }, + { + "epoch": 0.06867583744618293, + "grad_norm": 0.4804741144180298, + "learning_rate": 0.0001371069182389937, + "loss": 0.1776, + "step": 327 + }, + { + "epoch": 0.0688858552977003, + "grad_norm": 0.4900747537612915, + "learning_rate": 0.00013752620545073375, + "loss": 0.1958, + "step": 328 + }, + { + "epoch": 0.06909587314921768, + "grad_norm": 0.576337456703186, + "learning_rate": 0.0001379454926624738, + "loss": 0.2318, + "step": 329 + }, + { + "epoch": 0.06930589100073506, + "grad_norm": 0.5610971450805664, + "learning_rate": 0.00013836477987421382, + "loss": 0.2631, + "step": 330 + }, + { + "epoch": 0.06951590885225244, + "grad_norm": 0.6010019779205322, + "learning_rate": 0.00013878406708595388, + "loss": 0.2201, + "step": 331 + }, + { + "epoch": 0.06972592670376981, + "grad_norm": 0.4658229947090149, + "learning_rate": 0.00013920335429769393, + "loss": 0.1602, + "step": 332 + }, + { + "epoch": 0.0699359445552872, + "grad_norm": 0.5411532521247864, + "learning_rate": 0.00013962264150943395, + "loss": 0.1942, + "step": 333 + }, + { + "epoch": 0.07014596240680458, + "grad_norm": 0.875629186630249, + "learning_rate": 0.000140041928721174, + "loss": 0.2337, + "step": 334 + }, + { + "epoch": 0.07035598025832196, + "grad_norm": 0.5620985627174377, + "learning_rate": 0.00014046121593291406, + "loss": 0.2641, + "step": 335 + }, + { + "epoch": 0.07056599810983934, + "grad_norm": 0.8389297723770142, + "learning_rate": 0.00014088050314465408, + "loss": 0.3069, + "step": 336 + }, + { + "epoch": 0.07077601596135671, + "grad_norm": 0.4745865762233734, + "learning_rate": 0.00014129979035639413, + "loss": 0.1626, + "step": 337 + }, + { + "epoch": 0.07098603381287409, + "grad_norm": 0.4688374996185303, + "learning_rate": 0.00014171907756813418, + "loss": 0.1327, + "step": 338 + }, + { + "epoch": 0.07119605166439147, + "grad_norm": 0.4219890832901001, + "learning_rate": 0.0001421383647798742, + "loss": 0.142, + "step": 339 + }, + { + "epoch": 0.07140606951590885, + "grad_norm": 0.700579047203064, + "learning_rate": 0.00014255765199161426, + "loss": 0.179, + "step": 340 + }, + { + "epoch": 0.07161608736742624, + "grad_norm": 0.36132583022117615, + "learning_rate": 0.0001429769392033543, + "loss": 0.1283, + "step": 341 + }, + { + "epoch": 0.07182610521894361, + "grad_norm": 0.9342030882835388, + "learning_rate": 0.00014339622641509434, + "loss": 0.3873, + "step": 342 + }, + { + "epoch": 0.07203612307046099, + "grad_norm": 0.6389639973640442, + "learning_rate": 0.0001438155136268344, + "loss": 0.2631, + "step": 343 + }, + { + "epoch": 0.07224614092197837, + "grad_norm": 0.7687662243843079, + "learning_rate": 0.00014423480083857444, + "loss": 0.2002, + "step": 344 + }, + { + "epoch": 0.07245615877349575, + "grad_norm": 0.6517148613929749, + "learning_rate": 0.00014465408805031446, + "loss": 0.2454, + "step": 345 + }, + { + "epoch": 0.07266617662501312, + "grad_norm": 0.5010355710983276, + "learning_rate": 0.00014507337526205452, + "loss": 0.1541, + "step": 346 + }, + { + "epoch": 0.0728761944765305, + "grad_norm": 0.49431943893432617, + "learning_rate": 0.00014549266247379457, + "loss": 0.213, + "step": 347 + }, + { + "epoch": 0.07308621232804788, + "grad_norm": 0.6462149024009705, + "learning_rate": 0.0001459119496855346, + "loss": 0.2642, + "step": 348 + }, + { + "epoch": 0.07329623017956527, + "grad_norm": 0.5412748456001282, + "learning_rate": 0.00014633123689727464, + "loss": 0.187, + "step": 349 + }, + { + "epoch": 0.07350624803108265, + "grad_norm": 0.6458069682121277, + "learning_rate": 0.0001467505241090147, + "loss": 0.3224, + "step": 350 + }, + { + "epoch": 0.07371626588260002, + "grad_norm": 0.4398702383041382, + "learning_rate": 0.00014716981132075472, + "loss": 0.2419, + "step": 351 + }, + { + "epoch": 0.0739262837341174, + "grad_norm": 0.47583240270614624, + "learning_rate": 0.00014758909853249477, + "loss": 0.2259, + "step": 352 + }, + { + "epoch": 0.07413630158563478, + "grad_norm": 0.5058132410049438, + "learning_rate": 0.00014800838574423482, + "loss": 0.3733, + "step": 353 + }, + { + "epoch": 0.07434631943715216, + "grad_norm": 0.4765789210796356, + "learning_rate": 0.00014842767295597485, + "loss": 0.3204, + "step": 354 + }, + { + "epoch": 0.07455633728866953, + "grad_norm": 0.4549868106842041, + "learning_rate": 0.0001488469601677149, + "loss": 0.2788, + "step": 355 + }, + { + "epoch": 0.07476635514018691, + "grad_norm": 0.44640183448791504, + "learning_rate": 0.00014926624737945495, + "loss": 0.225, + "step": 356 + }, + { + "epoch": 0.0749763729917043, + "grad_norm": 0.5040209293365479, + "learning_rate": 0.00014968553459119498, + "loss": 0.288, + "step": 357 + }, + { + "epoch": 0.07518639084322168, + "grad_norm": 0.7681525349617004, + "learning_rate": 0.00015010482180293503, + "loss": 0.2677, + "step": 358 + }, + { + "epoch": 0.07539640869473906, + "grad_norm": 0.3658473491668701, + "learning_rate": 0.00015052410901467505, + "loss": 0.1944, + "step": 359 + }, + { + "epoch": 0.07560642654625643, + "grad_norm": 0.5071917772293091, + "learning_rate": 0.0001509433962264151, + "loss": 0.1937, + "step": 360 + }, + { + "epoch": 0.07581644439777381, + "grad_norm": 0.5669259428977966, + "learning_rate": 0.00015136268343815513, + "loss": 0.2755, + "step": 361 + }, + { + "epoch": 0.07602646224929119, + "grad_norm": 0.5721021294593811, + "learning_rate": 0.00015178197064989518, + "loss": 0.2233, + "step": 362 + }, + { + "epoch": 0.07623648010080857, + "grad_norm": 0.5776953101158142, + "learning_rate": 0.00015220125786163523, + "loss": 0.2918, + "step": 363 + }, + { + "epoch": 0.07644649795232594, + "grad_norm": 0.7863572239875793, + "learning_rate": 0.00015262054507337526, + "loss": 0.3225, + "step": 364 + }, + { + "epoch": 0.07665651580384332, + "grad_norm": 0.7403888702392578, + "learning_rate": 0.0001530398322851153, + "loss": 0.1836, + "step": 365 + }, + { + "epoch": 0.07686653365536071, + "grad_norm": 0.7344810962677002, + "learning_rate": 0.00015345911949685536, + "loss": 0.342, + "step": 366 + }, + { + "epoch": 0.07707655150687809, + "grad_norm": 0.6341666579246521, + "learning_rate": 0.00015387840670859538, + "loss": 0.2222, + "step": 367 + }, + { + "epoch": 0.07728656935839547, + "grad_norm": 0.7821016907691956, + "learning_rate": 0.00015429769392033544, + "loss": 0.3106, + "step": 368 + }, + { + "epoch": 0.07749658720991284, + "grad_norm": 0.5648399591445923, + "learning_rate": 0.0001547169811320755, + "loss": 0.1907, + "step": 369 + }, + { + "epoch": 0.07770660506143022, + "grad_norm": 0.5853981971740723, + "learning_rate": 0.0001551362683438155, + "loss": 0.1873, + "step": 370 + }, + { + "epoch": 0.0779166229129476, + "grad_norm": 0.6429926753044128, + "learning_rate": 0.00015555555555555556, + "loss": 0.177, + "step": 371 + }, + { + "epoch": 0.07812664076446497, + "grad_norm": 0.5365523099899292, + "learning_rate": 0.00015597484276729561, + "loss": 0.2283, + "step": 372 + }, + { + "epoch": 0.07833665861598235, + "grad_norm": 0.4820340871810913, + "learning_rate": 0.00015639412997903564, + "loss": 0.2179, + "step": 373 + }, + { + "epoch": 0.07854667646749974, + "grad_norm": 0.5231903195381165, + "learning_rate": 0.0001568134171907757, + "loss": 0.2165, + "step": 374 + }, + { + "epoch": 0.07875669431901712, + "grad_norm": 0.6309874057769775, + "learning_rate": 0.00015723270440251574, + "loss": 0.2511, + "step": 375 + }, + { + "epoch": 0.0789667121705345, + "grad_norm": 0.6248964667320251, + "learning_rate": 0.00015765199161425577, + "loss": 0.192, + "step": 376 + }, + { + "epoch": 0.07917673002205187, + "grad_norm": 0.4089469611644745, + "learning_rate": 0.00015807127882599582, + "loss": 0.1674, + "step": 377 + }, + { + "epoch": 0.07938674787356925, + "grad_norm": 0.5720129609107971, + "learning_rate": 0.00015849056603773587, + "loss": 0.2571, + "step": 378 + }, + { + "epoch": 0.07959676572508663, + "grad_norm": 0.505424976348877, + "learning_rate": 0.0001589098532494759, + "loss": 0.2189, + "step": 379 + }, + { + "epoch": 0.079806783576604, + "grad_norm": 0.4483712315559387, + "learning_rate": 0.00015932914046121595, + "loss": 0.2959, + "step": 380 + }, + { + "epoch": 0.08001680142812138, + "grad_norm": 0.6313521862030029, + "learning_rate": 0.000159748427672956, + "loss": 0.237, + "step": 381 + }, + { + "epoch": 0.08022681927963878, + "grad_norm": 0.530503511428833, + "learning_rate": 0.00016016771488469602, + "loss": 0.1922, + "step": 382 + }, + { + "epoch": 0.08043683713115615, + "grad_norm": 0.65278160572052, + "learning_rate": 0.00016058700209643607, + "loss": 0.2714, + "step": 383 + }, + { + "epoch": 0.08064685498267353, + "grad_norm": 0.6226363182067871, + "learning_rate": 0.00016100628930817613, + "loss": 0.1913, + "step": 384 + }, + { + "epoch": 0.08085687283419091, + "grad_norm": 0.6313908696174622, + "learning_rate": 0.00016142557651991615, + "loss": 0.2457, + "step": 385 + }, + { + "epoch": 0.08106689068570828, + "grad_norm": 0.5335121750831604, + "learning_rate": 0.0001618448637316562, + "loss": 0.2537, + "step": 386 + }, + { + "epoch": 0.08127690853722566, + "grad_norm": 0.7243566513061523, + "learning_rate": 0.00016226415094339625, + "loss": 0.2125, + "step": 387 + }, + { + "epoch": 0.08148692638874304, + "grad_norm": 0.5874237418174744, + "learning_rate": 0.00016268343815513628, + "loss": 0.2104, + "step": 388 + }, + { + "epoch": 0.08169694424026042, + "grad_norm": 0.5792878866195679, + "learning_rate": 0.00016310272536687633, + "loss": 0.198, + "step": 389 + }, + { + "epoch": 0.08190696209177781, + "grad_norm": 0.5439760088920593, + "learning_rate": 0.00016352201257861635, + "loss": 0.1895, + "step": 390 + }, + { + "epoch": 0.08211697994329518, + "grad_norm": 0.6903837323188782, + "learning_rate": 0.0001639412997903564, + "loss": 0.2587, + "step": 391 + }, + { + "epoch": 0.08232699779481256, + "grad_norm": 0.6126405596733093, + "learning_rate": 0.00016436058700209643, + "loss": 0.2232, + "step": 392 + }, + { + "epoch": 0.08253701564632994, + "grad_norm": 0.9248547554016113, + "learning_rate": 0.00016477987421383648, + "loss": 0.267, + "step": 393 + }, + { + "epoch": 0.08274703349784732, + "grad_norm": 0.6509301066398621, + "learning_rate": 0.0001651991614255765, + "loss": 0.2109, + "step": 394 + }, + { + "epoch": 0.0829570513493647, + "grad_norm": 0.5985137820243835, + "learning_rate": 0.00016561844863731656, + "loss": 0.1783, + "step": 395 + }, + { + "epoch": 0.08316706920088207, + "grad_norm": 0.6711693406105042, + "learning_rate": 0.0001660377358490566, + "loss": 0.2115, + "step": 396 + }, + { + "epoch": 0.08337708705239945, + "grad_norm": 0.4494445025920868, + "learning_rate": 0.00016645702306079664, + "loss": 0.1486, + "step": 397 + }, + { + "epoch": 0.08358710490391684, + "grad_norm": 0.5083547830581665, + "learning_rate": 0.0001668763102725367, + "loss": 0.2317, + "step": 398 + }, + { + "epoch": 0.08379712275543422, + "grad_norm": 0.7552763819694519, + "learning_rate": 0.00016729559748427674, + "loss": 0.236, + "step": 399 + }, + { + "epoch": 0.0840071406069516, + "grad_norm": 0.7656201124191284, + "learning_rate": 0.00016771488469601676, + "loss": 0.2732, + "step": 400 + }, + { + "epoch": 0.08421715845846897, + "grad_norm": 0.3850518465042114, + "learning_rate": 0.00016813417190775681, + "loss": 0.3322, + "step": 401 + }, + { + "epoch": 0.08442717630998635, + "grad_norm": 0.5610989928245544, + "learning_rate": 0.00016855345911949687, + "loss": 0.2844, + "step": 402 + }, + { + "epoch": 0.08463719416150373, + "grad_norm": 0.7500874400138855, + "learning_rate": 0.0001689727463312369, + "loss": 0.3651, + "step": 403 + }, + { + "epoch": 0.0848472120130211, + "grad_norm": 0.45343145728111267, + "learning_rate": 0.00016939203354297694, + "loss": 0.2174, + "step": 404 + }, + { + "epoch": 0.08505722986453848, + "grad_norm": 0.6427581310272217, + "learning_rate": 0.000169811320754717, + "loss": 0.2748, + "step": 405 + }, + { + "epoch": 0.08526724771605586, + "grad_norm": 0.64598149061203, + "learning_rate": 0.00017023060796645702, + "loss": 0.2912, + "step": 406 + }, + { + "epoch": 0.08547726556757325, + "grad_norm": 0.49100759625434875, + "learning_rate": 0.00017064989517819707, + "loss": 0.2582, + "step": 407 + }, + { + "epoch": 0.08568728341909063, + "grad_norm": 0.5637136101722717, + "learning_rate": 0.00017106918238993712, + "loss": 0.254, + "step": 408 + }, + { + "epoch": 0.085897301270608, + "grad_norm": 0.5617924928665161, + "learning_rate": 0.00017148846960167715, + "loss": 0.2043, + "step": 409 + }, + { + "epoch": 0.08610731912212538, + "grad_norm": 0.5467379093170166, + "learning_rate": 0.0001719077568134172, + "loss": 0.2363, + "step": 410 + }, + { + "epoch": 0.08631733697364276, + "grad_norm": 0.6882631778717041, + "learning_rate": 0.00017232704402515725, + "loss": 0.2341, + "step": 411 + }, + { + "epoch": 0.08652735482516014, + "grad_norm": 0.40710386633872986, + "learning_rate": 0.00017274633123689727, + "loss": 0.1952, + "step": 412 + }, + { + "epoch": 0.08673737267667751, + "grad_norm": 0.688685953617096, + "learning_rate": 0.00017316561844863733, + "loss": 0.3588, + "step": 413 + }, + { + "epoch": 0.08694739052819489, + "grad_norm": 0.7739083170890808, + "learning_rate": 0.00017358490566037738, + "loss": 0.1897, + "step": 414 + }, + { + "epoch": 0.08715740837971228, + "grad_norm": 0.45127734541893005, + "learning_rate": 0.0001740041928721174, + "loss": 0.2282, + "step": 415 + }, + { + "epoch": 0.08736742623122966, + "grad_norm": 0.6713837385177612, + "learning_rate": 0.00017442348008385745, + "loss": 0.3395, + "step": 416 + }, + { + "epoch": 0.08757744408274704, + "grad_norm": 0.5886412858963013, + "learning_rate": 0.0001748427672955975, + "loss": 0.1673, + "step": 417 + }, + { + "epoch": 0.08778746193426441, + "grad_norm": 0.6254634261131287, + "learning_rate": 0.00017526205450733753, + "loss": 0.2392, + "step": 418 + }, + { + "epoch": 0.08799747978578179, + "grad_norm": 0.5936654806137085, + "learning_rate": 0.00017568134171907758, + "loss": 0.1817, + "step": 419 + }, + { + "epoch": 0.08820749763729917, + "grad_norm": 0.6107873320579529, + "learning_rate": 0.00017610062893081763, + "loss": 0.2877, + "step": 420 + }, + { + "epoch": 0.08841751548881654, + "grad_norm": 0.583984911441803, + "learning_rate": 0.00017651991614255766, + "loss": 0.2382, + "step": 421 + }, + { + "epoch": 0.08862753334033392, + "grad_norm": 0.6411318778991699, + "learning_rate": 0.0001769392033542977, + "loss": 0.2528, + "step": 422 + }, + { + "epoch": 0.08883755119185131, + "grad_norm": 0.5407703518867493, + "learning_rate": 0.00017735849056603776, + "loss": 0.217, + "step": 423 + }, + { + "epoch": 0.08904756904336869, + "grad_norm": 0.5086292028427124, + "learning_rate": 0.00017777777777777779, + "loss": 0.1703, + "step": 424 + }, + { + "epoch": 0.08925758689488607, + "grad_norm": 0.534488320350647, + "learning_rate": 0.00017819706498951784, + "loss": 0.4212, + "step": 425 + }, + { + "epoch": 0.08946760474640345, + "grad_norm": 0.5869336724281311, + "learning_rate": 0.00017861635220125786, + "loss": 0.3634, + "step": 426 + }, + { + "epoch": 0.08967762259792082, + "grad_norm": 0.5784481763839722, + "learning_rate": 0.00017903563941299791, + "loss": 0.2076, + "step": 427 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 0.467438668012619, + "learning_rate": 0.00017945492662473794, + "loss": 0.1948, + "step": 428 + }, + { + "epoch": 0.09009765830095558, + "grad_norm": 0.8514359593391418, + "learning_rate": 0.000179874213836478, + "loss": 0.2695, + "step": 429 + }, + { + "epoch": 0.09030767615247295, + "grad_norm": 0.630066990852356, + "learning_rate": 0.00018029350104821801, + "loss": 0.2624, + "step": 430 + }, + { + "epoch": 0.09051769400399035, + "grad_norm": 0.6442775130271912, + "learning_rate": 0.00018071278825995807, + "loss": 0.2555, + "step": 431 + }, + { + "epoch": 0.09072771185550772, + "grad_norm": 0.6193580031394958, + "learning_rate": 0.00018113207547169812, + "loss": 0.2388, + "step": 432 + }, + { + "epoch": 0.0909377297070251, + "grad_norm": 1.108219027519226, + "learning_rate": 0.00018155136268343814, + "loss": 0.2135, + "step": 433 + }, + { + "epoch": 0.09114774755854248, + "grad_norm": 0.666748046875, + "learning_rate": 0.0001819706498951782, + "loss": 0.2402, + "step": 434 + }, + { + "epoch": 0.09135776541005985, + "grad_norm": 0.516096293926239, + "learning_rate": 0.00018238993710691825, + "loss": 0.2022, + "step": 435 + }, + { + "epoch": 0.09156778326157723, + "grad_norm": 0.4976787269115448, + "learning_rate": 0.00018280922431865827, + "loss": 0.1492, + "step": 436 + }, + { + "epoch": 0.09177780111309461, + "grad_norm": 0.596254289150238, + "learning_rate": 0.00018322851153039832, + "loss": 0.1926, + "step": 437 + }, + { + "epoch": 0.09198781896461199, + "grad_norm": 0.4079163670539856, + "learning_rate": 0.00018364779874213837, + "loss": 0.219, + "step": 438 + }, + { + "epoch": 0.09219783681612938, + "grad_norm": 0.4968511164188385, + "learning_rate": 0.00018406708595387843, + "loss": 0.2203, + "step": 439 + }, + { + "epoch": 0.09240785466764675, + "grad_norm": 0.5749839544296265, + "learning_rate": 0.00018448637316561845, + "loss": 0.2561, + "step": 440 + }, + { + "epoch": 0.09261787251916413, + "grad_norm": 0.46315014362335205, + "learning_rate": 0.0001849056603773585, + "loss": 0.1608, + "step": 441 + }, + { + "epoch": 0.09282789037068151, + "grad_norm": 0.4630315601825714, + "learning_rate": 0.00018532494758909855, + "loss": 0.1564, + "step": 442 + }, + { + "epoch": 0.09303790822219889, + "grad_norm": 0.5688292384147644, + "learning_rate": 0.00018574423480083858, + "loss": 0.1483, + "step": 443 + }, + { + "epoch": 0.09324792607371626, + "grad_norm": 0.9025551676750183, + "learning_rate": 0.00018616352201257863, + "loss": 0.216, + "step": 444 + }, + { + "epoch": 0.09345794392523364, + "grad_norm": 0.6165971755981445, + "learning_rate": 0.00018658280922431868, + "loss": 0.1852, + "step": 445 + }, + { + "epoch": 0.09366796177675102, + "grad_norm": 0.5040764808654785, + "learning_rate": 0.0001870020964360587, + "loss": 0.1534, + "step": 446 + }, + { + "epoch": 0.0938779796282684, + "grad_norm": 0.6921994686126709, + "learning_rate": 0.00018742138364779876, + "loss": 0.2739, + "step": 447 + }, + { + "epoch": 0.09408799747978579, + "grad_norm": 0.9911003708839417, + "learning_rate": 0.0001878406708595388, + "loss": 0.2148, + "step": 448 + }, + { + "epoch": 0.09429801533130316, + "grad_norm": 0.4098629951477051, + "learning_rate": 0.00018825995807127883, + "loss": 0.1627, + "step": 449 + }, + { + "epoch": 0.09450803318282054, + "grad_norm": 0.5267736315727234, + "learning_rate": 0.00018867924528301889, + "loss": 0.1714, + "step": 450 + }, + { + "epoch": 0.09471805103433792, + "grad_norm": 0.826693058013916, + "learning_rate": 0.00018909853249475894, + "loss": 0.3614, + "step": 451 + }, + { + "epoch": 0.0949280688858553, + "grad_norm": 0.7960173487663269, + "learning_rate": 0.00018951781970649896, + "loss": 0.2831, + "step": 452 + }, + { + "epoch": 0.09513808673737267, + "grad_norm": 0.5408324003219604, + "learning_rate": 0.00018993710691823901, + "loss": 0.2396, + "step": 453 + }, + { + "epoch": 0.09534810458889005, + "grad_norm": 0.5551522374153137, + "learning_rate": 0.00019035639412997907, + "loss": 0.2399, + "step": 454 + }, + { + "epoch": 0.09555812244040743, + "grad_norm": 0.5053918361663818, + "learning_rate": 0.0001907756813417191, + "loss": 0.2054, + "step": 455 + }, + { + "epoch": 0.09576814029192482, + "grad_norm": 0.6408351063728333, + "learning_rate": 0.00019119496855345914, + "loss": 0.2319, + "step": 456 + }, + { + "epoch": 0.0959781581434422, + "grad_norm": 0.6061432361602783, + "learning_rate": 0.0001916142557651992, + "loss": 0.2289, + "step": 457 + }, + { + "epoch": 0.09618817599495957, + "grad_norm": 0.6452487111091614, + "learning_rate": 0.00019203354297693922, + "loss": 0.2787, + "step": 458 + }, + { + "epoch": 0.09639819384647695, + "grad_norm": 0.5427165627479553, + "learning_rate": 0.00019245283018867927, + "loss": 0.3181, + "step": 459 + }, + { + "epoch": 0.09660821169799433, + "grad_norm": 0.5678632259368896, + "learning_rate": 0.0001928721174004193, + "loss": 0.3166, + "step": 460 + }, + { + "epoch": 0.0968182295495117, + "grad_norm": 0.554288923740387, + "learning_rate": 0.00019329140461215935, + "loss": 0.214, + "step": 461 + }, + { + "epoch": 0.09702824740102908, + "grad_norm": 0.7040925621986389, + "learning_rate": 0.00019371069182389937, + "loss": 0.473, + "step": 462 + }, + { + "epoch": 0.09723826525254646, + "grad_norm": 0.6425243020057678, + "learning_rate": 0.00019412997903563942, + "loss": 0.3542, + "step": 463 + }, + { + "epoch": 0.09744828310406385, + "grad_norm": 0.6984371542930603, + "learning_rate": 0.00019454926624737945, + "loss": 0.2165, + "step": 464 + }, + { + "epoch": 0.09765830095558123, + "grad_norm": 0.5204288959503174, + "learning_rate": 0.0001949685534591195, + "loss": 0.232, + "step": 465 + }, + { + "epoch": 0.0978683188070986, + "grad_norm": 0.5688004493713379, + "learning_rate": 0.00019538784067085955, + "loss": 0.3099, + "step": 466 + }, + { + "epoch": 0.09807833665861598, + "grad_norm": 0.4850284159183502, + "learning_rate": 0.00019580712788259957, + "loss": 0.202, + "step": 467 + }, + { + "epoch": 0.09828835451013336, + "grad_norm": 0.5034931302070618, + "learning_rate": 0.00019622641509433963, + "loss": 0.2077, + "step": 468 + }, + { + "epoch": 0.09849837236165074, + "grad_norm": 0.6193839311599731, + "learning_rate": 0.00019664570230607968, + "loss": 0.318, + "step": 469 + }, + { + "epoch": 0.09870839021316812, + "grad_norm": 0.6226887702941895, + "learning_rate": 0.0001970649895178197, + "loss": 0.3538, + "step": 470 + }, + { + "epoch": 0.09891840806468549, + "grad_norm": 0.6102244257926941, + "learning_rate": 0.00019748427672955975, + "loss": 0.2865, + "step": 471 + }, + { + "epoch": 0.09912842591620288, + "grad_norm": 0.6731789112091064, + "learning_rate": 0.0001979035639412998, + "loss": 0.3135, + "step": 472 + }, + { + "epoch": 0.09933844376772026, + "grad_norm": 0.661486029624939, + "learning_rate": 0.00019832285115303983, + "loss": 0.2198, + "step": 473 + }, + { + "epoch": 0.09954846161923764, + "grad_norm": 0.7321748733520508, + "learning_rate": 0.00019874213836477988, + "loss": 0.2557, + "step": 474 + }, + { + "epoch": 0.09975847947075502, + "grad_norm": 0.5708514451980591, + "learning_rate": 0.00019916142557651993, + "loss": 0.2385, + "step": 475 + }, + { + "epoch": 0.09996849732227239, + "grad_norm": 0.8140966892242432, + "learning_rate": 0.00019958071278825996, + "loss": 0.1724, + "step": 476 + }, + { + "epoch": 0.10017851517378977, + "grad_norm": 0.5185543298721313, + "learning_rate": 0.0002, + "loss": 0.192, + "step": 477 + }, + { + "epoch": 0.10038853302530715, + "grad_norm": 0.7630559802055359, + "learning_rate": 0.00019999999396812126, + "loss": 0.3107, + "step": 478 + }, + { + "epoch": 0.10059855087682452, + "grad_norm": 0.5256696939468384, + "learning_rate": 0.00019999997587248573, + "loss": 0.2057, + "step": 479 + }, + { + "epoch": 0.10080856872834192, + "grad_norm": 0.5820131301879883, + "learning_rate": 0.0001999999457130956, + "loss": 0.2444, + "step": 480 + }, + { + "epoch": 0.10101858657985929, + "grad_norm": 0.6161417365074158, + "learning_rate": 0.00019999990348995456, + "loss": 0.1882, + "step": 481 + }, + { + "epoch": 0.10122860443137667, + "grad_norm": 0.5549945831298828, + "learning_rate": 0.00019999984920306764, + "loss": 0.3101, + "step": 482 + }, + { + "epoch": 0.10143862228289405, + "grad_norm": 0.8708590269088745, + "learning_rate": 0.00019999978285244142, + "loss": 0.2377, + "step": 483 + }, + { + "epoch": 0.10164864013441142, + "grad_norm": 0.5110476016998291, + "learning_rate": 0.00019999970443808387, + "loss": 0.1476, + "step": 484 + }, + { + "epoch": 0.1018586579859288, + "grad_norm": 1.1280276775360107, + "learning_rate": 0.0001999996139600045, + "loss": 0.1595, + "step": 485 + }, + { + "epoch": 0.10206867583744618, + "grad_norm": 0.7876203656196594, + "learning_rate": 0.0001999995114182142, + "loss": 0.229, + "step": 486 + }, + { + "epoch": 0.10227869368896356, + "grad_norm": 0.7196666598320007, + "learning_rate": 0.00019999939681272536, + "loss": 0.1838, + "step": 487 + }, + { + "epoch": 0.10248871154048093, + "grad_norm": 0.6737300157546997, + "learning_rate": 0.00019999927014355175, + "loss": 0.1786, + "step": 488 + }, + { + "epoch": 0.10269872939199833, + "grad_norm": 0.7758048176765442, + "learning_rate": 0.0001999991314107087, + "loss": 0.1847, + "step": 489 + }, + { + "epoch": 0.1029087472435157, + "grad_norm": 0.8189213871955872, + "learning_rate": 0.00019999898061421294, + "loss": 0.2842, + "step": 490 + }, + { + "epoch": 0.10311876509503308, + "grad_norm": 0.5789510011672974, + "learning_rate": 0.00019999881775408263, + "loss": 0.2353, + "step": 491 + }, + { + "epoch": 0.10332878294655046, + "grad_norm": 0.808729350566864, + "learning_rate": 0.00019999864283033747, + "loss": 0.2481, + "step": 492 + }, + { + "epoch": 0.10353880079806783, + "grad_norm": 0.587478518486023, + "learning_rate": 0.00019999845584299855, + "loss": 0.1976, + "step": 493 + }, + { + "epoch": 0.10374881864958521, + "grad_norm": 0.7419194579124451, + "learning_rate": 0.00019999825679208839, + "loss": 0.2444, + "step": 494 + }, + { + "epoch": 0.10395883650110259, + "grad_norm": 0.6678702235221863, + "learning_rate": 0.000199998045677631, + "loss": 0.1752, + "step": 495 + }, + { + "epoch": 0.10416885435261997, + "grad_norm": 0.5477135181427002, + "learning_rate": 0.00019999782249965193, + "loss": 0.1176, + "step": 496 + }, + { + "epoch": 0.10437887220413736, + "grad_norm": 0.47613173723220825, + "learning_rate": 0.000199997587258178, + "loss": 0.1734, + "step": 497 + }, + { + "epoch": 0.10458889005565473, + "grad_norm": 0.8437466025352478, + "learning_rate": 0.0001999973399532377, + "loss": 0.2279, + "step": 498 + }, + { + "epoch": 0.10479890790717211, + "grad_norm": 0.7599924206733704, + "learning_rate": 0.00019999708058486074, + "loss": 0.209, + "step": 499 + }, + { + "epoch": 0.10500892575868949, + "grad_norm": 0.5578658580780029, + "learning_rate": 0.00019999680915307847, + "loss": 0.243, + "step": 500 + }, + { + "epoch": 0.10521894361020687, + "grad_norm": 0.5664511322975159, + "learning_rate": 0.00019999652565792368, + "loss": 0.2176, + "step": 501 + }, + { + "epoch": 0.10542896146172424, + "grad_norm": 0.5591540336608887, + "learning_rate": 0.0001999962300994305, + "loss": 0.3467, + "step": 502 + }, + { + "epoch": 0.10563897931324162, + "grad_norm": 0.5022396445274353, + "learning_rate": 0.0001999959224776346, + "loss": 0.2149, + "step": 503 + }, + { + "epoch": 0.105848997164759, + "grad_norm": 0.5846520662307739, + "learning_rate": 0.00019999560279257314, + "loss": 0.3388, + "step": 504 + }, + { + "epoch": 0.10605901501627639, + "grad_norm": 0.4137157201766968, + "learning_rate": 0.00019999527104428463, + "loss": 0.2223, + "step": 505 + }, + { + "epoch": 0.10626903286779377, + "grad_norm": 0.49332931637763977, + "learning_rate": 0.0001999949272328091, + "loss": 0.2679, + "step": 506 + }, + { + "epoch": 0.10647905071931114, + "grad_norm": 0.7095859050750732, + "learning_rate": 0.00019999457135818805, + "loss": 0.2681, + "step": 507 + }, + { + "epoch": 0.10668906857082852, + "grad_norm": 0.5563727021217346, + "learning_rate": 0.0001999942034204644, + "loss": 0.2695, + "step": 508 + }, + { + "epoch": 0.1068990864223459, + "grad_norm": 0.5464118719100952, + "learning_rate": 0.00019999382341968252, + "loss": 0.4308, + "step": 509 + }, + { + "epoch": 0.10710910427386328, + "grad_norm": 0.7822732329368591, + "learning_rate": 0.00019999343135588827, + "loss": 0.2458, + "step": 510 + }, + { + "epoch": 0.10731912212538065, + "grad_norm": 0.6268991231918335, + "learning_rate": 0.00019999302722912895, + "loss": 0.2877, + "step": 511 + }, + { + "epoch": 0.10752913997689803, + "grad_norm": 0.7860679626464844, + "learning_rate": 0.0001999926110394533, + "loss": 0.275, + "step": 512 + }, + { + "epoch": 0.10773915782841542, + "grad_norm": 0.5817549228668213, + "learning_rate": 0.00019999218278691158, + "loss": 0.2005, + "step": 513 + }, + { + "epoch": 0.1079491756799328, + "grad_norm": 0.8145076036453247, + "learning_rate": 0.00019999174247155535, + "loss": 0.2032, + "step": 514 + }, + { + "epoch": 0.10815919353145018, + "grad_norm": 0.7561895847320557, + "learning_rate": 0.0001999912900934378, + "loss": 0.2452, + "step": 515 + }, + { + "epoch": 0.10836921138296755, + "grad_norm": 0.8003164529800415, + "learning_rate": 0.0001999908256526135, + "loss": 0.2318, + "step": 516 + }, + { + "epoch": 0.10857922923448493, + "grad_norm": 0.6318978667259216, + "learning_rate": 0.0001999903491491385, + "loss": 0.2534, + "step": 517 + }, + { + "epoch": 0.10878924708600231, + "grad_norm": 0.5220886468887329, + "learning_rate": 0.00019998986058307022, + "loss": 0.2011, + "step": 518 + }, + { + "epoch": 0.10899926493751969, + "grad_norm": 0.5928252935409546, + "learning_rate": 0.00019998935995446763, + "loss": 0.2175, + "step": 519 + }, + { + "epoch": 0.10920928278903706, + "grad_norm": 0.7763411998748779, + "learning_rate": 0.00019998884726339116, + "loss": 0.2852, + "step": 520 + }, + { + "epoch": 0.10941930064055445, + "grad_norm": 0.7260156273841858, + "learning_rate": 0.00019998832250990264, + "loss": 0.2313, + "step": 521 + }, + { + "epoch": 0.10962931849207183, + "grad_norm": 0.7486017942428589, + "learning_rate": 0.0001999877856940653, + "loss": 0.2789, + "step": 522 + }, + { + "epoch": 0.10983933634358921, + "grad_norm": 0.6006895303726196, + "learning_rate": 0.00019998723681594402, + "loss": 0.1674, + "step": 523 + }, + { + "epoch": 0.11004935419510659, + "grad_norm": 0.7286220192909241, + "learning_rate": 0.00019998667587560495, + "loss": 0.2521, + "step": 524 + }, + { + "epoch": 0.11025937204662396, + "grad_norm": 0.7167160511016846, + "learning_rate": 0.00019998610287311574, + "loss": 0.2308, + "step": 525 + }, + { + "epoch": 0.11046938989814134, + "grad_norm": 0.6270986795425415, + "learning_rate": 0.00019998551780854557, + "loss": 0.2048, + "step": 526 + }, + { + "epoch": 0.11067940774965872, + "grad_norm": 0.7366195917129517, + "learning_rate": 0.000199984920681965, + "loss": 0.2239, + "step": 527 + }, + { + "epoch": 0.1108894256011761, + "grad_norm": 0.5709097981452942, + "learning_rate": 0.00019998431149344606, + "loss": 0.2074, + "step": 528 + }, + { + "epoch": 0.11109944345269347, + "grad_norm": 0.6718131899833679, + "learning_rate": 0.00019998369024306224, + "loss": 0.2902, + "step": 529 + }, + { + "epoch": 0.11130946130421086, + "grad_norm": 0.6741777658462524, + "learning_rate": 0.00019998305693088848, + "loss": 0.2638, + "step": 530 + }, + { + "epoch": 0.11151947915572824, + "grad_norm": 0.5218673944473267, + "learning_rate": 0.0001999824115570012, + "loss": 0.201, + "step": 531 + }, + { + "epoch": 0.11172949700724562, + "grad_norm": 0.6867272257804871, + "learning_rate": 0.00019998175412147824, + "loss": 0.2864, + "step": 532 + }, + { + "epoch": 0.111939514858763, + "grad_norm": 0.6319578289985657, + "learning_rate": 0.00019998108462439894, + "loss": 0.2277, + "step": 533 + }, + { + "epoch": 0.11214953271028037, + "grad_norm": 0.5601973533630371, + "learning_rate": 0.000199980403065844, + "loss": 0.2678, + "step": 534 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 0.5189068913459778, + "learning_rate": 0.00019997970944589572, + "loss": 0.2036, + "step": 535 + }, + { + "epoch": 0.11256956841331513, + "grad_norm": 0.7217200994491577, + "learning_rate": 0.00019997900376463778, + "loss": 0.2299, + "step": 536 + }, + { + "epoch": 0.1127795862648325, + "grad_norm": 0.6617181301116943, + "learning_rate": 0.0001999782860221552, + "loss": 0.2259, + "step": 537 + }, + { + "epoch": 0.1129896041163499, + "grad_norm": 0.6987117528915405, + "learning_rate": 0.0001999775562185347, + "loss": 0.1882, + "step": 538 + }, + { + "epoch": 0.11319962196786727, + "grad_norm": 0.4491863548755646, + "learning_rate": 0.00019997681435386422, + "loss": 0.1937, + "step": 539 + }, + { + "epoch": 0.11340963981938465, + "grad_norm": 0.5842171311378479, + "learning_rate": 0.00019997606042823334, + "loss": 0.2808, + "step": 540 + }, + { + "epoch": 0.11361965767090203, + "grad_norm": 0.7743870615959167, + "learning_rate": 0.00019997529444173293, + "loss": 0.2329, + "step": 541 + }, + { + "epoch": 0.1138296755224194, + "grad_norm": 0.5326593518257141, + "learning_rate": 0.00019997451639445547, + "loss": 0.188, + "step": 542 + }, + { + "epoch": 0.11403969337393678, + "grad_norm": 0.5364864468574524, + "learning_rate": 0.00019997372628649478, + "loss": 0.2294, + "step": 543 + }, + { + "epoch": 0.11424971122545416, + "grad_norm": 0.5609897375106812, + "learning_rate": 0.00019997292411794618, + "loss": 0.2108, + "step": 544 + }, + { + "epoch": 0.11445972907697154, + "grad_norm": 0.5446069836616516, + "learning_rate": 0.00019997210988890646, + "loss": 0.2577, + "step": 545 + }, + { + "epoch": 0.11466974692848893, + "grad_norm": 0.6916573643684387, + "learning_rate": 0.0001999712835994738, + "loss": 0.1976, + "step": 546 + }, + { + "epoch": 0.1148797647800063, + "grad_norm": 0.7029738426208496, + "learning_rate": 0.00019997044524974799, + "loss": 0.2076, + "step": 547 + }, + { + "epoch": 0.11508978263152368, + "grad_norm": 0.8003794550895691, + "learning_rate": 0.00019996959483983004, + "loss": 0.3284, + "step": 548 + }, + { + "epoch": 0.11529980048304106, + "grad_norm": 0.6394858360290527, + "learning_rate": 0.00019996873236982258, + "loss": 0.2397, + "step": 549 + }, + { + "epoch": 0.11550981833455844, + "grad_norm": 0.7164601683616638, + "learning_rate": 0.00019996785783982972, + "loss": 0.2097, + "step": 550 + }, + { + "epoch": 0.11571983618607581, + "grad_norm": 0.5346024036407471, + "learning_rate": 0.0001999669712499569, + "loss": 0.2559, + "step": 551 + }, + { + "epoch": 0.11592985403759319, + "grad_norm": 0.77498859167099, + "learning_rate": 0.00019996607260031106, + "loss": 0.3734, + "step": 552 + }, + { + "epoch": 0.11613987188911057, + "grad_norm": 0.6465743780136108, + "learning_rate": 0.00019996516189100066, + "loss": 0.2642, + "step": 553 + }, + { + "epoch": 0.11634988974062796, + "grad_norm": 0.4624630808830261, + "learning_rate": 0.00019996423912213554, + "loss": 0.1649, + "step": 554 + }, + { + "epoch": 0.11655990759214534, + "grad_norm": 0.6889922618865967, + "learning_rate": 0.00019996330429382703, + "loss": 0.2902, + "step": 555 + }, + { + "epoch": 0.11676992544366271, + "grad_norm": 0.7413411140441895, + "learning_rate": 0.0001999623574061879, + "loss": 0.2428, + "step": 556 + }, + { + "epoch": 0.11697994329518009, + "grad_norm": 0.9009401798248291, + "learning_rate": 0.0001999613984593324, + "loss": 0.3112, + "step": 557 + }, + { + "epoch": 0.11718996114669747, + "grad_norm": 0.6533844470977783, + "learning_rate": 0.00019996042745337617, + "loss": 0.2118, + "step": 558 + }, + { + "epoch": 0.11739997899821485, + "grad_norm": 0.6814008355140686, + "learning_rate": 0.00019995944438843636, + "loss": 0.2755, + "step": 559 + }, + { + "epoch": 0.11760999684973222, + "grad_norm": 0.6098254323005676, + "learning_rate": 0.0001999584492646316, + "loss": 0.2385, + "step": 560 + }, + { + "epoch": 0.1178200147012496, + "grad_norm": 0.7361414432525635, + "learning_rate": 0.00019995744208208194, + "loss": 0.326, + "step": 561 + }, + { + "epoch": 0.11803003255276699, + "grad_norm": 0.6809893250465393, + "learning_rate": 0.00019995642284090885, + "loss": 0.2727, + "step": 562 + }, + { + "epoch": 0.11824005040428437, + "grad_norm": 0.6401339769363403, + "learning_rate": 0.00019995539154123529, + "loss": 0.2143, + "step": 563 + }, + { + "epoch": 0.11845006825580175, + "grad_norm": 0.715313732624054, + "learning_rate": 0.00019995434818318567, + "loss": 0.2269, + "step": 564 + }, + { + "epoch": 0.11866008610731912, + "grad_norm": 0.6071058511734009, + "learning_rate": 0.00019995329276688593, + "loss": 0.4811, + "step": 565 + }, + { + "epoch": 0.1188701039588365, + "grad_norm": 0.882318377494812, + "learning_rate": 0.0001999522252924633, + "loss": 0.2607, + "step": 566 + }, + { + "epoch": 0.11908012181035388, + "grad_norm": 0.4758372902870178, + "learning_rate": 0.0001999511457600466, + "loss": 0.2013, + "step": 567 + }, + { + "epoch": 0.11929013966187126, + "grad_norm": 0.6482694149017334, + "learning_rate": 0.00019995005416976604, + "loss": 0.2567, + "step": 568 + }, + { + "epoch": 0.11950015751338863, + "grad_norm": 0.6689179539680481, + "learning_rate": 0.00019994895052175338, + "loss": 0.2498, + "step": 569 + }, + { + "epoch": 0.11971017536490601, + "grad_norm": 0.5817541480064392, + "learning_rate": 0.00019994783481614166, + "loss": 0.2013, + "step": 570 + }, + { + "epoch": 0.1199201932164234, + "grad_norm": 0.4717533588409424, + "learning_rate": 0.00019994670705306554, + "loss": 0.2647, + "step": 571 + }, + { + "epoch": 0.12013021106794078, + "grad_norm": 0.574079692363739, + "learning_rate": 0.00019994556723266103, + "loss": 0.1704, + "step": 572 + }, + { + "epoch": 0.12034022891945816, + "grad_norm": 0.759425938129425, + "learning_rate": 0.00019994441535506569, + "loss": 0.208, + "step": 573 + }, + { + "epoch": 0.12055024677097553, + "grad_norm": 0.5335227847099304, + "learning_rate": 0.0001999432514204184, + "loss": 0.2018, + "step": 574 + }, + { + "epoch": 0.12076026462249291, + "grad_norm": 0.5595372915267944, + "learning_rate": 0.00019994207542885963, + "loss": 0.2667, + "step": 575 + }, + { + "epoch": 0.12097028247401029, + "grad_norm": 0.6673279404640198, + "learning_rate": 0.00019994088738053124, + "loss": 0.3175, + "step": 576 + }, + { + "epoch": 0.12118030032552767, + "grad_norm": 0.5622591376304626, + "learning_rate": 0.0001999396872755766, + "loss": 0.1543, + "step": 577 + }, + { + "epoch": 0.12139031817704504, + "grad_norm": 0.5784288048744202, + "learning_rate": 0.0001999384751141404, + "loss": 0.2624, + "step": 578 + }, + { + "epoch": 0.12160033602856243, + "grad_norm": 0.6946631669998169, + "learning_rate": 0.00019993725089636891, + "loss": 0.2469, + "step": 579 + }, + { + "epoch": 0.12181035388007981, + "grad_norm": 0.6951069235801697, + "learning_rate": 0.00019993601462240984, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 0.12202037173159719, + "grad_norm": 0.5134934186935425, + "learning_rate": 0.0001999347662924123, + "loss": 0.1432, + "step": 581 + }, + { + "epoch": 0.12223038958311457, + "grad_norm": 0.5719296932220459, + "learning_rate": 0.00019993350590652691, + "loss": 0.2388, + "step": 582 + }, + { + "epoch": 0.12244040743463194, + "grad_norm": 0.7625638246536255, + "learning_rate": 0.0001999322334649057, + "loss": 0.2507, + "step": 583 + }, + { + "epoch": 0.12265042528614932, + "grad_norm": 0.6974209547042847, + "learning_rate": 0.00019993094896770218, + "loss": 0.2431, + "step": 584 + }, + { + "epoch": 0.1228604431376667, + "grad_norm": 0.7072513699531555, + "learning_rate": 0.0001999296524150713, + "loss": 0.1745, + "step": 585 + }, + { + "epoch": 0.12307046098918407, + "grad_norm": 0.7435344457626343, + "learning_rate": 0.00019992834380716946, + "loss": 0.216, + "step": 586 + }, + { + "epoch": 0.12328047884070147, + "grad_norm": 0.5491403937339783, + "learning_rate": 0.00019992702314415461, + "loss": 0.1853, + "step": 587 + }, + { + "epoch": 0.12349049669221884, + "grad_norm": 0.5487938523292542, + "learning_rate": 0.00019992569042618597, + "loss": 0.2361, + "step": 588 + }, + { + "epoch": 0.12370051454373622, + "grad_norm": 0.4346216320991516, + "learning_rate": 0.00019992434565342437, + "loss": 0.1812, + "step": 589 + }, + { + "epoch": 0.1239105323952536, + "grad_norm": 0.5448020696640015, + "learning_rate": 0.00019992298882603202, + "loss": 0.2017, + "step": 590 + }, + { + "epoch": 0.12412055024677097, + "grad_norm": 0.6867210268974304, + "learning_rate": 0.0001999216199441726, + "loss": 0.1788, + "step": 591 + }, + { + "epoch": 0.12433056809828835, + "grad_norm": 0.6821328401565552, + "learning_rate": 0.00019992023900801127, + "loss": 0.2159, + "step": 592 + }, + { + "epoch": 0.12454058594980573, + "grad_norm": 0.6648369431495667, + "learning_rate": 0.0001999188460177146, + "loss": 0.223, + "step": 593 + }, + { + "epoch": 0.1247506038013231, + "grad_norm": 0.6275060772895813, + "learning_rate": 0.00019991744097345068, + "loss": 0.2174, + "step": 594 + }, + { + "epoch": 0.1249606216528405, + "grad_norm": 0.43622860312461853, + "learning_rate": 0.00019991602387538896, + "loss": 0.1709, + "step": 595 + }, + { + "epoch": 0.12517063950435786, + "grad_norm": 0.41494739055633545, + "learning_rate": 0.00019991459472370042, + "loss": 0.1615, + "step": 596 + }, + { + "epoch": 0.12538065735587525, + "grad_norm": 0.4159907400608063, + "learning_rate": 0.00019991315351855748, + "loss": 0.1457, + "step": 597 + }, + { + "epoch": 0.12559067520739262, + "grad_norm": 0.8123224377632141, + "learning_rate": 0.00019991170026013397, + "loss": 0.202, + "step": 598 + }, + { + "epoch": 0.12580069305891, + "grad_norm": 0.9315401911735535, + "learning_rate": 0.00019991023494860522, + "loss": 0.2496, + "step": 599 + }, + { + "epoch": 0.1260107109104274, + "grad_norm": 0.7999815344810486, + "learning_rate": 0.00019990875758414803, + "loss": 0.2782, + "step": 600 + }, + { + "epoch": 0.12622072876194476, + "grad_norm": 0.5633922815322876, + "learning_rate": 0.0001999072681669406, + "loss": 0.2276, + "step": 601 + }, + { + "epoch": 0.12643074661346215, + "grad_norm": 0.6719483733177185, + "learning_rate": 0.00019990576669716258, + "loss": 0.3169, + "step": 602 + }, + { + "epoch": 0.12664076446497952, + "grad_norm": 0.7311053276062012, + "learning_rate": 0.0001999042531749952, + "loss": 0.2723, + "step": 603 + }, + { + "epoch": 0.1268507823164969, + "grad_norm": 0.5853881239891052, + "learning_rate": 0.00019990272760062093, + "loss": 0.2869, + "step": 604 + }, + { + "epoch": 0.12706080016801427, + "grad_norm": 0.7300302982330322, + "learning_rate": 0.0001999011899742239, + "loss": 0.2405, + "step": 605 + }, + { + "epoch": 0.12727081801953166, + "grad_norm": 0.704954206943512, + "learning_rate": 0.00019989964029598953, + "loss": 0.3195, + "step": 606 + }, + { + "epoch": 0.12748083587104905, + "grad_norm": 0.6305354833602905, + "learning_rate": 0.00019989807856610482, + "loss": 0.2442, + "step": 607 + }, + { + "epoch": 0.12769085372256642, + "grad_norm": 0.5027151107788086, + "learning_rate": 0.0001998965047847582, + "loss": 0.3006, + "step": 608 + }, + { + "epoch": 0.1279008715740838, + "grad_norm": 0.6237658262252808, + "learning_rate": 0.00019989491895213948, + "loss": 0.2019, + "step": 609 + }, + { + "epoch": 0.12811088942560117, + "grad_norm": 0.6959155797958374, + "learning_rate": 0.00019989332106844, + "loss": 0.2142, + "step": 610 + }, + { + "epoch": 0.12832090727711856, + "grad_norm": 0.7905144095420837, + "learning_rate": 0.0001998917111338525, + "loss": 0.2599, + "step": 611 + }, + { + "epoch": 0.12853092512863593, + "grad_norm": 0.7247504591941833, + "learning_rate": 0.00019989008914857116, + "loss": 0.2679, + "step": 612 + }, + { + "epoch": 0.12874094298015332, + "grad_norm": 0.5282559990882874, + "learning_rate": 0.0001998884551127917, + "loss": 0.2125, + "step": 613 + }, + { + "epoch": 0.12895096083167068, + "grad_norm": 0.6418580412864685, + "learning_rate": 0.0001998868090267113, + "loss": 0.2185, + "step": 614 + }, + { + "epoch": 0.12916097868318807, + "grad_norm": 0.48245900869369507, + "learning_rate": 0.00019988515089052844, + "loss": 0.2175, + "step": 615 + }, + { + "epoch": 0.12937099653470546, + "grad_norm": 0.4887724220752716, + "learning_rate": 0.00019988348070444322, + "loss": 0.1777, + "step": 616 + }, + { + "epoch": 0.12958101438622283, + "grad_norm": 0.5296192169189453, + "learning_rate": 0.0001998817984686571, + "loss": 0.2344, + "step": 617 + }, + { + "epoch": 0.12979103223774022, + "grad_norm": 0.6658729314804077, + "learning_rate": 0.00019988010418337305, + "loss": 0.2322, + "step": 618 + }, + { + "epoch": 0.13000105008925758, + "grad_norm": 0.5744292736053467, + "learning_rate": 0.0001998783978487954, + "loss": 0.2156, + "step": 619 + }, + { + "epoch": 0.13021106794077497, + "grad_norm": 0.5000370144844055, + "learning_rate": 0.00019987667946513006, + "loss": 0.2319, + "step": 620 + }, + { + "epoch": 0.13042108579229234, + "grad_norm": 0.8539411425590515, + "learning_rate": 0.00019987494903258432, + "loss": 0.3729, + "step": 621 + }, + { + "epoch": 0.13063110364380973, + "grad_norm": 0.6094825267791748, + "learning_rate": 0.00019987320655136693, + "loss": 0.2171, + "step": 622 + }, + { + "epoch": 0.1308411214953271, + "grad_norm": 0.6408823728561401, + "learning_rate": 0.00019987145202168805, + "loss": 0.2658, + "step": 623 + }, + { + "epoch": 0.13105113934684448, + "grad_norm": 0.5738769769668579, + "learning_rate": 0.0001998696854437594, + "loss": 0.2127, + "step": 624 + }, + { + "epoch": 0.13126115719836187, + "grad_norm": 0.6330286860466003, + "learning_rate": 0.00019986790681779412, + "loss": 0.1503, + "step": 625 + }, + { + "epoch": 0.13147117504987924, + "grad_norm": 0.8125373125076294, + "learning_rate": 0.0001998661161440067, + "loss": 0.2741, + "step": 626 + }, + { + "epoch": 0.13168119290139663, + "grad_norm": 0.710121750831604, + "learning_rate": 0.00019986431342261323, + "loss": 0.2672, + "step": 627 + }, + { + "epoch": 0.131891210752914, + "grad_norm": 0.8024762868881226, + "learning_rate": 0.00019986249865383115, + "loss": 0.2818, + "step": 628 + }, + { + "epoch": 0.13210122860443138, + "grad_norm": 1.0455816984176636, + "learning_rate": 0.0001998606718378794, + "loss": 0.3204, + "step": 629 + }, + { + "epoch": 0.13231124645594874, + "grad_norm": 0.7923910617828369, + "learning_rate": 0.00019985883297497835, + "loss": 0.213, + "step": 630 + }, + { + "epoch": 0.13252126430746614, + "grad_norm": 0.7458345890045166, + "learning_rate": 0.00019985698206534985, + "loss": 0.2066, + "step": 631 + }, + { + "epoch": 0.13273128215898353, + "grad_norm": 0.6166645884513855, + "learning_rate": 0.0001998551191092172, + "loss": 0.2239, + "step": 632 + }, + { + "epoch": 0.1329413000105009, + "grad_norm": 0.7050312161445618, + "learning_rate": 0.00019985324410680514, + "loss": 0.2692, + "step": 633 + }, + { + "epoch": 0.13315131786201828, + "grad_norm": 0.6465736627578735, + "learning_rate": 0.00019985135705833984, + "loss": 0.235, + "step": 634 + }, + { + "epoch": 0.13336133571353564, + "grad_norm": 0.6108490228652954, + "learning_rate": 0.00019984945796404894, + "loss": 0.2472, + "step": 635 + }, + { + "epoch": 0.13357135356505304, + "grad_norm": 0.725173830986023, + "learning_rate": 0.00019984754682416157, + "loss": 0.2521, + "step": 636 + }, + { + "epoch": 0.1337813714165704, + "grad_norm": 0.5391446352005005, + "learning_rate": 0.00019984562363890832, + "loss": 0.2151, + "step": 637 + }, + { + "epoch": 0.1339913892680878, + "grad_norm": 0.44177114963531494, + "learning_rate": 0.00019984368840852114, + "loss": 0.179, + "step": 638 + }, + { + "epoch": 0.13420140711960515, + "grad_norm": 0.48038744926452637, + "learning_rate": 0.00019984174113323353, + "loss": 0.2296, + "step": 639 + }, + { + "epoch": 0.13441142497112255, + "grad_norm": 0.5720350742340088, + "learning_rate": 0.00019983978181328037, + "loss": 0.1843, + "step": 640 + }, + { + "epoch": 0.13462144282263994, + "grad_norm": 0.4996393322944641, + "learning_rate": 0.00019983781044889803, + "loss": 0.225, + "step": 641 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 0.5970807671546936, + "learning_rate": 0.00019983582704032434, + "loss": 0.179, + "step": 642 + }, + { + "epoch": 0.1350414785256747, + "grad_norm": 0.858808159828186, + "learning_rate": 0.0001998338315877986, + "loss": 0.246, + "step": 643 + }, + { + "epoch": 0.13525149637719205, + "grad_norm": 0.6708926558494568, + "learning_rate": 0.0001998318240915615, + "loss": 0.197, + "step": 644 + }, + { + "epoch": 0.13546151422870945, + "grad_norm": 0.8443548083305359, + "learning_rate": 0.00019982980455185526, + "loss": 0.1889, + "step": 645 + }, + { + "epoch": 0.1356715320802268, + "grad_norm": 0.6451512575149536, + "learning_rate": 0.00019982777296892346, + "loss": 0.2103, + "step": 646 + }, + { + "epoch": 0.1358815499317442, + "grad_norm": 0.7601468563079834, + "learning_rate": 0.00019982572934301122, + "loss": 0.2338, + "step": 647 + }, + { + "epoch": 0.1360915677832616, + "grad_norm": 0.5944762229919434, + "learning_rate": 0.00019982367367436506, + "loss": 0.1814, + "step": 648 + }, + { + "epoch": 0.13630158563477895, + "grad_norm": 0.7542382478713989, + "learning_rate": 0.00019982160596323297, + "loss": 0.2062, + "step": 649 + }, + { + "epoch": 0.13651160348629635, + "grad_norm": 0.560296893119812, + "learning_rate": 0.00019981952620986442, + "loss": 0.199, + "step": 650 + }, + { + "epoch": 0.1367216213378137, + "grad_norm": 0.5254395604133606, + "learning_rate": 0.0001998174344145103, + "loss": 0.2943, + "step": 651 + }, + { + "epoch": 0.1369316391893311, + "grad_norm": 0.6042603254318237, + "learning_rate": 0.00019981533057742294, + "loss": 0.2355, + "step": 652 + }, + { + "epoch": 0.13714165704084846, + "grad_norm": 0.6384417414665222, + "learning_rate": 0.00019981321469885615, + "loss": 0.202, + "step": 653 + }, + { + "epoch": 0.13735167489236585, + "grad_norm": 0.7300348877906799, + "learning_rate": 0.0001998110867790652, + "loss": 0.2422, + "step": 654 + }, + { + "epoch": 0.13756169274388322, + "grad_norm": 0.5238686800003052, + "learning_rate": 0.00019980894681830678, + "loss": 0.2491, + "step": 655 + }, + { + "epoch": 0.1377717105954006, + "grad_norm": 0.7352842092514038, + "learning_rate": 0.00019980679481683904, + "loss": 0.3193, + "step": 656 + }, + { + "epoch": 0.137981728446918, + "grad_norm": 0.6651904582977295, + "learning_rate": 0.0001998046307749216, + "loss": 0.2618, + "step": 657 + }, + { + "epoch": 0.13819174629843536, + "grad_norm": 0.6976970434188843, + "learning_rate": 0.00019980245469281553, + "loss": 0.2622, + "step": 658 + }, + { + "epoch": 0.13840176414995276, + "grad_norm": 0.6078370809555054, + "learning_rate": 0.00019980026657078336, + "loss": 0.2532, + "step": 659 + }, + { + "epoch": 0.13861178200147012, + "grad_norm": 0.7155233025550842, + "learning_rate": 0.00019979806640908906, + "loss": 0.3283, + "step": 660 + }, + { + "epoch": 0.1388217998529875, + "grad_norm": 0.519636869430542, + "learning_rate": 0.00019979585420799802, + "loss": 0.187, + "step": 661 + }, + { + "epoch": 0.13903181770450487, + "grad_norm": 0.7007790803909302, + "learning_rate": 0.00019979362996777714, + "loss": 0.2554, + "step": 662 + }, + { + "epoch": 0.13924183555602226, + "grad_norm": 0.6281614303588867, + "learning_rate": 0.00019979139368869473, + "loss": 0.2153, + "step": 663 + }, + { + "epoch": 0.13945185340753963, + "grad_norm": 0.5729889869689941, + "learning_rate": 0.00019978914537102055, + "loss": 0.2432, + "step": 664 + }, + { + "epoch": 0.13966187125905702, + "grad_norm": 0.4995453357696533, + "learning_rate": 0.00019978688501502592, + "loss": 0.1931, + "step": 665 + }, + { + "epoch": 0.1398718891105744, + "grad_norm": 0.48151615262031555, + "learning_rate": 0.00019978461262098343, + "loss": 0.1664, + "step": 666 + }, + { + "epoch": 0.14008190696209177, + "grad_norm": 0.6951011419296265, + "learning_rate": 0.00019978232818916727, + "loss": 0.2229, + "step": 667 + }, + { + "epoch": 0.14029192481360916, + "grad_norm": 0.5914542078971863, + "learning_rate": 0.000199780031719853, + "loss": 0.2126, + "step": 668 + }, + { + "epoch": 0.14050194266512653, + "grad_norm": 0.7551674246788025, + "learning_rate": 0.00019977772321331765, + "loss": 0.2806, + "step": 669 + }, + { + "epoch": 0.14071196051664392, + "grad_norm": 0.7960730195045471, + "learning_rate": 0.00019977540266983976, + "loss": 0.2653, + "step": 670 + }, + { + "epoch": 0.14092197836816128, + "grad_norm": 0.5545317530632019, + "learning_rate": 0.00019977307008969922, + "loss": 0.2141, + "step": 671 + }, + { + "epoch": 0.14113199621967867, + "grad_norm": 0.7467978596687317, + "learning_rate": 0.0001997707254731775, + "loss": 0.1961, + "step": 672 + }, + { + "epoch": 0.14134201407119606, + "grad_norm": 0.6775459051132202, + "learning_rate": 0.00019976836882055736, + "loss": 0.2304, + "step": 673 + }, + { + "epoch": 0.14155203192271343, + "grad_norm": 0.793547511100769, + "learning_rate": 0.00019976600013212317, + "loss": 0.2266, + "step": 674 + }, + { + "epoch": 0.14176204977423082, + "grad_norm": 0.6920728087425232, + "learning_rate": 0.00019976361940816063, + "loss": 0.3469, + "step": 675 + }, + { + "epoch": 0.14197206762574818, + "grad_norm": 0.840145468711853, + "learning_rate": 0.000199761226648957, + "loss": 0.235, + "step": 676 + }, + { + "epoch": 0.14218208547726557, + "grad_norm": 0.8047716617584229, + "learning_rate": 0.0001997588218548009, + "loss": 0.3034, + "step": 677 + }, + { + "epoch": 0.14239210332878294, + "grad_norm": 0.649042010307312, + "learning_rate": 0.00019975640502598244, + "loss": 0.2822, + "step": 678 + }, + { + "epoch": 0.14260212118030033, + "grad_norm": 0.6780881881713867, + "learning_rate": 0.0001997539761627932, + "loss": 0.1593, + "step": 679 + }, + { + "epoch": 0.1428121390318177, + "grad_norm": 0.6812571883201599, + "learning_rate": 0.00019975153526552615, + "loss": 0.1898, + "step": 680 + }, + { + "epoch": 0.14302215688333508, + "grad_norm": 0.5687631368637085, + "learning_rate": 0.0001997490823344758, + "loss": 0.2193, + "step": 681 + }, + { + "epoch": 0.14323217473485247, + "grad_norm": 0.8981772065162659, + "learning_rate": 0.00019974661736993804, + "loss": 0.2785, + "step": 682 + }, + { + "epoch": 0.14344219258636984, + "grad_norm": 0.6966889500617981, + "learning_rate": 0.00019974414037221027, + "loss": 0.2678, + "step": 683 + }, + { + "epoch": 0.14365221043788723, + "grad_norm": 0.5631129145622253, + "learning_rate": 0.00019974165134159126, + "loss": 0.2836, + "step": 684 + }, + { + "epoch": 0.1438622282894046, + "grad_norm": 0.7686763405799866, + "learning_rate": 0.00019973915027838134, + "loss": 0.2372, + "step": 685 + }, + { + "epoch": 0.14407224614092198, + "grad_norm": 0.881515383720398, + "learning_rate": 0.00019973663718288217, + "loss": 0.2901, + "step": 686 + }, + { + "epoch": 0.14428226399243935, + "grad_norm": 0.6973896026611328, + "learning_rate": 0.00019973411205539694, + "loss": 0.2577, + "step": 687 + }, + { + "epoch": 0.14449228184395674, + "grad_norm": 0.3398377299308777, + "learning_rate": 0.0001997315748962303, + "loss": 0.1305, + "step": 688 + }, + { + "epoch": 0.1447022996954741, + "grad_norm": 0.6775567531585693, + "learning_rate": 0.0001997290257056883, + "loss": 0.3746, + "step": 689 + }, + { + "epoch": 0.1449123175469915, + "grad_norm": 0.3776891827583313, + "learning_rate": 0.0001997264644840785, + "loss": 0.1731, + "step": 690 + }, + { + "epoch": 0.14512233539850888, + "grad_norm": 0.6515337824821472, + "learning_rate": 0.00019972389123170986, + "loss": 0.2596, + "step": 691 + }, + { + "epoch": 0.14533235325002625, + "grad_norm": 0.7165318131446838, + "learning_rate": 0.00019972130594889286, + "loss": 0.2673, + "step": 692 + }, + { + "epoch": 0.14554237110154364, + "grad_norm": 0.5702444314956665, + "learning_rate": 0.00019971870863593925, + "loss": 0.1928, + "step": 693 + }, + { + "epoch": 0.145752388953061, + "grad_norm": 0.3542981743812561, + "learning_rate": 0.0001997160992931625, + "loss": 0.1277, + "step": 694 + }, + { + "epoch": 0.1459624068045784, + "grad_norm": 0.6520780920982361, + "learning_rate": 0.00019971347792087732, + "loss": 0.2623, + "step": 695 + }, + { + "epoch": 0.14617242465609576, + "grad_norm": 0.4505969285964966, + "learning_rate": 0.00019971084451939997, + "loss": 0.2026, + "step": 696 + }, + { + "epoch": 0.14638244250761315, + "grad_norm": 0.623036801815033, + "learning_rate": 0.00019970819908904814, + "loss": 0.2371, + "step": 697 + }, + { + "epoch": 0.14659246035913054, + "grad_norm": 0.60871422290802, + "learning_rate": 0.00019970554163014097, + "loss": 0.3128, + "step": 698 + }, + { + "epoch": 0.1468024782106479, + "grad_norm": 0.6321155428886414, + "learning_rate": 0.00019970287214299902, + "loss": 0.2183, + "step": 699 + }, + { + "epoch": 0.1470124960621653, + "grad_norm": 0.7513316869735718, + "learning_rate": 0.0001997001906279444, + "loss": 0.2753, + "step": 700 + }, + { + "epoch": 0.14722251391368266, + "grad_norm": 0.4192676842212677, + "learning_rate": 0.0001996974970853005, + "loss": 0.3071, + "step": 701 + }, + { + "epoch": 0.14743253176520005, + "grad_norm": 0.5773706436157227, + "learning_rate": 0.00019969479151539236, + "loss": 0.2883, + "step": 702 + }, + { + "epoch": 0.1476425496167174, + "grad_norm": 0.4587963819503784, + "learning_rate": 0.00019969207391854632, + "loss": 0.2997, + "step": 703 + }, + { + "epoch": 0.1478525674682348, + "grad_norm": 0.6077782511711121, + "learning_rate": 0.00019968934429509023, + "loss": 0.182, + "step": 704 + }, + { + "epoch": 0.14806258531975217, + "grad_norm": 0.6285839676856995, + "learning_rate": 0.0001996866026453534, + "loss": 0.3573, + "step": 705 + }, + { + "epoch": 0.14827260317126956, + "grad_norm": 0.7416669726371765, + "learning_rate": 0.00019968384896966657, + "loss": 0.2424, + "step": 706 + }, + { + "epoch": 0.14848262102278695, + "grad_norm": 0.5475688576698303, + "learning_rate": 0.0001996810832683619, + "loss": 0.1766, + "step": 707 + }, + { + "epoch": 0.1486926388743043, + "grad_norm": 0.5601086020469666, + "learning_rate": 0.00019967830554177312, + "loss": 0.2725, + "step": 708 + }, + { + "epoch": 0.1489026567258217, + "grad_norm": 0.7686034440994263, + "learning_rate": 0.00019967551579023524, + "loss": 0.3008, + "step": 709 + }, + { + "epoch": 0.14911267457733907, + "grad_norm": 0.8172418475151062, + "learning_rate": 0.00019967271401408486, + "loss": 0.3042, + "step": 710 + }, + { + "epoch": 0.14932269242885646, + "grad_norm": 0.8726032972335815, + "learning_rate": 0.00019966990021366, + "loss": 0.224, + "step": 711 + }, + { + "epoch": 0.14953271028037382, + "grad_norm": 0.6053635478019714, + "learning_rate": 0.00019966707438930003, + "loss": 0.2325, + "step": 712 + }, + { + "epoch": 0.1497427281318912, + "grad_norm": 0.7157438397407532, + "learning_rate": 0.00019966423654134592, + "loss": 0.2656, + "step": 713 + }, + { + "epoch": 0.1499527459834086, + "grad_norm": 0.6943267583847046, + "learning_rate": 0.00019966138667014, + "loss": 0.2625, + "step": 714 + }, + { + "epoch": 0.15016276383492597, + "grad_norm": 0.7070578336715698, + "learning_rate": 0.00019965852477602604, + "loss": 0.2795, + "step": 715 + }, + { + "epoch": 0.15037278168644336, + "grad_norm": 0.654684841632843, + "learning_rate": 0.00019965565085934935, + "loss": 0.2168, + "step": 716 + }, + { + "epoch": 0.15058279953796072, + "grad_norm": 0.5972804427146912, + "learning_rate": 0.00019965276492045662, + "loss": 0.2337, + "step": 717 + }, + { + "epoch": 0.1507928173894781, + "grad_norm": 0.4990095794200897, + "learning_rate": 0.000199649866959696, + "loss": 0.3187, + "step": 718 + }, + { + "epoch": 0.15100283524099548, + "grad_norm": 0.6247003078460693, + "learning_rate": 0.00019964695697741703, + "loss": 0.2139, + "step": 719 + }, + { + "epoch": 0.15121285309251287, + "grad_norm": 0.6358337998390198, + "learning_rate": 0.00019964403497397084, + "loss": 0.244, + "step": 720 + }, + { + "epoch": 0.15142287094403023, + "grad_norm": 0.5211917161941528, + "learning_rate": 0.0001996411009497099, + "loss": 0.1784, + "step": 721 + }, + { + "epoch": 0.15163288879554762, + "grad_norm": 0.464606374502182, + "learning_rate": 0.00019963815490498817, + "loss": 0.2137, + "step": 722 + }, + { + "epoch": 0.151842906647065, + "grad_norm": 0.7099301815032959, + "learning_rate": 0.00019963519684016107, + "loss": 0.2927, + "step": 723 + }, + { + "epoch": 0.15205292449858238, + "grad_norm": 0.7805564999580383, + "learning_rate": 0.00019963222675558543, + "loss": 0.2374, + "step": 724 + }, + { + "epoch": 0.15226294235009977, + "grad_norm": 0.6172361373901367, + "learning_rate": 0.00019962924465161957, + "loss": 0.201, + "step": 725 + }, + { + "epoch": 0.15247296020161713, + "grad_norm": 0.6261605620384216, + "learning_rate": 0.0001996262505286232, + "loss": 0.1709, + "step": 726 + }, + { + "epoch": 0.15268297805313452, + "grad_norm": 0.6561511158943176, + "learning_rate": 0.00019962324438695762, + "loss": 0.2283, + "step": 727 + }, + { + "epoch": 0.15289299590465188, + "grad_norm": 0.5386349558830261, + "learning_rate": 0.0001996202262269854, + "loss": 0.231, + "step": 728 + }, + { + "epoch": 0.15310301375616928, + "grad_norm": 0.644136369228363, + "learning_rate": 0.00019961719604907066, + "loss": 0.1875, + "step": 729 + }, + { + "epoch": 0.15331303160768664, + "grad_norm": 0.6452980041503906, + "learning_rate": 0.00019961415385357897, + "loss": 0.2294, + "step": 730 + }, + { + "epoch": 0.15352304945920403, + "grad_norm": 0.5558809041976929, + "learning_rate": 0.0001996110996408773, + "loss": 0.1988, + "step": 731 + }, + { + "epoch": 0.15373306731072142, + "grad_norm": 0.6049979329109192, + "learning_rate": 0.00019960803341133413, + "loss": 0.2368, + "step": 732 + }, + { + "epoch": 0.15394308516223879, + "grad_norm": 0.6450143456459045, + "learning_rate": 0.00019960495516531935, + "loss": 0.2217, + "step": 733 + }, + { + "epoch": 0.15415310301375618, + "grad_norm": 0.6582781672477722, + "learning_rate": 0.00019960186490320436, + "loss": 0.1942, + "step": 734 + }, + { + "epoch": 0.15436312086527354, + "grad_norm": 0.5160269141197205, + "learning_rate": 0.0001995987626253619, + "loss": 0.1723, + "step": 735 + }, + { + "epoch": 0.15457313871679093, + "grad_norm": 0.6058139801025391, + "learning_rate": 0.00019959564833216625, + "loss": 0.2089, + "step": 736 + }, + { + "epoch": 0.1547831565683083, + "grad_norm": 0.540282666683197, + "learning_rate": 0.0001995925220239931, + "loss": 0.2089, + "step": 737 + }, + { + "epoch": 0.15499317441982569, + "grad_norm": 0.7635892033576965, + "learning_rate": 0.0001995893837012196, + "loss": 0.2825, + "step": 738 + }, + { + "epoch": 0.15520319227134308, + "grad_norm": 0.5233755111694336, + "learning_rate": 0.00019958623336422434, + "loss": 0.1514, + "step": 739 + }, + { + "epoch": 0.15541321012286044, + "grad_norm": 0.44758716225624084, + "learning_rate": 0.00019958307101338742, + "loss": 0.132, + "step": 740 + }, + { + "epoch": 0.15562322797437783, + "grad_norm": 0.7145951390266418, + "learning_rate": 0.00019957989664909026, + "loss": 0.2395, + "step": 741 + }, + { + "epoch": 0.1558332458258952, + "grad_norm": 0.6241814494132996, + "learning_rate": 0.0001995767102717159, + "loss": 0.2255, + "step": 742 + }, + { + "epoch": 0.15604326367741259, + "grad_norm": 0.502863883972168, + "learning_rate": 0.00019957351188164865, + "loss": 0.1941, + "step": 743 + }, + { + "epoch": 0.15625328152892995, + "grad_norm": 0.5572714805603027, + "learning_rate": 0.00019957030147927442, + "loss": 0.1664, + "step": 744 + }, + { + "epoch": 0.15646329938044734, + "grad_norm": 1.0500191450119019, + "learning_rate": 0.00019956707906498044, + "loss": 0.3229, + "step": 745 + }, + { + "epoch": 0.1566733172319647, + "grad_norm": 0.595522403717041, + "learning_rate": 0.0001995638446391555, + "loss": 0.1932, + "step": 746 + }, + { + "epoch": 0.1568833350834821, + "grad_norm": 0.38818204402923584, + "learning_rate": 0.00019956059820218982, + "loss": 0.1324, + "step": 747 + }, + { + "epoch": 0.1570933529349995, + "grad_norm": 0.6705027222633362, + "learning_rate": 0.000199557339754475, + "loss": 0.194, + "step": 748 + }, + { + "epoch": 0.15730337078651685, + "grad_norm": 0.4935189485549927, + "learning_rate": 0.0001995540692964041, + "loss": 0.2492, + "step": 749 + }, + { + "epoch": 0.15751338863803424, + "grad_norm": 0.3950806260108948, + "learning_rate": 0.00019955078682837174, + "loss": 0.1331, + "step": 750 + }, + { + "epoch": 0.1577234064895516, + "grad_norm": 0.6625058650970459, + "learning_rate": 0.00019954749235077384, + "loss": 0.297, + "step": 751 + }, + { + "epoch": 0.157933424341069, + "grad_norm": 0.5862818956375122, + "learning_rate": 0.00019954418586400787, + "loss": 0.2628, + "step": 752 + }, + { + "epoch": 0.15814344219258636, + "grad_norm": 0.6951611042022705, + "learning_rate": 0.0001995408673684727, + "loss": 0.2573, + "step": 753 + }, + { + "epoch": 0.15835346004410375, + "grad_norm": 0.8030470013618469, + "learning_rate": 0.0001995375368645687, + "loss": 0.2671, + "step": 754 + }, + { + "epoch": 0.15856347789562114, + "grad_norm": 0.4509555995464325, + "learning_rate": 0.00019953419435269764, + "loss": 0.1808, + "step": 755 + }, + { + "epoch": 0.1587734957471385, + "grad_norm": 0.7687417268753052, + "learning_rate": 0.0001995308398332627, + "loss": 0.2906, + "step": 756 + }, + { + "epoch": 0.1589835135986559, + "grad_norm": 0.7642715573310852, + "learning_rate": 0.00019952747330666867, + "loss": 0.3541, + "step": 757 + }, + { + "epoch": 0.15919353145017326, + "grad_norm": 0.6699778437614441, + "learning_rate": 0.00019952409477332156, + "loss": 0.2494, + "step": 758 + }, + { + "epoch": 0.15940354930169065, + "grad_norm": 0.7119278907775879, + "learning_rate": 0.00019952070423362903, + "loss": 0.1994, + "step": 759 + }, + { + "epoch": 0.159613567153208, + "grad_norm": 0.6130563616752625, + "learning_rate": 0.00019951730168800004, + "loss": 0.3433, + "step": 760 + }, + { + "epoch": 0.1598235850047254, + "grad_norm": 0.692933201789856, + "learning_rate": 0.00019951388713684514, + "loss": 0.1762, + "step": 761 + }, + { + "epoch": 0.16003360285624277, + "grad_norm": 0.5561717748641968, + "learning_rate": 0.00019951046058057622, + "loss": 0.2266, + "step": 762 + }, + { + "epoch": 0.16024362070776016, + "grad_norm": 0.8559679388999939, + "learning_rate": 0.00019950702201960665, + "loss": 0.3145, + "step": 763 + }, + { + "epoch": 0.16045363855927755, + "grad_norm": 0.7173314094543457, + "learning_rate": 0.00019950357145435122, + "loss": 0.2079, + "step": 764 + }, + { + "epoch": 0.16066365641079491, + "grad_norm": 0.4696892201900482, + "learning_rate": 0.00019950010888522625, + "loss": 0.2374, + "step": 765 + }, + { + "epoch": 0.1608736742623123, + "grad_norm": 0.5349077582359314, + "learning_rate": 0.00019949663431264943, + "loss": 0.2221, + "step": 766 + }, + { + "epoch": 0.16108369211382967, + "grad_norm": 0.49449819326400757, + "learning_rate": 0.0001994931477370399, + "loss": 0.1432, + "step": 767 + }, + { + "epoch": 0.16129370996534706, + "grad_norm": 0.652260422706604, + "learning_rate": 0.00019948964915881835, + "loss": 0.2122, + "step": 768 + }, + { + "epoch": 0.16150372781686442, + "grad_norm": 0.6549475789070129, + "learning_rate": 0.00019948613857840672, + "loss": 0.3484, + "step": 769 + }, + { + "epoch": 0.16171374566838181, + "grad_norm": 0.6772179007530212, + "learning_rate": 0.00019948261599622865, + "loss": 0.2784, + "step": 770 + }, + { + "epoch": 0.16192376351989918, + "grad_norm": 0.788960337638855, + "learning_rate": 0.00019947908141270898, + "loss": 0.1939, + "step": 771 + }, + { + "epoch": 0.16213378137141657, + "grad_norm": 0.6915500164031982, + "learning_rate": 0.00019947553482827418, + "loss": 0.1541, + "step": 772 + }, + { + "epoch": 0.16234379922293396, + "grad_norm": 0.604015052318573, + "learning_rate": 0.0001994719762433521, + "loss": 0.2148, + "step": 773 + }, + { + "epoch": 0.16255381707445132, + "grad_norm": 0.8275285959243774, + "learning_rate": 0.00019946840565837203, + "loss": 0.2808, + "step": 774 + }, + { + "epoch": 0.16276383492596871, + "grad_norm": 0.6737775802612305, + "learning_rate": 0.00019946482307376472, + "loss": 0.1813, + "step": 775 + }, + { + "epoch": 0.16297385277748608, + "grad_norm": 0.8311626315116882, + "learning_rate": 0.0001994612284899623, + "loss": 0.2819, + "step": 776 + }, + { + "epoch": 0.16318387062900347, + "grad_norm": 0.7368951439857483, + "learning_rate": 0.00019945762190739852, + "loss": 0.2619, + "step": 777 + }, + { + "epoch": 0.16339388848052083, + "grad_norm": 0.6095349788665771, + "learning_rate": 0.0001994540033265084, + "loss": 0.2449, + "step": 778 + }, + { + "epoch": 0.16360390633203822, + "grad_norm": 0.6738486886024475, + "learning_rate": 0.0001994503727477285, + "loss": 0.1493, + "step": 779 + }, + { + "epoch": 0.16381392418355561, + "grad_norm": 0.5636208653450012, + "learning_rate": 0.0001994467301714968, + "loss": 0.1949, + "step": 780 + }, + { + "epoch": 0.16402394203507298, + "grad_norm": 0.9404299259185791, + "learning_rate": 0.00019944307559825272, + "loss": 0.2503, + "step": 781 + }, + { + "epoch": 0.16423395988659037, + "grad_norm": 0.6188719868659973, + "learning_rate": 0.0001994394090284372, + "loss": 0.1658, + "step": 782 + }, + { + "epoch": 0.16444397773810773, + "grad_norm": 0.9498090147972107, + "learning_rate": 0.00019943573046249244, + "loss": 0.3425, + "step": 783 + }, + { + "epoch": 0.16465399558962512, + "grad_norm": 0.6508981585502625, + "learning_rate": 0.00019943203990086233, + "loss": 0.1384, + "step": 784 + }, + { + "epoch": 0.1648640134411425, + "grad_norm": 1.0658531188964844, + "learning_rate": 0.00019942833734399202, + "loss": 0.2609, + "step": 785 + }, + { + "epoch": 0.16507403129265988, + "grad_norm": 0.7281699776649475, + "learning_rate": 0.00019942462279232825, + "loss": 0.1985, + "step": 786 + }, + { + "epoch": 0.16528404914417724, + "grad_norm": 0.7734364867210388, + "learning_rate": 0.00019942089624631906, + "loss": 0.2617, + "step": 787 + }, + { + "epoch": 0.16549406699569463, + "grad_norm": 0.977069616317749, + "learning_rate": 0.00019941715770641408, + "loss": 0.2928, + "step": 788 + }, + { + "epoch": 0.16570408484721202, + "grad_norm": 0.7139049768447876, + "learning_rate": 0.00019941340717306424, + "loss": 0.3369, + "step": 789 + }, + { + "epoch": 0.1659141026987294, + "grad_norm": 0.5771147012710571, + "learning_rate": 0.00019940964464672205, + "loss": 0.2304, + "step": 790 + }, + { + "epoch": 0.16612412055024678, + "grad_norm": 0.5506160855293274, + "learning_rate": 0.00019940587012784138, + "loss": 0.2084, + "step": 791 + }, + { + "epoch": 0.16633413840176414, + "grad_norm": 0.48316794633865356, + "learning_rate": 0.0001994020836168776, + "loss": 0.1835, + "step": 792 + }, + { + "epoch": 0.16654415625328153, + "grad_norm": 0.5649861693382263, + "learning_rate": 0.00019939828511428753, + "loss": 0.2426, + "step": 793 + }, + { + "epoch": 0.1667541741047989, + "grad_norm": 0.5224729180335999, + "learning_rate": 0.00019939447462052936, + "loss": 0.1862, + "step": 794 + }, + { + "epoch": 0.1669641919563163, + "grad_norm": 0.5801841616630554, + "learning_rate": 0.00019939065213606282, + "loss": 0.2081, + "step": 795 + }, + { + "epoch": 0.16717420980783368, + "grad_norm": 0.4274038076400757, + "learning_rate": 0.00019938681766134902, + "loss": 0.14, + "step": 796 + }, + { + "epoch": 0.16738422765935104, + "grad_norm": 0.5294644236564636, + "learning_rate": 0.00019938297119685054, + "loss": 0.1851, + "step": 797 + }, + { + "epoch": 0.16759424551086843, + "grad_norm": 0.5110440850257874, + "learning_rate": 0.00019937911274303145, + "loss": 0.2346, + "step": 798 + }, + { + "epoch": 0.1678042633623858, + "grad_norm": 0.5785256028175354, + "learning_rate": 0.00019937524230035717, + "loss": 0.1554, + "step": 799 + }, + { + "epoch": 0.1680142812139032, + "grad_norm": 0.586320161819458, + "learning_rate": 0.00019937135986929465, + "loss": 0.2672, + "step": 800 + }, + { + "epoch": 0.16822429906542055, + "grad_norm": 0.502890408039093, + "learning_rate": 0.00019936746545031223, + "loss": 0.3023, + "step": 801 + }, + { + "epoch": 0.16843431691693794, + "grad_norm": 0.5421012043952942, + "learning_rate": 0.00019936355904387977, + "loss": 0.2331, + "step": 802 + }, + { + "epoch": 0.1686443347684553, + "grad_norm": 0.5681023001670837, + "learning_rate": 0.0001993596406504685, + "loss": 0.2064, + "step": 803 + }, + { + "epoch": 0.1688543526199727, + "grad_norm": 0.4179142713546753, + "learning_rate": 0.00019935571027055113, + "loss": 0.2302, + "step": 804 + }, + { + "epoch": 0.1690643704714901, + "grad_norm": 0.7016621232032776, + "learning_rate": 0.00019935176790460179, + "loss": 0.2442, + "step": 805 + }, + { + "epoch": 0.16927438832300745, + "grad_norm": 0.5401879549026489, + "learning_rate": 0.00019934781355309612, + "loss": 0.2798, + "step": 806 + }, + { + "epoch": 0.16948440617452484, + "grad_norm": 0.5687265396118164, + "learning_rate": 0.00019934384721651113, + "loss": 0.2097, + "step": 807 + }, + { + "epoch": 0.1696944240260422, + "grad_norm": 0.659520149230957, + "learning_rate": 0.00019933986889532533, + "loss": 0.1938, + "step": 808 + }, + { + "epoch": 0.1699044418775596, + "grad_norm": 0.8230718970298767, + "learning_rate": 0.00019933587859001866, + "loss": 0.4148, + "step": 809 + }, + { + "epoch": 0.17011445972907696, + "grad_norm": 0.7954551577568054, + "learning_rate": 0.00019933187630107244, + "loss": 0.4564, + "step": 810 + }, + { + "epoch": 0.17032447758059435, + "grad_norm": 0.6618001461029053, + "learning_rate": 0.0001993278620289696, + "loss": 0.2819, + "step": 811 + }, + { + "epoch": 0.17053449543211172, + "grad_norm": 0.9731025099754333, + "learning_rate": 0.00019932383577419432, + "loss": 0.3954, + "step": 812 + }, + { + "epoch": 0.1707445132836291, + "grad_norm": 0.7344256639480591, + "learning_rate": 0.00019931979753723232, + "loss": 0.2502, + "step": 813 + }, + { + "epoch": 0.1709545311351465, + "grad_norm": 0.6986575722694397, + "learning_rate": 0.00019931574731857086, + "loss": 0.2499, + "step": 814 + }, + { + "epoch": 0.17116454898666386, + "grad_norm": 0.5757253170013428, + "learning_rate": 0.00019931168511869846, + "loss": 0.2445, + "step": 815 + }, + { + "epoch": 0.17137456683818125, + "grad_norm": 0.5453664064407349, + "learning_rate": 0.0001993076109381052, + "loss": 0.2494, + "step": 816 + }, + { + "epoch": 0.17158458468969862, + "grad_norm": 0.7031118869781494, + "learning_rate": 0.00019930352477728257, + "loss": 0.2777, + "step": 817 + }, + { + "epoch": 0.171794602541216, + "grad_norm": 0.6201139092445374, + "learning_rate": 0.0001992994266367235, + "loss": 0.2145, + "step": 818 + }, + { + "epoch": 0.17200462039273337, + "grad_norm": 0.6421683430671692, + "learning_rate": 0.00019929531651692245, + "loss": 0.1951, + "step": 819 + }, + { + "epoch": 0.17221463824425076, + "grad_norm": 0.6390677094459534, + "learning_rate": 0.00019929119441837518, + "loss": 0.2011, + "step": 820 + }, + { + "epoch": 0.17242465609576815, + "grad_norm": 0.5171882510185242, + "learning_rate": 0.00019928706034157901, + "loss": 0.1459, + "step": 821 + }, + { + "epoch": 0.17263467394728552, + "grad_norm": 0.6737155914306641, + "learning_rate": 0.00019928291428703262, + "loss": 0.1507, + "step": 822 + }, + { + "epoch": 0.1728446917988029, + "grad_norm": 0.526128351688385, + "learning_rate": 0.00019927875625523625, + "loss": 0.1565, + "step": 823 + }, + { + "epoch": 0.17305470965032027, + "grad_norm": 0.7430817484855652, + "learning_rate": 0.00019927458624669145, + "loss": 0.2575, + "step": 824 + }, + { + "epoch": 0.17326472750183766, + "grad_norm": 0.4702281355857849, + "learning_rate": 0.0001992704042619013, + "loss": 0.1796, + "step": 825 + }, + { + "epoch": 0.17347474535335503, + "grad_norm": 0.5295049548149109, + "learning_rate": 0.00019926621030137034, + "loss": 0.1974, + "step": 826 + }, + { + "epoch": 0.17368476320487242, + "grad_norm": 0.667036771774292, + "learning_rate": 0.00019926200436560447, + "loss": 0.2125, + "step": 827 + }, + { + "epoch": 0.17389478105638978, + "grad_norm": 0.7351561188697815, + "learning_rate": 0.0001992577864551111, + "loss": 0.2271, + "step": 828 + }, + { + "epoch": 0.17410479890790717, + "grad_norm": 0.8084509372711182, + "learning_rate": 0.0001992535565703991, + "loss": 0.2301, + "step": 829 + }, + { + "epoch": 0.17431481675942456, + "grad_norm": 0.7022576928138733, + "learning_rate": 0.0001992493147119787, + "loss": 0.2662, + "step": 830 + }, + { + "epoch": 0.17452483461094193, + "grad_norm": 0.7098193168640137, + "learning_rate": 0.00019924506088036165, + "loss": 0.1979, + "step": 831 + }, + { + "epoch": 0.17473485246245932, + "grad_norm": 0.590630292892456, + "learning_rate": 0.00019924079507606114, + "loss": 0.1872, + "step": 832 + }, + { + "epoch": 0.17494487031397668, + "grad_norm": 0.7556937336921692, + "learning_rate": 0.00019923651729959177, + "loss": 0.1651, + "step": 833 + }, + { + "epoch": 0.17515488816549407, + "grad_norm": 0.6680110096931458, + "learning_rate": 0.00019923222755146956, + "loss": 0.1837, + "step": 834 + }, + { + "epoch": 0.17536490601701143, + "grad_norm": 0.7310810685157776, + "learning_rate": 0.0001992279258322121, + "loss": 0.3201, + "step": 835 + }, + { + "epoch": 0.17557492386852883, + "grad_norm": 0.5796787142753601, + "learning_rate": 0.0001992236121423383, + "loss": 0.178, + "step": 836 + }, + { + "epoch": 0.17578494172004622, + "grad_norm": 0.45521265268325806, + "learning_rate": 0.00019921928648236853, + "loss": 0.1723, + "step": 837 + }, + { + "epoch": 0.17599495957156358, + "grad_norm": 0.43274396657943726, + "learning_rate": 0.00019921494885282467, + "loss": 0.1597, + "step": 838 + }, + { + "epoch": 0.17620497742308097, + "grad_norm": 0.40754616260528564, + "learning_rate": 0.00019921059925422996, + "loss": 0.1299, + "step": 839 + }, + { + "epoch": 0.17641499527459834, + "grad_norm": 0.6628978252410889, + "learning_rate": 0.00019920623768710912, + "loss": 0.1931, + "step": 840 + }, + { + "epoch": 0.17662501312611573, + "grad_norm": 0.644637405872345, + "learning_rate": 0.0001992018641519884, + "loss": 0.199, + "step": 841 + }, + { + "epoch": 0.1768350309776331, + "grad_norm": 0.5001009106636047, + "learning_rate": 0.0001991974786493953, + "loss": 0.2109, + "step": 842 + }, + { + "epoch": 0.17704504882915048, + "grad_norm": 0.49435755610466003, + "learning_rate": 0.00019919308117985894, + "loss": 0.1832, + "step": 843 + }, + { + "epoch": 0.17725506668066784, + "grad_norm": 0.7176212668418884, + "learning_rate": 0.0001991886717439098, + "loss": 0.2491, + "step": 844 + }, + { + "epoch": 0.17746508453218524, + "grad_norm": 0.5122328996658325, + "learning_rate": 0.00019918425034207984, + "loss": 0.2618, + "step": 845 + }, + { + "epoch": 0.17767510238370263, + "grad_norm": 0.6069608926773071, + "learning_rate": 0.00019917981697490245, + "loss": 0.2119, + "step": 846 + }, + { + "epoch": 0.17788512023522, + "grad_norm": 0.8389537334442139, + "learning_rate": 0.00019917537164291244, + "loss": 0.2619, + "step": 847 + }, + { + "epoch": 0.17809513808673738, + "grad_norm": 0.5856572389602661, + "learning_rate": 0.00019917091434664612, + "loss": 0.1928, + "step": 848 + }, + { + "epoch": 0.17830515593825474, + "grad_norm": 0.5682632327079773, + "learning_rate": 0.00019916644508664115, + "loss": 0.2963, + "step": 849 + }, + { + "epoch": 0.17851517378977214, + "grad_norm": 0.45547807216644287, + "learning_rate": 0.00019916196386343674, + "loss": 0.1277, + "step": 850 + }, + { + "epoch": 0.1787251916412895, + "grad_norm": 0.648499071598053, + "learning_rate": 0.00019915747067757349, + "loss": 0.3407, + "step": 851 + }, + { + "epoch": 0.1789352094928069, + "grad_norm": 0.48874902725219727, + "learning_rate": 0.0001991529655295934, + "loss": 0.185, + "step": 852 + }, + { + "epoch": 0.17914522734432425, + "grad_norm": 0.7059923410415649, + "learning_rate": 0.00019914844842004002, + "loss": 0.2352, + "step": 853 + }, + { + "epoch": 0.17935524519584164, + "grad_norm": 0.6532195210456848, + "learning_rate": 0.00019914391934945823, + "loss": 0.292, + "step": 854 + }, + { + "epoch": 0.17956526304735904, + "grad_norm": 0.6922776103019714, + "learning_rate": 0.0001991393783183945, + "loss": 0.4635, + "step": 855 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 0.6560776829719543, + "learning_rate": 0.00019913482532739652, + "loss": 0.2684, + "step": 856 + }, + { + "epoch": 0.1799852987503938, + "grad_norm": 0.5644369125366211, + "learning_rate": 0.00019913026037701362, + "loss": 0.2018, + "step": 857 + }, + { + "epoch": 0.18019531660191115, + "grad_norm": 0.6108200550079346, + "learning_rate": 0.00019912568346779652, + "loss": 0.1746, + "step": 858 + }, + { + "epoch": 0.18040533445342855, + "grad_norm": 0.6762723326683044, + "learning_rate": 0.00019912109460029734, + "loss": 0.4662, + "step": 859 + }, + { + "epoch": 0.1806153523049459, + "grad_norm": 0.5877822041511536, + "learning_rate": 0.00019911649377506966, + "loss": 0.2546, + "step": 860 + }, + { + "epoch": 0.1808253701564633, + "grad_norm": 0.5038641095161438, + "learning_rate": 0.00019911188099266855, + "loss": 0.3073, + "step": 861 + }, + { + "epoch": 0.1810353880079807, + "grad_norm": 0.6587141156196594, + "learning_rate": 0.00019910725625365045, + "loss": 0.2991, + "step": 862 + }, + { + "epoch": 0.18124540585949805, + "grad_norm": 0.9041693210601807, + "learning_rate": 0.0001991026195585733, + "loss": 0.3111, + "step": 863 + }, + { + "epoch": 0.18145542371101545, + "grad_norm": 0.6296244263648987, + "learning_rate": 0.00019909797090799644, + "loss": 0.2272, + "step": 864 + }, + { + "epoch": 0.1816654415625328, + "grad_norm": 0.6931461691856384, + "learning_rate": 0.00019909331030248072, + "loss": 0.3503, + "step": 865 + }, + { + "epoch": 0.1818754594140502, + "grad_norm": 0.7656722664833069, + "learning_rate": 0.00019908863774258827, + "loss": 0.3773, + "step": 866 + }, + { + "epoch": 0.18208547726556756, + "grad_norm": 0.6011465787887573, + "learning_rate": 0.00019908395322888294, + "loss": 0.2101, + "step": 867 + }, + { + "epoch": 0.18229549511708495, + "grad_norm": 0.6926429867744446, + "learning_rate": 0.0001990792567619297, + "loss": 0.2027, + "step": 868 + }, + { + "epoch": 0.18250551296860232, + "grad_norm": 0.5799981355667114, + "learning_rate": 0.00019907454834229525, + "loss": 0.2129, + "step": 869 + }, + { + "epoch": 0.1827155308201197, + "grad_norm": 0.5605289936065674, + "learning_rate": 0.0001990698279705475, + "loss": 0.2104, + "step": 870 + }, + { + "epoch": 0.1829255486716371, + "grad_norm": 0.9048646092414856, + "learning_rate": 0.00019906509564725596, + "loss": 0.4131, + "step": 871 + }, + { + "epoch": 0.18313556652315446, + "grad_norm": 0.6802535057067871, + "learning_rate": 0.0001990603513729915, + "loss": 0.2715, + "step": 872 + }, + { + "epoch": 0.18334558437467185, + "grad_norm": 0.4949076771736145, + "learning_rate": 0.0001990555951483265, + "loss": 0.1725, + "step": 873 + }, + { + "epoch": 0.18355560222618922, + "grad_norm": 0.6589632034301758, + "learning_rate": 0.0001990508269738347, + "loss": 0.1424, + "step": 874 + }, + { + "epoch": 0.1837656200777066, + "grad_norm": 0.5366025567054749, + "learning_rate": 0.00019904604685009133, + "loss": 0.1578, + "step": 875 + }, + { + "epoch": 0.18397563792922397, + "grad_norm": 0.584173858165741, + "learning_rate": 0.00019904125477767303, + "loss": 0.2381, + "step": 876 + }, + { + "epoch": 0.18418565578074136, + "grad_norm": 0.6884530186653137, + "learning_rate": 0.00019903645075715798, + "loss": 0.2043, + "step": 877 + }, + { + "epoch": 0.18439567363225876, + "grad_norm": 0.6070178747177124, + "learning_rate": 0.00019903163478912563, + "loss": 0.1792, + "step": 878 + }, + { + "epoch": 0.18460569148377612, + "grad_norm": 0.6375721096992493, + "learning_rate": 0.00019902680687415705, + "loss": 0.218, + "step": 879 + }, + { + "epoch": 0.1848157093352935, + "grad_norm": 0.564017653465271, + "learning_rate": 0.0001990219670128346, + "loss": 0.1885, + "step": 880 + }, + { + "epoch": 0.18502572718681087, + "grad_norm": 0.6779912710189819, + "learning_rate": 0.0001990171152057422, + "loss": 0.1949, + "step": 881 + }, + { + "epoch": 0.18523574503832826, + "grad_norm": 0.6086128950119019, + "learning_rate": 0.0001990122514534651, + "loss": 0.1818, + "step": 882 + }, + { + "epoch": 0.18544576288984563, + "grad_norm": 0.4768702983856201, + "learning_rate": 0.0001990073757565901, + "loss": 0.1459, + "step": 883 + }, + { + "epoch": 0.18565578074136302, + "grad_norm": 0.5171164870262146, + "learning_rate": 0.0001990024881157054, + "loss": 0.1624, + "step": 884 + }, + { + "epoch": 0.18586579859288038, + "grad_norm": 0.6542419195175171, + "learning_rate": 0.00019899758853140064, + "loss": 0.2035, + "step": 885 + }, + { + "epoch": 0.18607581644439777, + "grad_norm": 0.7479321956634521, + "learning_rate": 0.0001989926770042668, + "loss": 0.3654, + "step": 886 + }, + { + "epoch": 0.18628583429591516, + "grad_norm": 0.7446826696395874, + "learning_rate": 0.0001989877535348965, + "loss": 0.236, + "step": 887 + }, + { + "epoch": 0.18649585214743253, + "grad_norm": 0.5898016691207886, + "learning_rate": 0.00019898281812388366, + "loss": 0.2013, + "step": 888 + }, + { + "epoch": 0.18670586999894992, + "grad_norm": 0.6942265629768372, + "learning_rate": 0.00019897787077182368, + "loss": 0.1912, + "step": 889 + }, + { + "epoch": 0.18691588785046728, + "grad_norm": 0.7095215320587158, + "learning_rate": 0.0001989729114793134, + "loss": 0.2031, + "step": 890 + }, + { + "epoch": 0.18712590570198467, + "grad_norm": 0.49590814113616943, + "learning_rate": 0.00019896794024695108, + "loss": 0.1848, + "step": 891 + }, + { + "epoch": 0.18733592355350204, + "grad_norm": 0.3615363836288452, + "learning_rate": 0.00019896295707533642, + "loss": 0.1357, + "step": 892 + }, + { + "epoch": 0.18754594140501943, + "grad_norm": 0.540952205657959, + "learning_rate": 0.00019895796196507063, + "loss": 0.1622, + "step": 893 + }, + { + "epoch": 0.1877559592565368, + "grad_norm": 0.6152564883232117, + "learning_rate": 0.00019895295491675628, + "loss": 0.2229, + "step": 894 + }, + { + "epoch": 0.18796597710805418, + "grad_norm": 0.6287555694580078, + "learning_rate": 0.0001989479359309974, + "loss": 0.1855, + "step": 895 + }, + { + "epoch": 0.18817599495957157, + "grad_norm": 0.6615211963653564, + "learning_rate": 0.00019894290500839946, + "loss": 0.2001, + "step": 896 + }, + { + "epoch": 0.18838601281108894, + "grad_norm": 0.6587905883789062, + "learning_rate": 0.00019893786214956945, + "loss": 0.2368, + "step": 897 + }, + { + "epoch": 0.18859603066260633, + "grad_norm": 0.3502175807952881, + "learning_rate": 0.00019893280735511565, + "loss": 0.1203, + "step": 898 + }, + { + "epoch": 0.1888060485141237, + "grad_norm": 0.6989165544509888, + "learning_rate": 0.00019892774062564786, + "loss": 0.2108, + "step": 899 + }, + { + "epoch": 0.18901606636564108, + "grad_norm": 0.5993213057518005, + "learning_rate": 0.00019892266196177736, + "loss": 0.2667, + "step": 900 + }, + { + "epoch": 0.18922608421715845, + "grad_norm": 0.6625016331672668, + "learning_rate": 0.0001989175713641168, + "loss": 0.3081, + "step": 901 + }, + { + "epoch": 0.18943610206867584, + "grad_norm": 0.6831103563308716, + "learning_rate": 0.0001989124688332803, + "loss": 0.2826, + "step": 902 + }, + { + "epoch": 0.18964611992019323, + "grad_norm": 0.6341603994369507, + "learning_rate": 0.00019890735436988347, + "loss": 0.2738, + "step": 903 + }, + { + "epoch": 0.1898561377717106, + "grad_norm": 0.6546643376350403, + "learning_rate": 0.0001989022279745432, + "loss": 0.3065, + "step": 904 + }, + { + "epoch": 0.19006615562322798, + "grad_norm": 0.7356497645378113, + "learning_rate": 0.000198897089647878, + "loss": 0.2955, + "step": 905 + }, + { + "epoch": 0.19027617347474535, + "grad_norm": 0.71455317735672, + "learning_rate": 0.00019889193939050777, + "loss": 0.2069, + "step": 906 + }, + { + "epoch": 0.19048619132626274, + "grad_norm": 0.7142229676246643, + "learning_rate": 0.00019888677720305374, + "loss": 0.3386, + "step": 907 + }, + { + "epoch": 0.1906962091777801, + "grad_norm": 0.6420140862464905, + "learning_rate": 0.00019888160308613874, + "loss": 0.2952, + "step": 908 + }, + { + "epoch": 0.1909062270292975, + "grad_norm": 0.757895290851593, + "learning_rate": 0.00019887641704038688, + "loss": 0.299, + "step": 909 + }, + { + "epoch": 0.19111624488081486, + "grad_norm": 0.5329816937446594, + "learning_rate": 0.00019887121906642387, + "loss": 0.2005, + "step": 910 + }, + { + "epoch": 0.19132626273233225, + "grad_norm": 0.5069072842597961, + "learning_rate": 0.00019886600916487677, + "loss": 0.1971, + "step": 911 + }, + { + "epoch": 0.19153628058384964, + "grad_norm": 0.7712031602859497, + "learning_rate": 0.00019886078733637408, + "loss": 0.2952, + "step": 912 + }, + { + "epoch": 0.191746298435367, + "grad_norm": 0.6340819001197815, + "learning_rate": 0.00019885555358154574, + "loss": 0.2403, + "step": 913 + }, + { + "epoch": 0.1919563162868844, + "grad_norm": 0.707127034664154, + "learning_rate": 0.0001988503079010231, + "loss": 0.262, + "step": 914 + }, + { + "epoch": 0.19216633413840176, + "grad_norm": 0.5502609014511108, + "learning_rate": 0.00019884505029543908, + "loss": 0.1767, + "step": 915 + }, + { + "epoch": 0.19237635198991915, + "grad_norm": 0.6637031435966492, + "learning_rate": 0.00019883978076542787, + "loss": 0.317, + "step": 916 + }, + { + "epoch": 0.1925863698414365, + "grad_norm": 0.5921664237976074, + "learning_rate": 0.00019883449931162517, + "loss": 0.2848, + "step": 917 + }, + { + "epoch": 0.1927963876929539, + "grad_norm": 0.8460182547569275, + "learning_rate": 0.0001988292059346682, + "loss": 0.2741, + "step": 918 + }, + { + "epoch": 0.1930064055444713, + "grad_norm": 0.7577118277549744, + "learning_rate": 0.00019882390063519543, + "loss": 0.2589, + "step": 919 + }, + { + "epoch": 0.19321642339598866, + "grad_norm": 0.5957863330841064, + "learning_rate": 0.00019881858341384696, + "loss": 0.1834, + "step": 920 + }, + { + "epoch": 0.19342644124750605, + "grad_norm": 0.6584639549255371, + "learning_rate": 0.00019881325427126422, + "loss": 0.232, + "step": 921 + }, + { + "epoch": 0.1936364590990234, + "grad_norm": 0.6941714882850647, + "learning_rate": 0.0001988079132080901, + "loss": 0.2514, + "step": 922 + }, + { + "epoch": 0.1938464769505408, + "grad_norm": 0.829231321811676, + "learning_rate": 0.00019880256022496897, + "loss": 0.2023, + "step": 923 + }, + { + "epoch": 0.19405649480205817, + "grad_norm": 0.6720934510231018, + "learning_rate": 0.00019879719532254654, + "loss": 0.2535, + "step": 924 + }, + { + "epoch": 0.19426651265357556, + "grad_norm": 0.7159935832023621, + "learning_rate": 0.00019879181850147005, + "loss": 0.3129, + "step": 925 + }, + { + "epoch": 0.19447653050509292, + "grad_norm": 0.6411039233207703, + "learning_rate": 0.00019878642976238817, + "loss": 0.1729, + "step": 926 + }, + { + "epoch": 0.1946865483566103, + "grad_norm": 0.7253606915473938, + "learning_rate": 0.00019878102910595095, + "loss": 0.2599, + "step": 927 + }, + { + "epoch": 0.1948965662081277, + "grad_norm": 0.6732550859451294, + "learning_rate": 0.0001987756165328099, + "loss": 0.1881, + "step": 928 + }, + { + "epoch": 0.19510658405964507, + "grad_norm": 0.6675817966461182, + "learning_rate": 0.00019877019204361804, + "loss": 0.2417, + "step": 929 + }, + { + "epoch": 0.19531660191116246, + "grad_norm": 0.5525332093238831, + "learning_rate": 0.0001987647556390297, + "loss": 0.2445, + "step": 930 + }, + { + "epoch": 0.19552661976267982, + "grad_norm": 0.7800937294960022, + "learning_rate": 0.00019875930731970076, + "loss": 0.2401, + "step": 931 + }, + { + "epoch": 0.1957366376141972, + "grad_norm": 0.5669112205505371, + "learning_rate": 0.00019875384708628848, + "loss": 0.1925, + "step": 932 + }, + { + "epoch": 0.19594665546571458, + "grad_norm": 0.6367275714874268, + "learning_rate": 0.00019874837493945156, + "loss": 0.205, + "step": 933 + }, + { + "epoch": 0.19615667331723197, + "grad_norm": 0.6173298954963684, + "learning_rate": 0.00019874289087985013, + "loss": 0.2426, + "step": 934 + }, + { + "epoch": 0.19636669116874933, + "grad_norm": 0.7045214176177979, + "learning_rate": 0.00019873739490814583, + "loss": 0.1647, + "step": 935 + }, + { + "epoch": 0.19657670902026672, + "grad_norm": 0.5824179649353027, + "learning_rate": 0.00019873188702500163, + "loss": 0.1527, + "step": 936 + }, + { + "epoch": 0.1967867268717841, + "grad_norm": 0.585749626159668, + "learning_rate": 0.000198726367231082, + "loss": 0.2119, + "step": 937 + }, + { + "epoch": 0.19699674472330148, + "grad_norm": 0.679140031337738, + "learning_rate": 0.00019872083552705284, + "loss": 0.2037, + "step": 938 + }, + { + "epoch": 0.19720676257481887, + "grad_norm": 0.3865984380245209, + "learning_rate": 0.0001987152919135815, + "loss": 0.1508, + "step": 939 + }, + { + "epoch": 0.19741678042633623, + "grad_norm": 0.5994648933410645, + "learning_rate": 0.0001987097363913367, + "loss": 0.1536, + "step": 940 + }, + { + "epoch": 0.19762679827785362, + "grad_norm": 0.8374373912811279, + "learning_rate": 0.0001987041689609887, + "loss": 0.3113, + "step": 941 + }, + { + "epoch": 0.19783681612937098, + "grad_norm": 0.4448517858982086, + "learning_rate": 0.0001986985896232091, + "loss": 0.1523, + "step": 942 + }, + { + "epoch": 0.19804683398088838, + "grad_norm": 0.5031003952026367, + "learning_rate": 0.00019869299837867098, + "loss": 0.2351, + "step": 943 + }, + { + "epoch": 0.19825685183240577, + "grad_norm": 0.8319448232650757, + "learning_rate": 0.0001986873952280489, + "loss": 0.2768, + "step": 944 + }, + { + "epoch": 0.19846686968392313, + "grad_norm": 0.4768364429473877, + "learning_rate": 0.00019868178017201874, + "loss": 0.2041, + "step": 945 + }, + { + "epoch": 0.19867688753544052, + "grad_norm": 0.36797624826431274, + "learning_rate": 0.00019867615321125795, + "loss": 0.1703, + "step": 946 + }, + { + "epoch": 0.19888690538695789, + "grad_norm": 0.629489541053772, + "learning_rate": 0.0001986705143464453, + "loss": 0.1989, + "step": 947 + }, + { + "epoch": 0.19909692323847528, + "grad_norm": 0.757764458656311, + "learning_rate": 0.00019866486357826107, + "loss": 0.1972, + "step": 948 + }, + { + "epoch": 0.19930694108999264, + "grad_norm": 0.884556770324707, + "learning_rate": 0.00019865920090738698, + "loss": 0.2592, + "step": 949 + }, + { + "epoch": 0.19951695894151003, + "grad_norm": 0.5489534139633179, + "learning_rate": 0.00019865352633450614, + "loss": 0.2075, + "step": 950 + }, + { + "epoch": 0.1997269767930274, + "grad_norm": 0.6485860347747803, + "learning_rate": 0.00019864783986030314, + "loss": 0.3648, + "step": 951 + }, + { + "epoch": 0.19993699464454479, + "grad_norm": 0.8612170219421387, + "learning_rate": 0.00019864214148546393, + "loss": 0.2175, + "step": 952 + }, + { + "epoch": 0.20014701249606218, + "grad_norm": 0.6336376070976257, + "learning_rate": 0.00019863643121067597, + "loss": 0.2935, + "step": 953 + }, + { + "epoch": 0.20035703034757954, + "grad_norm": 0.7330135703086853, + "learning_rate": 0.00019863070903662817, + "loss": 0.4322, + "step": 954 + }, + { + "epoch": 0.20056704819909693, + "grad_norm": 0.6464625000953674, + "learning_rate": 0.0001986249749640108, + "loss": 0.242, + "step": 955 + }, + { + "epoch": 0.2007770660506143, + "grad_norm": 0.6884174942970276, + "learning_rate": 0.00019861922899351561, + "loss": 0.3043, + "step": 956 + }, + { + "epoch": 0.20098708390213169, + "grad_norm": 0.4948609471321106, + "learning_rate": 0.0001986134711258358, + "loss": 0.1735, + "step": 957 + }, + { + "epoch": 0.20119710175364905, + "grad_norm": 0.9207262396812439, + "learning_rate": 0.00019860770136166596, + "loss": 0.2473, + "step": 958 + }, + { + "epoch": 0.20140711960516644, + "grad_norm": 0.6444927453994751, + "learning_rate": 0.00019860191970170216, + "loss": 0.2995, + "step": 959 + }, + { + "epoch": 0.20161713745668383, + "grad_norm": 0.8041002750396729, + "learning_rate": 0.00019859612614664185, + "loss": 0.3079, + "step": 960 + }, + { + "epoch": 0.2018271553082012, + "grad_norm": 0.520293653011322, + "learning_rate": 0.000198590320697184, + "loss": 0.2038, + "step": 961 + }, + { + "epoch": 0.20203717315971859, + "grad_norm": 0.6968462467193604, + "learning_rate": 0.00019858450335402897, + "loss": 0.2791, + "step": 962 + }, + { + "epoch": 0.20224719101123595, + "grad_norm": 0.5260444283485413, + "learning_rate": 0.00019857867411787847, + "loss": 0.2164, + "step": 963 + }, + { + "epoch": 0.20245720886275334, + "grad_norm": 0.7742235660552979, + "learning_rate": 0.0001985728329894358, + "loss": 0.3005, + "step": 964 + }, + { + "epoch": 0.2026672267142707, + "grad_norm": 0.4388875961303711, + "learning_rate": 0.0001985669799694056, + "loss": 0.2046, + "step": 965 + }, + { + "epoch": 0.2028772445657881, + "grad_norm": 0.8159006237983704, + "learning_rate": 0.00019856111505849394, + "loss": 0.2219, + "step": 966 + }, + { + "epoch": 0.20308726241730546, + "grad_norm": 0.5616422295570374, + "learning_rate": 0.0001985552382574084, + "loss": 0.3792, + "step": 967 + }, + { + "epoch": 0.20329728026882285, + "grad_norm": 0.5863935351371765, + "learning_rate": 0.00019854934956685792, + "loss": 0.2077, + "step": 968 + }, + { + "epoch": 0.20350729812034024, + "grad_norm": 0.5828328728675842, + "learning_rate": 0.00019854344898755287, + "loss": 0.273, + "step": 969 + }, + { + "epoch": 0.2037173159718576, + "grad_norm": 0.5963171124458313, + "learning_rate": 0.00019853753652020507, + "loss": 0.2407, + "step": 970 + }, + { + "epoch": 0.203927333823375, + "grad_norm": 0.5114577412605286, + "learning_rate": 0.00019853161216552788, + "loss": 0.1663, + "step": 971 + }, + { + "epoch": 0.20413735167489236, + "grad_norm": 0.5106688737869263, + "learning_rate": 0.0001985256759242359, + "loss": 0.1823, + "step": 972 + }, + { + "epoch": 0.20434736952640975, + "grad_norm": 0.5732094645500183, + "learning_rate": 0.00019851972779704534, + "loss": 0.2206, + "step": 973 + }, + { + "epoch": 0.2045573873779271, + "grad_norm": 0.5627723932266235, + "learning_rate": 0.00019851376778467366, + "loss": 0.1715, + "step": 974 + }, + { + "epoch": 0.2047674052294445, + "grad_norm": 0.7939655184745789, + "learning_rate": 0.00019850779588783998, + "loss": 0.1669, + "step": 975 + }, + { + "epoch": 0.20497742308096187, + "grad_norm": 0.5675683617591858, + "learning_rate": 0.00019850181210726467, + "loss": 0.1696, + "step": 976 + }, + { + "epoch": 0.20518744093247926, + "grad_norm": 0.9706809520721436, + "learning_rate": 0.00019849581644366965, + "loss": 0.376, + "step": 977 + }, + { + "epoch": 0.20539745878399665, + "grad_norm": 0.6787039041519165, + "learning_rate": 0.00019848980889777815, + "loss": 0.1528, + "step": 978 + }, + { + "epoch": 0.205607476635514, + "grad_norm": 0.8098447918891907, + "learning_rate": 0.00019848378947031492, + "loss": 0.1659, + "step": 979 + }, + { + "epoch": 0.2058174944870314, + "grad_norm": 0.46255457401275635, + "learning_rate": 0.0001984777581620062, + "loss": 0.1445, + "step": 980 + }, + { + "epoch": 0.20602751233854877, + "grad_norm": 0.7909157872200012, + "learning_rate": 0.00019847171497357953, + "loss": 0.2294, + "step": 981 + }, + { + "epoch": 0.20623753019006616, + "grad_norm": 0.6739736795425415, + "learning_rate": 0.00019846565990576392, + "loss": 0.2624, + "step": 982 + }, + { + "epoch": 0.20644754804158352, + "grad_norm": 0.7638704180717468, + "learning_rate": 0.00019845959295928994, + "loss": 0.2863, + "step": 983 + }, + { + "epoch": 0.20665756589310091, + "grad_norm": 0.48239898681640625, + "learning_rate": 0.00019845351413488939, + "loss": 0.1564, + "step": 984 + }, + { + "epoch": 0.2068675837446183, + "grad_norm": 0.6511039137840271, + "learning_rate": 0.00019844742343329568, + "loss": 0.1856, + "step": 985 + }, + { + "epoch": 0.20707760159613567, + "grad_norm": 0.48949161171913147, + "learning_rate": 0.0001984413208552435, + "loss": 0.1862, + "step": 986 + }, + { + "epoch": 0.20728761944765306, + "grad_norm": 0.6529719829559326, + "learning_rate": 0.0001984352064014691, + "loss": 0.2008, + "step": 987 + }, + { + "epoch": 0.20749763729917042, + "grad_norm": 0.5295738577842712, + "learning_rate": 0.00019842908007271012, + "loss": 0.2141, + "step": 988 + }, + { + "epoch": 0.20770765515068781, + "grad_norm": 0.5440765023231506, + "learning_rate": 0.00019842294186970562, + "loss": 0.264, + "step": 989 + }, + { + "epoch": 0.20791767300220518, + "grad_norm": 0.5315092206001282, + "learning_rate": 0.00019841679179319606, + "loss": 0.2116, + "step": 990 + }, + { + "epoch": 0.20812769085372257, + "grad_norm": 0.4537929892539978, + "learning_rate": 0.0001984106298439234, + "loss": 0.1269, + "step": 991 + }, + { + "epoch": 0.20833770870523993, + "grad_norm": 0.5806244015693665, + "learning_rate": 0.000198404456022631, + "loss": 0.1808, + "step": 992 + }, + { + "epoch": 0.20854772655675732, + "grad_norm": 0.5772647261619568, + "learning_rate": 0.00019839827033006372, + "loss": 0.1637, + "step": 993 + }, + { + "epoch": 0.20875774440827471, + "grad_norm": 0.4130006432533264, + "learning_rate": 0.00019839207276696764, + "loss": 0.1398, + "step": 994 + }, + { + "epoch": 0.20896776225979208, + "grad_norm": 0.47043028473854065, + "learning_rate": 0.00019838586333409056, + "loss": 0.1209, + "step": 995 + }, + { + "epoch": 0.20917778011130947, + "grad_norm": 0.713445782661438, + "learning_rate": 0.00019837964203218148, + "loss": 0.2175, + "step": 996 + }, + { + "epoch": 0.20938779796282683, + "grad_norm": 0.7947505116462708, + "learning_rate": 0.00019837340886199096, + "loss": 0.3172, + "step": 997 + }, + { + "epoch": 0.20959781581434422, + "grad_norm": 0.7544185519218445, + "learning_rate": 0.00019836716382427096, + "loss": 0.2506, + "step": 998 + }, + { + "epoch": 0.2098078336658616, + "grad_norm": 0.8411846160888672, + "learning_rate": 0.00019836090691977484, + "loss": 0.2619, + "step": 999 + }, + { + "epoch": 0.21001785151737898, + "grad_norm": 0.7358798384666443, + "learning_rate": 0.00019835463814925745, + "loss": 0.264, + "step": 1000 + }, + { + "epoch": 0.21022786936889637, + "grad_norm": 0.623121440410614, + "learning_rate": 0.00019834835751347503, + "loss": 0.4566, + "step": 1001 + }, + { + "epoch": 0.21043788722041373, + "grad_norm": 0.6662508845329285, + "learning_rate": 0.00019834206501318524, + "loss": 0.232, + "step": 1002 + }, + { + "epoch": 0.21064790507193112, + "grad_norm": 0.7510089874267578, + "learning_rate": 0.00019833576064914722, + "loss": 0.2207, + "step": 1003 + }, + { + "epoch": 0.2108579229234485, + "grad_norm": 0.6487518548965454, + "learning_rate": 0.0001983294444221215, + "loss": 0.2665, + "step": 1004 + }, + { + "epoch": 0.21106794077496588, + "grad_norm": 0.4078707695007324, + "learning_rate": 0.00019832311633287002, + "loss": 0.2028, + "step": 1005 + }, + { + "epoch": 0.21127795862648324, + "grad_norm": 0.7619323134422302, + "learning_rate": 0.00019831677638215624, + "loss": 0.29, + "step": 1006 + }, + { + "epoch": 0.21148797647800063, + "grad_norm": 0.6697717308998108, + "learning_rate": 0.00019831042457074498, + "loss": 0.2623, + "step": 1007 + }, + { + "epoch": 0.211697994329518, + "grad_norm": 0.6049818396568298, + "learning_rate": 0.0001983040608994025, + "loss": 0.2045, + "step": 1008 + }, + { + "epoch": 0.2119080121810354, + "grad_norm": 0.7906011343002319, + "learning_rate": 0.0001982976853688965, + "loss": 0.244, + "step": 1009 + }, + { + "epoch": 0.21211803003255278, + "grad_norm": 0.44965484738349915, + "learning_rate": 0.0001982912979799961, + "loss": 0.2391, + "step": 1010 + }, + { + "epoch": 0.21232804788407014, + "grad_norm": 0.9092258214950562, + "learning_rate": 0.00019828489873347188, + "loss": 0.3971, + "step": 1011 + }, + { + "epoch": 0.21253806573558753, + "grad_norm": 0.6709781289100647, + "learning_rate": 0.0001982784876300958, + "loss": 0.1851, + "step": 1012 + }, + { + "epoch": 0.2127480835871049, + "grad_norm": 0.608507513999939, + "learning_rate": 0.00019827206467064133, + "loss": 0.2602, + "step": 1013 + }, + { + "epoch": 0.2129581014386223, + "grad_norm": 0.7793399095535278, + "learning_rate": 0.00019826562985588328, + "loss": 0.288, + "step": 1014 + }, + { + "epoch": 0.21316811929013965, + "grad_norm": 0.8137920498847961, + "learning_rate": 0.00019825918318659792, + "loss": 0.2724, + "step": 1015 + }, + { + "epoch": 0.21337813714165704, + "grad_norm": 0.7229858636856079, + "learning_rate": 0.000198252724663563, + "loss": 0.1762, + "step": 1016 + }, + { + "epoch": 0.2135881549931744, + "grad_norm": 0.7144889831542969, + "learning_rate": 0.0001982462542875576, + "loss": 0.2199, + "step": 1017 + }, + { + "epoch": 0.2137981728446918, + "grad_norm": 0.5533698797225952, + "learning_rate": 0.00019823977205936236, + "loss": 0.1532, + "step": 1018 + }, + { + "epoch": 0.2140081906962092, + "grad_norm": 0.6681041717529297, + "learning_rate": 0.00019823327797975927, + "loss": 0.2221, + "step": 1019 + }, + { + "epoch": 0.21421820854772655, + "grad_norm": 0.7258886098861694, + "learning_rate": 0.0001982267720495317, + "loss": 0.2249, + "step": 1020 + }, + { + "epoch": 0.21442822639924394, + "grad_norm": 0.7201298475265503, + "learning_rate": 0.00019822025426946457, + "loss": 0.3041, + "step": 1021 + }, + { + "epoch": 0.2146382442507613, + "grad_norm": 0.5295907855033875, + "learning_rate": 0.00019821372464034416, + "loss": 0.1514, + "step": 1022 + }, + { + "epoch": 0.2148482621022787, + "grad_norm": 0.6460062861442566, + "learning_rate": 0.00019820718316295816, + "loss": 0.2194, + "step": 1023 + }, + { + "epoch": 0.21505827995379606, + "grad_norm": 0.6456478834152222, + "learning_rate": 0.00019820062983809576, + "loss": 0.1645, + "step": 1024 + }, + { + "epoch": 0.21526829780531345, + "grad_norm": 0.5274572372436523, + "learning_rate": 0.0001981940646665475, + "loss": 0.1434, + "step": 1025 + }, + { + "epoch": 0.21547831565683084, + "grad_norm": 0.7552756667137146, + "learning_rate": 0.00019818748764910537, + "loss": 0.3032, + "step": 1026 + }, + { + "epoch": 0.2156883335083482, + "grad_norm": 0.4831709861755371, + "learning_rate": 0.00019818089878656287, + "loss": 0.2132, + "step": 1027 + }, + { + "epoch": 0.2158983513598656, + "grad_norm": 0.8715883493423462, + "learning_rate": 0.00019817429807971482, + "loss": 0.2465, + "step": 1028 + }, + { + "epoch": 0.21610836921138296, + "grad_norm": 0.7408828735351562, + "learning_rate": 0.0001981676855293575, + "loss": 0.2652, + "step": 1029 + }, + { + "epoch": 0.21631838706290035, + "grad_norm": 0.4847085177898407, + "learning_rate": 0.00019816106113628866, + "loss": 0.239, + "step": 1030 + }, + { + "epoch": 0.21652840491441772, + "grad_norm": 0.6377763152122498, + "learning_rate": 0.00019815442490130747, + "loss": 0.2536, + "step": 1031 + }, + { + "epoch": 0.2167384227659351, + "grad_norm": 0.6141781806945801, + "learning_rate": 0.00019814777682521445, + "loss": 0.1938, + "step": 1032 + }, + { + "epoch": 0.21694844061745247, + "grad_norm": 0.8136757016181946, + "learning_rate": 0.0001981411169088117, + "loss": 0.2595, + "step": 1033 + }, + { + "epoch": 0.21715845846896986, + "grad_norm": 0.645139217376709, + "learning_rate": 0.00019813444515290253, + "loss": 0.2553, + "step": 1034 + }, + { + "epoch": 0.21736847632048725, + "grad_norm": 0.4579085409641266, + "learning_rate": 0.00019812776155829194, + "loss": 0.1846, + "step": 1035 + }, + { + "epoch": 0.21757849417200462, + "grad_norm": 0.6354373097419739, + "learning_rate": 0.0001981210661257861, + "loss": 0.1565, + "step": 1036 + }, + { + "epoch": 0.217788512023522, + "grad_norm": 0.7238495945930481, + "learning_rate": 0.0001981143588561928, + "loss": 0.1602, + "step": 1037 + }, + { + "epoch": 0.21799852987503937, + "grad_norm": 0.4990311563014984, + "learning_rate": 0.00019810763975032118, + "loss": 0.1736, + "step": 1038 + }, + { + "epoch": 0.21820854772655676, + "grad_norm": 0.6193257570266724, + "learning_rate": 0.0001981009088089818, + "loss": 0.2047, + "step": 1039 + }, + { + "epoch": 0.21841856557807413, + "grad_norm": 0.6221904158592224, + "learning_rate": 0.0001980941660329867, + "loss": 0.18, + "step": 1040 + }, + { + "epoch": 0.21862858342959152, + "grad_norm": 0.5321659445762634, + "learning_rate": 0.0001980874114231493, + "loss": 0.1644, + "step": 1041 + }, + { + "epoch": 0.2188386012811089, + "grad_norm": 0.6771279573440552, + "learning_rate": 0.00019808064498028443, + "loss": 0.2196, + "step": 1042 + }, + { + "epoch": 0.21904861913262627, + "grad_norm": 0.5956505537033081, + "learning_rate": 0.00019807386670520836, + "loss": 0.2051, + "step": 1043 + }, + { + "epoch": 0.21925863698414366, + "grad_norm": 0.7111203670501709, + "learning_rate": 0.00019806707659873887, + "loss": 0.1721, + "step": 1044 + }, + { + "epoch": 0.21946865483566103, + "grad_norm": 0.5506051182746887, + "learning_rate": 0.00019806027466169506, + "loss": 0.1351, + "step": 1045 + }, + { + "epoch": 0.21967867268717842, + "grad_norm": 0.5250877737998962, + "learning_rate": 0.00019805346089489753, + "loss": 0.2592, + "step": 1046 + }, + { + "epoch": 0.21988869053869578, + "grad_norm": 0.7672072649002075, + "learning_rate": 0.00019804663529916826, + "loss": 0.2551, + "step": 1047 + }, + { + "epoch": 0.22009870839021317, + "grad_norm": 0.3871646821498871, + "learning_rate": 0.00019803979787533064, + "loss": 0.1114, + "step": 1048 + }, + { + "epoch": 0.22030872624173053, + "grad_norm": 0.49425801634788513, + "learning_rate": 0.00019803294862420957, + "loss": 0.1579, + "step": 1049 + }, + { + "epoch": 0.22051874409324793, + "grad_norm": 0.8091879487037659, + "learning_rate": 0.0001980260875466313, + "loss": 0.3894, + "step": 1050 + }, + { + "epoch": 0.22072876194476532, + "grad_norm": 0.5933718085289001, + "learning_rate": 0.00019801921464342358, + "loss": 0.2827, + "step": 1051 + }, + { + "epoch": 0.22093877979628268, + "grad_norm": 0.8242902159690857, + "learning_rate": 0.00019801232991541548, + "loss": 0.4183, + "step": 1052 + }, + { + "epoch": 0.22114879764780007, + "grad_norm": 0.5886990427970886, + "learning_rate": 0.00019800543336343757, + "loss": 0.1928, + "step": 1053 + }, + { + "epoch": 0.22135881549931744, + "grad_norm": 0.559283435344696, + "learning_rate": 0.00019799852498832184, + "loss": 0.2042, + "step": 1054 + }, + { + "epoch": 0.22156883335083483, + "grad_norm": 0.4517320990562439, + "learning_rate": 0.0001979916047909017, + "loss": 0.1895, + "step": 1055 + }, + { + "epoch": 0.2217788512023522, + "grad_norm": 0.6435453295707703, + "learning_rate": 0.000197984672772012, + "loss": 0.2571, + "step": 1056 + }, + { + "epoch": 0.22198886905386958, + "grad_norm": 0.6028056740760803, + "learning_rate": 0.00019797772893248897, + "loss": 0.2509, + "step": 1057 + }, + { + "epoch": 0.22219888690538694, + "grad_norm": 0.5414546132087708, + "learning_rate": 0.00019797077327317033, + "loss": 0.2198, + "step": 1058 + }, + { + "epoch": 0.22240890475690434, + "grad_norm": 0.42511531710624695, + "learning_rate": 0.0001979638057948952, + "loss": 0.1837, + "step": 1059 + }, + { + "epoch": 0.22261892260842173, + "grad_norm": 0.7643983960151672, + "learning_rate": 0.00019795682649850408, + "loss": 0.2282, + "step": 1060 + }, + { + "epoch": 0.2228289404599391, + "grad_norm": 0.713973343372345, + "learning_rate": 0.00019794983538483894, + "loss": 0.2568, + "step": 1061 + }, + { + "epoch": 0.22303895831145648, + "grad_norm": 0.4897744655609131, + "learning_rate": 0.0001979428324547432, + "loss": 0.176, + "step": 1062 + }, + { + "epoch": 0.22324897616297384, + "grad_norm": 0.4883119463920593, + "learning_rate": 0.0001979358177090617, + "loss": 0.2891, + "step": 1063 + }, + { + "epoch": 0.22345899401449124, + "grad_norm": 0.7395027875900269, + "learning_rate": 0.0001979287911486406, + "loss": 0.1988, + "step": 1064 + }, + { + "epoch": 0.2236690118660086, + "grad_norm": 0.8084394931793213, + "learning_rate": 0.00019792175277432762, + "loss": 0.2925, + "step": 1065 + }, + { + "epoch": 0.223879029717526, + "grad_norm": 0.452434778213501, + "learning_rate": 0.00019791470258697188, + "loss": 0.1406, + "step": 1066 + }, + { + "epoch": 0.22408904756904338, + "grad_norm": 0.6661890745162964, + "learning_rate": 0.00019790764058742383, + "loss": 0.2275, + "step": 1067 + }, + { + "epoch": 0.22429906542056074, + "grad_norm": 0.6966044902801514, + "learning_rate": 0.00019790056677653547, + "loss": 0.2291, + "step": 1068 + }, + { + "epoch": 0.22450908327207814, + "grad_norm": 0.6009211540222168, + "learning_rate": 0.00019789348115516008, + "loss": 0.1446, + "step": 1069 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.6025848388671875, + "learning_rate": 0.0001978863837241526, + "loss": 0.1969, + "step": 1070 + }, + { + "epoch": 0.2249291189751129, + "grad_norm": 0.6915929913520813, + "learning_rate": 0.0001978792744843691, + "loss": 0.1574, + "step": 1071 + }, + { + "epoch": 0.22513913682663025, + "grad_norm": 0.6229645609855652, + "learning_rate": 0.00019787215343666732, + "loss": 0.3064, + "step": 1072 + }, + { + "epoch": 0.22534915467814765, + "grad_norm": 0.5794253945350647, + "learning_rate": 0.00019786502058190627, + "loss": 0.1497, + "step": 1073 + }, + { + "epoch": 0.225559172529665, + "grad_norm": 0.8496554493904114, + "learning_rate": 0.00019785787592094647, + "loss": 0.2076, + "step": 1074 + }, + { + "epoch": 0.2257691903811824, + "grad_norm": 0.6304386258125305, + "learning_rate": 0.0001978507194546498, + "loss": 0.2767, + "step": 1075 + }, + { + "epoch": 0.2259792082326998, + "grad_norm": 0.928598940372467, + "learning_rate": 0.00019784355118387966, + "loss": 0.2917, + "step": 1076 + }, + { + "epoch": 0.22618922608421715, + "grad_norm": 0.6034556031227112, + "learning_rate": 0.00019783637110950075, + "loss": 0.1979, + "step": 1077 + }, + { + "epoch": 0.22639924393573455, + "grad_norm": 0.6918156743049622, + "learning_rate": 0.0001978291792323793, + "loss": 0.2555, + "step": 1078 + }, + { + "epoch": 0.2266092617872519, + "grad_norm": 0.8922538757324219, + "learning_rate": 0.00019782197555338288, + "loss": 0.1926, + "step": 1079 + }, + { + "epoch": 0.2268192796387693, + "grad_norm": 1.1521648168563843, + "learning_rate": 0.00019781476007338058, + "loss": 0.3391, + "step": 1080 + }, + { + "epoch": 0.22702929749028666, + "grad_norm": 0.760995626449585, + "learning_rate": 0.0001978075327932428, + "loss": 0.2672, + "step": 1081 + }, + { + "epoch": 0.22723931534180405, + "grad_norm": 0.6859043836593628, + "learning_rate": 0.00019780029371384145, + "loss": 0.2383, + "step": 1082 + }, + { + "epoch": 0.22744933319332145, + "grad_norm": 0.8544055223464966, + "learning_rate": 0.00019779304283604985, + "loss": 0.2957, + "step": 1083 + }, + { + "epoch": 0.2276593510448388, + "grad_norm": 0.8154661655426025, + "learning_rate": 0.0001977857801607427, + "loss": 0.2017, + "step": 1084 + }, + { + "epoch": 0.2278693688963562, + "grad_norm": 0.5382252931594849, + "learning_rate": 0.00019777850568879614, + "loss": 0.179, + "step": 1085 + }, + { + "epoch": 0.22807938674787356, + "grad_norm": 0.6206135749816895, + "learning_rate": 0.0001977712194210878, + "loss": 0.2559, + "step": 1086 + }, + { + "epoch": 0.22828940459939095, + "grad_norm": 0.6016466021537781, + "learning_rate": 0.00019776392135849663, + "loss": 0.1619, + "step": 1087 + }, + { + "epoch": 0.22849942245090832, + "grad_norm": 0.3538142442703247, + "learning_rate": 0.00019775661150190306, + "loss": 0.1291, + "step": 1088 + }, + { + "epoch": 0.2287094403024257, + "grad_norm": 0.5222025513648987, + "learning_rate": 0.00019774928985218893, + "loss": 0.1718, + "step": 1089 + }, + { + "epoch": 0.22891945815394307, + "grad_norm": 0.7469238638877869, + "learning_rate": 0.00019774195641023755, + "loss": 0.2725, + "step": 1090 + }, + { + "epoch": 0.22912947600546046, + "grad_norm": 0.5208479166030884, + "learning_rate": 0.00019773461117693355, + "loss": 0.2309, + "step": 1091 + }, + { + "epoch": 0.22933949385697786, + "grad_norm": 0.6402025818824768, + "learning_rate": 0.00019772725415316303, + "loss": 0.2116, + "step": 1092 + }, + { + "epoch": 0.22954951170849522, + "grad_norm": 0.7494028806686401, + "learning_rate": 0.0001977198853398136, + "loss": 0.2141, + "step": 1093 + }, + { + "epoch": 0.2297595295600126, + "grad_norm": 0.6795624494552612, + "learning_rate": 0.00019771250473777418, + "loss": 0.3321, + "step": 1094 + }, + { + "epoch": 0.22996954741152997, + "grad_norm": 0.6777510046958923, + "learning_rate": 0.0001977051123479351, + "loss": 0.1976, + "step": 1095 + }, + { + "epoch": 0.23017956526304736, + "grad_norm": 0.8358330130577087, + "learning_rate": 0.00019769770817118824, + "loss": 0.2433, + "step": 1096 + }, + { + "epoch": 0.23038958311456473, + "grad_norm": 0.6221851110458374, + "learning_rate": 0.00019769029220842677, + "loss": 0.15, + "step": 1097 + }, + { + "epoch": 0.23059960096608212, + "grad_norm": 0.9023957252502441, + "learning_rate": 0.00019768286446054532, + "loss": 0.25, + "step": 1098 + }, + { + "epoch": 0.23080961881759948, + "grad_norm": 0.4097208082675934, + "learning_rate": 0.00019767542492844006, + "loss": 0.1478, + "step": 1099 + }, + { + "epoch": 0.23101963666911687, + "grad_norm": 0.5772308707237244, + "learning_rate": 0.00019766797361300833, + "loss": 0.1754, + "step": 1100 + }, + { + "epoch": 0.23122965452063426, + "grad_norm": 0.5117380619049072, + "learning_rate": 0.00019766051051514914, + "loss": 0.2119, + "step": 1101 + }, + { + "epoch": 0.23143967237215163, + "grad_norm": 0.7716235518455505, + "learning_rate": 0.00019765303563576276, + "loss": 0.3488, + "step": 1102 + }, + { + "epoch": 0.23164969022366902, + "grad_norm": 0.7328181862831116, + "learning_rate": 0.000197645548975751, + "loss": 0.2983, + "step": 1103 + }, + { + "epoch": 0.23185970807518638, + "grad_norm": 0.5517509579658508, + "learning_rate": 0.00019763805053601695, + "loss": 0.3993, + "step": 1104 + }, + { + "epoch": 0.23206972592670377, + "grad_norm": 0.639631450176239, + "learning_rate": 0.00019763054031746532, + "loss": 0.3138, + "step": 1105 + }, + { + "epoch": 0.23227974377822114, + "grad_norm": 0.6691192388534546, + "learning_rate": 0.00019762301832100204, + "loss": 0.2601, + "step": 1106 + }, + { + "epoch": 0.23248976162973853, + "grad_norm": 0.6119483709335327, + "learning_rate": 0.00019761548454753453, + "loss": 0.2382, + "step": 1107 + }, + { + "epoch": 0.23269977948125592, + "grad_norm": 0.7129586935043335, + "learning_rate": 0.00019760793899797172, + "loss": 0.2975, + "step": 1108 + }, + { + "epoch": 0.23290979733277328, + "grad_norm": 0.5922974944114685, + "learning_rate": 0.00019760038167322382, + "loss": 0.2665, + "step": 1109 + }, + { + "epoch": 0.23311981518429067, + "grad_norm": 0.438453733921051, + "learning_rate": 0.0001975928125742026, + "loss": 0.217, + "step": 1110 + }, + { + "epoch": 0.23332983303580804, + "grad_norm": 0.5052367448806763, + "learning_rate": 0.0001975852317018211, + "loss": 0.1992, + "step": 1111 + }, + { + "epoch": 0.23353985088732543, + "grad_norm": 0.7396247386932373, + "learning_rate": 0.0001975776390569939, + "loss": 0.2561, + "step": 1112 + }, + { + "epoch": 0.2337498687388428, + "grad_norm": 0.502675473690033, + "learning_rate": 0.00019757003464063695, + "loss": 0.1859, + "step": 1113 + }, + { + "epoch": 0.23395988659036018, + "grad_norm": 0.5194231271743774, + "learning_rate": 0.0001975624184536676, + "loss": 0.19, + "step": 1114 + }, + { + "epoch": 0.23416990444187755, + "grad_norm": 0.6101930737495422, + "learning_rate": 0.00019755479049700473, + "loss": 0.2109, + "step": 1115 + }, + { + "epoch": 0.23437992229339494, + "grad_norm": 0.5229907631874084, + "learning_rate": 0.0001975471507715685, + "loss": 0.2359, + "step": 1116 + }, + { + "epoch": 0.23458994014491233, + "grad_norm": 0.7651471495628357, + "learning_rate": 0.0001975394992782805, + "loss": 0.3001, + "step": 1117 + }, + { + "epoch": 0.2347999579964297, + "grad_norm": 0.7726680636405945, + "learning_rate": 0.0001975318360180639, + "loss": 0.2422, + "step": 1118 + }, + { + "epoch": 0.23500997584794708, + "grad_norm": 0.8400713801383972, + "learning_rate": 0.00019752416099184307, + "loss": 0.305, + "step": 1119 + }, + { + "epoch": 0.23521999369946445, + "grad_norm": 0.7534275650978088, + "learning_rate": 0.00019751647420054397, + "loss": 0.2488, + "step": 1120 + }, + { + "epoch": 0.23543001155098184, + "grad_norm": 0.5462124347686768, + "learning_rate": 0.0001975087756450939, + "loss": 0.1503, + "step": 1121 + }, + { + "epoch": 0.2356400294024992, + "grad_norm": 0.5736708641052246, + "learning_rate": 0.0001975010653264216, + "loss": 0.239, + "step": 1122 + }, + { + "epoch": 0.2358500472540166, + "grad_norm": 0.6881362795829773, + "learning_rate": 0.00019749334324545723, + "loss": 0.29, + "step": 1123 + }, + { + "epoch": 0.23606006510553398, + "grad_norm": 0.6573613286018372, + "learning_rate": 0.00019748560940313232, + "loss": 0.1698, + "step": 1124 + }, + { + "epoch": 0.23627008295705135, + "grad_norm": 0.5784547924995422, + "learning_rate": 0.0001974778638003799, + "loss": 0.1637, + "step": 1125 + }, + { + "epoch": 0.23648010080856874, + "grad_norm": 0.5579979419708252, + "learning_rate": 0.0001974701064381344, + "loss": 0.289, + "step": 1126 + }, + { + "epoch": 0.2366901186600861, + "grad_norm": 0.6849252581596375, + "learning_rate": 0.00019746233731733162, + "loss": 0.2424, + "step": 1127 + }, + { + "epoch": 0.2369001365116035, + "grad_norm": 0.735053300857544, + "learning_rate": 0.0001974545564389088, + "loss": 0.2899, + "step": 1128 + }, + { + "epoch": 0.23711015436312086, + "grad_norm": 0.5866673588752747, + "learning_rate": 0.00019744676380380462, + "loss": 0.1714, + "step": 1129 + }, + { + "epoch": 0.23732017221463825, + "grad_norm": 0.863044023513794, + "learning_rate": 0.00019743895941295918, + "loss": 0.2236, + "step": 1130 + }, + { + "epoch": 0.2375301900661556, + "grad_norm": 0.5845204591751099, + "learning_rate": 0.00019743114326731395, + "loss": 0.1645, + "step": 1131 + }, + { + "epoch": 0.237740207917673, + "grad_norm": 0.5500687956809998, + "learning_rate": 0.00019742331536781187, + "loss": 0.1583, + "step": 1132 + }, + { + "epoch": 0.2379502257691904, + "grad_norm": 0.4377477169036865, + "learning_rate": 0.00019741547571539727, + "loss": 0.177, + "step": 1133 + }, + { + "epoch": 0.23816024362070776, + "grad_norm": 0.6978058218955994, + "learning_rate": 0.0001974076243110159, + "loss": 0.2614, + "step": 1134 + }, + { + "epoch": 0.23837026147222515, + "grad_norm": 0.5615448355674744, + "learning_rate": 0.00019739976115561495, + "loss": 0.2396, + "step": 1135 + }, + { + "epoch": 0.2385802793237425, + "grad_norm": 0.674802839756012, + "learning_rate": 0.00019739188625014304, + "loss": 0.1782, + "step": 1136 + }, + { + "epoch": 0.2387902971752599, + "grad_norm": 0.5170165300369263, + "learning_rate": 0.0001973839995955501, + "loss": 0.1917, + "step": 1137 + }, + { + "epoch": 0.23900031502677727, + "grad_norm": 0.34832054376602173, + "learning_rate": 0.00019737610119278766, + "loss": 0.0986, + "step": 1138 + }, + { + "epoch": 0.23921033287829466, + "grad_norm": 0.5801171064376831, + "learning_rate": 0.0001973681910428085, + "loss": 0.1948, + "step": 1139 + }, + { + "epoch": 0.23942035072981202, + "grad_norm": 0.6232351660728455, + "learning_rate": 0.00019736026914656687, + "loss": 0.172, + "step": 1140 + }, + { + "epoch": 0.2396303685813294, + "grad_norm": 0.4935660660266876, + "learning_rate": 0.00019735233550501847, + "loss": 0.1967, + "step": 1141 + }, + { + "epoch": 0.2398403864328468, + "grad_norm": 0.5983342528343201, + "learning_rate": 0.0001973443901191204, + "loss": 0.1474, + "step": 1142 + }, + { + "epoch": 0.24005040428436417, + "grad_norm": 0.4784546196460724, + "learning_rate": 0.00019733643298983116, + "loss": 0.1675, + "step": 1143 + }, + { + "epoch": 0.24026042213588156, + "grad_norm": 0.4301772713661194, + "learning_rate": 0.0001973284641181107, + "loss": 0.135, + "step": 1144 + }, + { + "epoch": 0.24047043998739892, + "grad_norm": 0.7016049027442932, + "learning_rate": 0.0001973204835049203, + "loss": 0.1911, + "step": 1145 + }, + { + "epoch": 0.2406804578389163, + "grad_norm": 0.6154897809028625, + "learning_rate": 0.00019731249115122283, + "loss": 0.1488, + "step": 1146 + }, + { + "epoch": 0.24089047569043368, + "grad_norm": 1.0888360738754272, + "learning_rate": 0.00019730448705798239, + "loss": 0.2994, + "step": 1147 + }, + { + "epoch": 0.24110049354195107, + "grad_norm": 0.5985755920410156, + "learning_rate": 0.0001972964712261646, + "loss": 0.188, + "step": 1148 + }, + { + "epoch": 0.24131051139346846, + "grad_norm": 0.40716445446014404, + "learning_rate": 0.00019728844365673646, + "loss": 0.1215, + "step": 1149 + }, + { + "epoch": 0.24152052924498582, + "grad_norm": 0.5432882905006409, + "learning_rate": 0.0001972804043506664, + "loss": 0.2196, + "step": 1150 + }, + { + "epoch": 0.2417305470965032, + "grad_norm": 0.564270555973053, + "learning_rate": 0.00019727235330892426, + "loss": 0.2982, + "step": 1151 + }, + { + "epoch": 0.24194056494802058, + "grad_norm": 0.6324896216392517, + "learning_rate": 0.0001972642905324813, + "loss": 0.3008, + "step": 1152 + }, + { + "epoch": 0.24215058279953797, + "grad_norm": 0.6564551591873169, + "learning_rate": 0.0001972562160223102, + "loss": 0.2006, + "step": 1153 + }, + { + "epoch": 0.24236060065105533, + "grad_norm": 0.5932807326316833, + "learning_rate": 0.00019724812977938507, + "loss": 0.32, + "step": 1154 + }, + { + "epoch": 0.24257061850257272, + "grad_norm": 0.6603784561157227, + "learning_rate": 0.00019724003180468137, + "loss": 0.2916, + "step": 1155 + }, + { + "epoch": 0.24278063635409008, + "grad_norm": 0.540715754032135, + "learning_rate": 0.00019723192209917604, + "loss": 0.2362, + "step": 1156 + }, + { + "epoch": 0.24299065420560748, + "grad_norm": 0.5223392844200134, + "learning_rate": 0.00019722380066384743, + "loss": 0.2172, + "step": 1157 + }, + { + "epoch": 0.24320067205712487, + "grad_norm": 0.6323894262313843, + "learning_rate": 0.00019721566749967523, + "loss": 0.1791, + "step": 1158 + }, + { + "epoch": 0.24341068990864223, + "grad_norm": 0.6830046772956848, + "learning_rate": 0.00019720752260764067, + "loss": 0.2777, + "step": 1159 + }, + { + "epoch": 0.24362070776015962, + "grad_norm": 0.6310597062110901, + "learning_rate": 0.00019719936598872634, + "loss": 0.2385, + "step": 1160 + }, + { + "epoch": 0.24383072561167698, + "grad_norm": 0.5483822822570801, + "learning_rate": 0.0001971911976439162, + "loss": 0.2548, + "step": 1161 + }, + { + "epoch": 0.24404074346319438, + "grad_norm": 0.6507066488265991, + "learning_rate": 0.00019718301757419565, + "loss": 0.1978, + "step": 1162 + }, + { + "epoch": 0.24425076131471174, + "grad_norm": 0.9127165079116821, + "learning_rate": 0.00019717482578055154, + "loss": 0.1521, + "step": 1163 + }, + { + "epoch": 0.24446077916622913, + "grad_norm": 0.49684974551200867, + "learning_rate": 0.0001971666222639721, + "loss": 0.2183, + "step": 1164 + }, + { + "epoch": 0.24467079701774652, + "grad_norm": 0.5121373534202576, + "learning_rate": 0.00019715840702544694, + "loss": 0.1734, + "step": 1165 + }, + { + "epoch": 0.24488081486926389, + "grad_norm": 0.5407108068466187, + "learning_rate": 0.0001971501800659672, + "loss": 0.2193, + "step": 1166 + }, + { + "epoch": 0.24509083272078128, + "grad_norm": 0.9084086418151855, + "learning_rate": 0.00019714194138652533, + "loss": 0.2936, + "step": 1167 + }, + { + "epoch": 0.24530085057229864, + "grad_norm": 0.5651358366012573, + "learning_rate": 0.0001971336909881152, + "loss": 0.1931, + "step": 1168 + }, + { + "epoch": 0.24551086842381603, + "grad_norm": 0.7590070366859436, + "learning_rate": 0.00019712542887173213, + "loss": 0.2404, + "step": 1169 + }, + { + "epoch": 0.2457208862753334, + "grad_norm": 0.7864325642585754, + "learning_rate": 0.0001971171550383729, + "loss": 0.2917, + "step": 1170 + }, + { + "epoch": 0.24593090412685079, + "grad_norm": 0.6653593182563782, + "learning_rate": 0.00019710886948903555, + "loss": 0.2011, + "step": 1171 + }, + { + "epoch": 0.24614092197836815, + "grad_norm": 0.49746832251548767, + "learning_rate": 0.00019710057222471967, + "loss": 0.1644, + "step": 1172 + }, + { + "epoch": 0.24635093982988554, + "grad_norm": 0.6505274176597595, + "learning_rate": 0.00019709226324642626, + "loss": 0.2415, + "step": 1173 + }, + { + "epoch": 0.24656095768140293, + "grad_norm": 0.9204030632972717, + "learning_rate": 0.00019708394255515765, + "loss": 0.3014, + "step": 1174 + }, + { + "epoch": 0.2467709755329203, + "grad_norm": 0.7631303071975708, + "learning_rate": 0.00019707561015191763, + "loss": 0.1956, + "step": 1175 + }, + { + "epoch": 0.24698099338443769, + "grad_norm": 0.4676910936832428, + "learning_rate": 0.0001970672660377114, + "loss": 0.1778, + "step": 1176 + }, + { + "epoch": 0.24719101123595505, + "grad_norm": 0.5645480751991272, + "learning_rate": 0.0001970589102135456, + "loss": 0.2386, + "step": 1177 + }, + { + "epoch": 0.24740102908747244, + "grad_norm": 0.6220672130584717, + "learning_rate": 0.00019705054268042823, + "loss": 0.2715, + "step": 1178 + }, + { + "epoch": 0.2476110469389898, + "grad_norm": 0.6659072041511536, + "learning_rate": 0.00019704216343936873, + "loss": 0.176, + "step": 1179 + }, + { + "epoch": 0.2478210647905072, + "grad_norm": 0.5557059049606323, + "learning_rate": 0.000197033772491378, + "loss": 0.2246, + "step": 1180 + }, + { + "epoch": 0.24803108264202456, + "grad_norm": 0.8239718675613403, + "learning_rate": 0.00019702536983746822, + "loss": 0.2168, + "step": 1181 + }, + { + "epoch": 0.24824110049354195, + "grad_norm": 0.7284471988677979, + "learning_rate": 0.00019701695547865312, + "loss": 0.234, + "step": 1182 + }, + { + "epoch": 0.24845111834505934, + "grad_norm": 0.540712296962738, + "learning_rate": 0.00019700852941594778, + "loss": 0.2099, + "step": 1183 + }, + { + "epoch": 0.2486611361965767, + "grad_norm": 0.4334312379360199, + "learning_rate": 0.0001970000916503687, + "loss": 0.1851, + "step": 1184 + }, + { + "epoch": 0.2488711540480941, + "grad_norm": 0.5734902620315552, + "learning_rate": 0.0001969916421829338, + "loss": 0.1798, + "step": 1185 + }, + { + "epoch": 0.24908117189961146, + "grad_norm": 0.8426918983459473, + "learning_rate": 0.00019698318101466237, + "loss": 0.2933, + "step": 1186 + }, + { + "epoch": 0.24929118975112885, + "grad_norm": 0.6708935499191284, + "learning_rate": 0.0001969747081465752, + "loss": 0.1688, + "step": 1187 + }, + { + "epoch": 0.2495012076026462, + "grad_norm": 0.564127504825592, + "learning_rate": 0.00019696622357969436, + "loss": 0.1875, + "step": 1188 + }, + { + "epoch": 0.2497112254541636, + "grad_norm": 0.44726812839508057, + "learning_rate": 0.00019695772731504347, + "loss": 0.1629, + "step": 1189 + }, + { + "epoch": 0.249921243305681, + "grad_norm": 0.4793647229671478, + "learning_rate": 0.00019694921935364747, + "loss": 0.1696, + "step": 1190 + }, + { + "epoch": 0.25013126115719836, + "grad_norm": 0.5669199228286743, + "learning_rate": 0.00019694069969653278, + "loss": 0.241, + "step": 1191 + }, + { + "epoch": 0.2503412790087157, + "grad_norm": 0.48771825432777405, + "learning_rate": 0.0001969321683447271, + "loss": 0.1922, + "step": 1192 + }, + { + "epoch": 0.25055129686023314, + "grad_norm": 0.5992656350135803, + "learning_rate": 0.00019692362529925977, + "loss": 0.2127, + "step": 1193 + }, + { + "epoch": 0.2507613147117505, + "grad_norm": 0.6230032444000244, + "learning_rate": 0.00019691507056116128, + "loss": 0.2035, + "step": 1194 + }, + { + "epoch": 0.25097133256326787, + "grad_norm": 0.49502989649772644, + "learning_rate": 0.00019690650413146368, + "loss": 0.1737, + "step": 1195 + }, + { + "epoch": 0.25118135041478523, + "grad_norm": 0.5283368825912476, + "learning_rate": 0.00019689792601120044, + "loss": 0.1145, + "step": 1196 + }, + { + "epoch": 0.25139136826630265, + "grad_norm": 0.4804970324039459, + "learning_rate": 0.00019688933620140637, + "loss": 0.159, + "step": 1197 + }, + { + "epoch": 0.25160138611782, + "grad_norm": 0.48868709802627563, + "learning_rate": 0.00019688073470311776, + "loss": 0.1543, + "step": 1198 + }, + { + "epoch": 0.2518114039693374, + "grad_norm": 0.638172447681427, + "learning_rate": 0.00019687212151737224, + "loss": 0.2184, + "step": 1199 + }, + { + "epoch": 0.2520214218208548, + "grad_norm": 0.5951210856437683, + "learning_rate": 0.0001968634966452089, + "loss": 0.203, + "step": 1200 + }, + { + "epoch": 0.25223143967237216, + "grad_norm": 0.687088668346405, + "learning_rate": 0.0001968548600876682, + "loss": 0.2776, + "step": 1201 + }, + { + "epoch": 0.2524414575238895, + "grad_norm": 0.572437584400177, + "learning_rate": 0.00019684621184579208, + "loss": 0.2266, + "step": 1202 + }, + { + "epoch": 0.2526514753754069, + "grad_norm": 0.47711312770843506, + "learning_rate": 0.0001968375519206238, + "loss": 0.2029, + "step": 1203 + }, + { + "epoch": 0.2528614932269243, + "grad_norm": 0.624614953994751, + "learning_rate": 0.0001968288803132081, + "loss": 0.19, + "step": 1204 + }, + { + "epoch": 0.25307151107844167, + "grad_norm": 0.544484555721283, + "learning_rate": 0.00019682019702459106, + "loss": 0.2015, + "step": 1205 + }, + { + "epoch": 0.25328152892995903, + "grad_norm": 0.8100757598876953, + "learning_rate": 0.00019681150205582025, + "loss": 0.3052, + "step": 1206 + }, + { + "epoch": 0.25349154678147645, + "grad_norm": 0.4385722279548645, + "learning_rate": 0.00019680279540794463, + "loss": 0.2004, + "step": 1207 + }, + { + "epoch": 0.2537015646329938, + "grad_norm": 0.5414237976074219, + "learning_rate": 0.0001967940770820145, + "loss": 0.2526, + "step": 1208 + }, + { + "epoch": 0.2539115824845112, + "grad_norm": 0.8186227083206177, + "learning_rate": 0.00019678534707908161, + "loss": 0.4667, + "step": 1209 + }, + { + "epoch": 0.25412160033602854, + "grad_norm": 1.0033893585205078, + "learning_rate": 0.0001967766054001992, + "loss": 0.2813, + "step": 1210 + }, + { + "epoch": 0.25433161818754596, + "grad_norm": 0.7333622574806213, + "learning_rate": 0.00019676785204642176, + "loss": 0.3368, + "step": 1211 + }, + { + "epoch": 0.2545416360390633, + "grad_norm": 0.7219803333282471, + "learning_rate": 0.00019675908701880532, + "loss": 0.3029, + "step": 1212 + }, + { + "epoch": 0.2547516538905807, + "grad_norm": 0.651919960975647, + "learning_rate": 0.00019675031031840727, + "loss": 0.2637, + "step": 1213 + }, + { + "epoch": 0.2549616717420981, + "grad_norm": 0.6580129265785217, + "learning_rate": 0.00019674152194628638, + "loss": 0.1855, + "step": 1214 + }, + { + "epoch": 0.25517168959361547, + "grad_norm": 0.6222065687179565, + "learning_rate": 0.00019673272190350293, + "loss": 0.2179, + "step": 1215 + }, + { + "epoch": 0.25538170744513283, + "grad_norm": 0.8877820372581482, + "learning_rate": 0.00019672391019111846, + "loss": 0.3499, + "step": 1216 + }, + { + "epoch": 0.2555917252966502, + "grad_norm": 0.5138952732086182, + "learning_rate": 0.000196715086810196, + "loss": 0.3148, + "step": 1217 + }, + { + "epoch": 0.2558017431481676, + "grad_norm": 0.7864513993263245, + "learning_rate": 0.00019670625176180002, + "loss": 0.1968, + "step": 1218 + }, + { + "epoch": 0.256011760999685, + "grad_norm": 0.5596680641174316, + "learning_rate": 0.00019669740504699634, + "loss": 0.2496, + "step": 1219 + }, + { + "epoch": 0.25622177885120234, + "grad_norm": 0.5104958415031433, + "learning_rate": 0.0001966885466668522, + "loss": 0.1978, + "step": 1220 + }, + { + "epoch": 0.2564317967027197, + "grad_norm": 0.5994324684143066, + "learning_rate": 0.00019667967662243628, + "loss": 0.1641, + "step": 1221 + }, + { + "epoch": 0.2566418145542371, + "grad_norm": 0.5470710396766663, + "learning_rate": 0.0001966707949148186, + "loss": 0.1936, + "step": 1222 + }, + { + "epoch": 0.2568518324057545, + "grad_norm": 0.5853798389434814, + "learning_rate": 0.00019666190154507066, + "loss": 0.3196, + "step": 1223 + }, + { + "epoch": 0.25706185025727185, + "grad_norm": 0.5121351480484009, + "learning_rate": 0.0001966529965142653, + "loss": 0.2273, + "step": 1224 + }, + { + "epoch": 0.25727186810878927, + "grad_norm": 0.5247400999069214, + "learning_rate": 0.00019664407982347684, + "loss": 0.2338, + "step": 1225 + }, + { + "epoch": 0.25748188596030663, + "grad_norm": 0.49028703570365906, + "learning_rate": 0.00019663515147378096, + "loss": 0.1937, + "step": 1226 + }, + { + "epoch": 0.257691903811824, + "grad_norm": 0.6964942216873169, + "learning_rate": 0.00019662621146625473, + "loss": 0.2488, + "step": 1227 + }, + { + "epoch": 0.25790192166334136, + "grad_norm": 0.5908783674240112, + "learning_rate": 0.00019661725980197668, + "loss": 0.2333, + "step": 1228 + }, + { + "epoch": 0.2581119395148588, + "grad_norm": 0.5808953046798706, + "learning_rate": 0.0001966082964820267, + "loss": 0.2952, + "step": 1229 + }, + { + "epoch": 0.25832195736637614, + "grad_norm": 0.7375782132148743, + "learning_rate": 0.0001965993215074861, + "loss": 0.212, + "step": 1230 + }, + { + "epoch": 0.2585319752178935, + "grad_norm": 0.5336644649505615, + "learning_rate": 0.00019659033487943762, + "loss": 0.1298, + "step": 1231 + }, + { + "epoch": 0.2587419930694109, + "grad_norm": 0.6842905879020691, + "learning_rate": 0.00019658133659896537, + "loss": 0.1563, + "step": 1232 + }, + { + "epoch": 0.2589520109209283, + "grad_norm": 0.6709338426589966, + "learning_rate": 0.00019657232666715486, + "loss": 0.238, + "step": 1233 + }, + { + "epoch": 0.25916202877244565, + "grad_norm": 0.6698547005653381, + "learning_rate": 0.00019656330508509306, + "loss": 0.2658, + "step": 1234 + }, + { + "epoch": 0.259372046623963, + "grad_norm": 0.5134257078170776, + "learning_rate": 0.00019655427185386832, + "loss": 0.1681, + "step": 1235 + }, + { + "epoch": 0.25958206447548043, + "grad_norm": 0.6031914353370667, + "learning_rate": 0.00019654522697457036, + "loss": 0.2422, + "step": 1236 + }, + { + "epoch": 0.2597920823269978, + "grad_norm": 0.7376905083656311, + "learning_rate": 0.00019653617044829033, + "loss": 0.2232, + "step": 1237 + }, + { + "epoch": 0.26000210017851516, + "grad_norm": 0.661484956741333, + "learning_rate": 0.0001965271022761208, + "loss": 0.2423, + "step": 1238 + }, + { + "epoch": 0.2602121180300326, + "grad_norm": 0.47797203063964844, + "learning_rate": 0.00019651802245915573, + "loss": 0.1129, + "step": 1239 + }, + { + "epoch": 0.26042213588154994, + "grad_norm": 0.5190190672874451, + "learning_rate": 0.00019650893099849048, + "loss": 0.1518, + "step": 1240 + }, + { + "epoch": 0.2606321537330673, + "grad_norm": 0.786258339881897, + "learning_rate": 0.00019649982789522182, + "loss": 0.2716, + "step": 1241 + }, + { + "epoch": 0.26084217158458467, + "grad_norm": 0.528610110282898, + "learning_rate": 0.00019649071315044797, + "loss": 0.2129, + "step": 1242 + }, + { + "epoch": 0.2610521894361021, + "grad_norm": 0.4367738664150238, + "learning_rate": 0.00019648158676526846, + "loss": 0.1212, + "step": 1243 + }, + { + "epoch": 0.26126220728761945, + "grad_norm": 0.8334630131721497, + "learning_rate": 0.0001964724487407843, + "loss": 0.2661, + "step": 1244 + }, + { + "epoch": 0.2614722251391368, + "grad_norm": 0.42555752396583557, + "learning_rate": 0.00019646329907809786, + "loss": 0.1394, + "step": 1245 + }, + { + "epoch": 0.2616822429906542, + "grad_norm": 0.7967664003372192, + "learning_rate": 0.00019645413777831294, + "loss": 0.2523, + "step": 1246 + }, + { + "epoch": 0.2618922608421716, + "grad_norm": 0.606065571308136, + "learning_rate": 0.00019644496484253474, + "loss": 0.2318, + "step": 1247 + }, + { + "epoch": 0.26210227869368896, + "grad_norm": 0.7196457982063293, + "learning_rate": 0.00019643578027186983, + "loss": 0.1763, + "step": 1248 + }, + { + "epoch": 0.2623122965452063, + "grad_norm": 0.6310532093048096, + "learning_rate": 0.0001964265840674263, + "loss": 0.1654, + "step": 1249 + }, + { + "epoch": 0.26252231439672374, + "grad_norm": 0.48370879888534546, + "learning_rate": 0.00019641737623031348, + "loss": 0.1486, + "step": 1250 + }, + { + "epoch": 0.2627323322482411, + "grad_norm": 0.5700078010559082, + "learning_rate": 0.00019640815676164218, + "loss": 0.1662, + "step": 1251 + }, + { + "epoch": 0.26294235009975847, + "grad_norm": 0.5395934581756592, + "learning_rate": 0.00019639892566252466, + "loss": 0.2383, + "step": 1252 + }, + { + "epoch": 0.26315236795127583, + "grad_norm": 0.8851696252822876, + "learning_rate": 0.00019638968293407452, + "loss": 0.3027, + "step": 1253 + }, + { + "epoch": 0.26336238580279325, + "grad_norm": 0.968064546585083, + "learning_rate": 0.00019638042857740676, + "loss": 0.216, + "step": 1254 + }, + { + "epoch": 0.2635724036543106, + "grad_norm": 0.7724452018737793, + "learning_rate": 0.00019637116259363783, + "loss": 0.2397, + "step": 1255 + }, + { + "epoch": 0.263782421505828, + "grad_norm": 0.5674394369125366, + "learning_rate": 0.00019636188498388556, + "loss": 0.2246, + "step": 1256 + }, + { + "epoch": 0.2639924393573454, + "grad_norm": 0.9327764511108398, + "learning_rate": 0.00019635259574926912, + "loss": 0.2233, + "step": 1257 + }, + { + "epoch": 0.26420245720886276, + "grad_norm": 0.7048301100730896, + "learning_rate": 0.00019634329489090925, + "loss": 0.2003, + "step": 1258 + }, + { + "epoch": 0.2644124750603801, + "grad_norm": 0.6150112152099609, + "learning_rate": 0.00019633398240992785, + "loss": 0.1781, + "step": 1259 + }, + { + "epoch": 0.2646224929118975, + "grad_norm": 0.7421455979347229, + "learning_rate": 0.00019632465830744846, + "loss": 0.1789, + "step": 1260 + }, + { + "epoch": 0.2648325107634149, + "grad_norm": 0.8235256671905518, + "learning_rate": 0.00019631532258459586, + "loss": 0.2139, + "step": 1261 + }, + { + "epoch": 0.26504252861493227, + "grad_norm": 0.605702817440033, + "learning_rate": 0.00019630597524249632, + "loss": 0.1901, + "step": 1262 + }, + { + "epoch": 0.26525254646644963, + "grad_norm": 0.605636715888977, + "learning_rate": 0.00019629661628227748, + "loss": 0.2281, + "step": 1263 + }, + { + "epoch": 0.26546256431796705, + "grad_norm": 0.4805593490600586, + "learning_rate": 0.00019628724570506834, + "loss": 0.1407, + "step": 1264 + }, + { + "epoch": 0.2656725821694844, + "grad_norm": 0.7765632271766663, + "learning_rate": 0.00019627786351199936, + "loss": 0.3228, + "step": 1265 + }, + { + "epoch": 0.2658826000210018, + "grad_norm": 0.9582871794700623, + "learning_rate": 0.00019626846970420244, + "loss": 0.2351, + "step": 1266 + }, + { + "epoch": 0.26609261787251914, + "grad_norm": 0.545551061630249, + "learning_rate": 0.00019625906428281077, + "loss": 0.2201, + "step": 1267 + }, + { + "epoch": 0.26630263572403656, + "grad_norm": 0.6563419103622437, + "learning_rate": 0.00019624964724895906, + "loss": 0.2605, + "step": 1268 + }, + { + "epoch": 0.2665126535755539, + "grad_norm": 0.685642659664154, + "learning_rate": 0.00019624021860378325, + "loss": 0.2217, + "step": 1269 + }, + { + "epoch": 0.2667226714270713, + "grad_norm": 0.5008343458175659, + "learning_rate": 0.00019623077834842088, + "loss": 0.1678, + "step": 1270 + }, + { + "epoch": 0.26693268927858865, + "grad_norm": 0.7975322008132935, + "learning_rate": 0.00019622132648401076, + "loss": 0.2541, + "step": 1271 + }, + { + "epoch": 0.26714270713010607, + "grad_norm": 0.7770018577575684, + "learning_rate": 0.00019621186301169315, + "loss": 0.2721, + "step": 1272 + }, + { + "epoch": 0.26735272498162344, + "grad_norm": 0.7773646712303162, + "learning_rate": 0.00019620238793260968, + "loss": 0.2694, + "step": 1273 + }, + { + "epoch": 0.2675627428331408, + "grad_norm": 0.5535711050033569, + "learning_rate": 0.00019619290124790344, + "loss": 0.1834, + "step": 1274 + }, + { + "epoch": 0.2677727606846582, + "grad_norm": 0.6472839713096619, + "learning_rate": 0.00019618340295871888, + "loss": 0.2094, + "step": 1275 + }, + { + "epoch": 0.2679827785361756, + "grad_norm": 0.7514201998710632, + "learning_rate": 0.0001961738930662018, + "loss": 0.316, + "step": 1276 + }, + { + "epoch": 0.26819279638769294, + "grad_norm": 0.6256974935531616, + "learning_rate": 0.00019616437157149948, + "loss": 0.1446, + "step": 1277 + }, + { + "epoch": 0.2684028142392103, + "grad_norm": 0.618664026260376, + "learning_rate": 0.0001961548384757606, + "loss": 0.217, + "step": 1278 + }, + { + "epoch": 0.2686128320907277, + "grad_norm": 0.7553743124008179, + "learning_rate": 0.00019614529378013517, + "loss": 0.2514, + "step": 1279 + }, + { + "epoch": 0.2688228499422451, + "grad_norm": 0.5204927921295166, + "learning_rate": 0.00019613573748577468, + "loss": 0.2412, + "step": 1280 + }, + { + "epoch": 0.26903286779376245, + "grad_norm": 0.725058376789093, + "learning_rate": 0.0001961261695938319, + "loss": 0.2307, + "step": 1281 + }, + { + "epoch": 0.2692428856452799, + "grad_norm": 0.3503430485725403, + "learning_rate": 0.00019611659010546114, + "loss": 0.101, + "step": 1282 + }, + { + "epoch": 0.26945290349679724, + "grad_norm": 0.5544008016586304, + "learning_rate": 0.00019610699902181803, + "loss": 0.2656, + "step": 1283 + }, + { + "epoch": 0.2696629213483146, + "grad_norm": 0.6133410334587097, + "learning_rate": 0.00019609739634405963, + "loss": 0.2247, + "step": 1284 + }, + { + "epoch": 0.26987293919983196, + "grad_norm": 0.4621675908565521, + "learning_rate": 0.00019608778207334438, + "loss": 0.1388, + "step": 1285 + }, + { + "epoch": 0.2700829570513494, + "grad_norm": 0.450885146856308, + "learning_rate": 0.00019607815621083209, + "loss": 0.1387, + "step": 1286 + }, + { + "epoch": 0.27029297490286674, + "grad_norm": 0.4488065838813782, + "learning_rate": 0.000196068518757684, + "loss": 0.1359, + "step": 1287 + }, + { + "epoch": 0.2705029927543841, + "grad_norm": 0.3958616554737091, + "learning_rate": 0.00019605886971506284, + "loss": 0.1378, + "step": 1288 + }, + { + "epoch": 0.2707130106059015, + "grad_norm": 0.6814693212509155, + "learning_rate": 0.00019604920908413255, + "loss": 0.1745, + "step": 1289 + }, + { + "epoch": 0.2709230284574189, + "grad_norm": 0.5673383474349976, + "learning_rate": 0.0001960395368660586, + "loss": 0.2067, + "step": 1290 + }, + { + "epoch": 0.27113304630893625, + "grad_norm": 0.468888521194458, + "learning_rate": 0.0001960298530620078, + "loss": 0.1938, + "step": 1291 + }, + { + "epoch": 0.2713430641604536, + "grad_norm": 0.5092310309410095, + "learning_rate": 0.00019602015767314842, + "loss": 0.1773, + "step": 1292 + }, + { + "epoch": 0.27155308201197104, + "grad_norm": 0.6761277318000793, + "learning_rate": 0.00019601045070065005, + "loss": 0.2399, + "step": 1293 + }, + { + "epoch": 0.2717630998634884, + "grad_norm": 0.682894229888916, + "learning_rate": 0.00019600073214568373, + "loss": 0.173, + "step": 1294 + }, + { + "epoch": 0.27197311771500576, + "grad_norm": 0.9188515543937683, + "learning_rate": 0.0001959910020094219, + "loss": 0.2221, + "step": 1295 + }, + { + "epoch": 0.2721831355665232, + "grad_norm": 0.5459915995597839, + "learning_rate": 0.00019598126029303836, + "loss": 0.1978, + "step": 1296 + }, + { + "epoch": 0.27239315341804055, + "grad_norm": 0.7605480551719666, + "learning_rate": 0.00019597150699770835, + "loss": 0.2157, + "step": 1297 + }, + { + "epoch": 0.2726031712695579, + "grad_norm": 0.6047395467758179, + "learning_rate": 0.00019596174212460846, + "loss": 0.1816, + "step": 1298 + }, + { + "epoch": 0.2728131891210753, + "grad_norm": 0.9177975058555603, + "learning_rate": 0.00019595196567491667, + "loss": 0.4008, + "step": 1299 + }, + { + "epoch": 0.2730232069725927, + "grad_norm": 0.492226779460907, + "learning_rate": 0.00019594217764981245, + "loss": 0.1476, + "step": 1300 + }, + { + "epoch": 0.27323322482411005, + "grad_norm": 0.700484573841095, + "learning_rate": 0.00019593237805047656, + "loss": 0.466, + "step": 1301 + }, + { + "epoch": 0.2734432426756274, + "grad_norm": 0.6979876756668091, + "learning_rate": 0.00019592256687809125, + "loss": 0.3519, + "step": 1302 + }, + { + "epoch": 0.2736532605271448, + "grad_norm": 0.7417042255401611, + "learning_rate": 0.0001959127441338401, + "loss": 0.2779, + "step": 1303 + }, + { + "epoch": 0.2738632783786622, + "grad_norm": 0.6014847755432129, + "learning_rate": 0.00019590290981890803, + "loss": 0.3175, + "step": 1304 + }, + { + "epoch": 0.27407329623017956, + "grad_norm": 0.6579805016517639, + "learning_rate": 0.00019589306393448153, + "loss": 0.3319, + "step": 1305 + }, + { + "epoch": 0.2742833140816969, + "grad_norm": 0.44514089822769165, + "learning_rate": 0.0001958832064817483, + "loss": 0.172, + "step": 1306 + }, + { + "epoch": 0.27449333193321435, + "grad_norm": 0.4212436079978943, + "learning_rate": 0.0001958733374618976, + "loss": 0.1436, + "step": 1307 + }, + { + "epoch": 0.2747033497847317, + "grad_norm": 0.659275233745575, + "learning_rate": 0.00019586345687611992, + "loss": 0.1561, + "step": 1308 + }, + { + "epoch": 0.2749133676362491, + "grad_norm": 0.6682752370834351, + "learning_rate": 0.00019585356472560732, + "loss": 0.2189, + "step": 1309 + }, + { + "epoch": 0.27512338548776644, + "grad_norm": 0.5080957412719727, + "learning_rate": 0.00019584366101155307, + "loss": 0.1666, + "step": 1310 + }, + { + "epoch": 0.27533340333928386, + "grad_norm": 0.651543378829956, + "learning_rate": 0.00019583374573515198, + "loss": 0.2687, + "step": 1311 + }, + { + "epoch": 0.2755434211908012, + "grad_norm": 0.8879132270812988, + "learning_rate": 0.00019582381889760023, + "loss": 0.268, + "step": 1312 + }, + { + "epoch": 0.2757534390423186, + "grad_norm": 0.8491726517677307, + "learning_rate": 0.0001958138805000953, + "loss": 0.3921, + "step": 1313 + }, + { + "epoch": 0.275963456893836, + "grad_norm": 0.5724290013313293, + "learning_rate": 0.00019580393054383622, + "loss": 0.1724, + "step": 1314 + }, + { + "epoch": 0.27617347474535336, + "grad_norm": 0.6225562691688538, + "learning_rate": 0.00019579396903002328, + "loss": 0.2008, + "step": 1315 + }, + { + "epoch": 0.27638349259687073, + "grad_norm": 0.601390540599823, + "learning_rate": 0.0001957839959598582, + "loss": 0.2018, + "step": 1316 + }, + { + "epoch": 0.2765935104483881, + "grad_norm": 0.7996687889099121, + "learning_rate": 0.0001957740113345441, + "loss": 0.192, + "step": 1317 + }, + { + "epoch": 0.2768035282999055, + "grad_norm": 0.696975827217102, + "learning_rate": 0.00019576401515528555, + "loss": 0.249, + "step": 1318 + }, + { + "epoch": 0.2770135461514229, + "grad_norm": 0.5437553524971008, + "learning_rate": 0.00019575400742328843, + "loss": 0.1665, + "step": 1319 + }, + { + "epoch": 0.27722356400294024, + "grad_norm": 0.6747444868087769, + "learning_rate": 0.00019574398813976008, + "loss": 0.2043, + "step": 1320 + }, + { + "epoch": 0.27743358185445766, + "grad_norm": 0.7587124109268188, + "learning_rate": 0.00019573395730590915, + "loss": 0.2238, + "step": 1321 + }, + { + "epoch": 0.277643599705975, + "grad_norm": 0.8004297614097595, + "learning_rate": 0.0001957239149229458, + "loss": 0.2887, + "step": 1322 + }, + { + "epoch": 0.2778536175574924, + "grad_norm": 0.7234312295913696, + "learning_rate": 0.00019571386099208145, + "loss": 0.2323, + "step": 1323 + }, + { + "epoch": 0.27806363540900975, + "grad_norm": 0.7471593022346497, + "learning_rate": 0.000195703795514529, + "loss": 0.2379, + "step": 1324 + }, + { + "epoch": 0.27827365326052717, + "grad_norm": 0.5377416610717773, + "learning_rate": 0.00019569371849150282, + "loss": 0.2095, + "step": 1325 + }, + { + "epoch": 0.27848367111204453, + "grad_norm": 0.5749472975730896, + "learning_rate": 0.00019568362992421844, + "loss": 0.1778, + "step": 1326 + }, + { + "epoch": 0.2786936889635619, + "grad_norm": 0.8375914692878723, + "learning_rate": 0.00019567352981389298, + "loss": 0.2322, + "step": 1327 + }, + { + "epoch": 0.27890370681507926, + "grad_norm": 0.5606945157051086, + "learning_rate": 0.0001956634181617449, + "loss": 0.206, + "step": 1328 + }, + { + "epoch": 0.2791137246665967, + "grad_norm": 0.399161159992218, + "learning_rate": 0.00019565329496899406, + "loss": 0.124, + "step": 1329 + }, + { + "epoch": 0.27932374251811404, + "grad_norm": 0.6910118460655212, + "learning_rate": 0.00019564316023686163, + "loss": 0.2299, + "step": 1330 + }, + { + "epoch": 0.2795337603696314, + "grad_norm": 0.43964332342147827, + "learning_rate": 0.0001956330139665703, + "loss": 0.1785, + "step": 1331 + }, + { + "epoch": 0.2797437782211488, + "grad_norm": 0.6819096207618713, + "learning_rate": 0.0001956228561593441, + "loss": 0.1957, + "step": 1332 + }, + { + "epoch": 0.2799537960726662, + "grad_norm": 0.5263475775718689, + "learning_rate": 0.0001956126868164084, + "loss": 0.1348, + "step": 1333 + }, + { + "epoch": 0.28016381392418355, + "grad_norm": 0.5277931690216064, + "learning_rate": 0.00019560250593899002, + "loss": 0.1553, + "step": 1334 + }, + { + "epoch": 0.2803738317757009, + "grad_norm": 0.6066485643386841, + "learning_rate": 0.00019559231352831715, + "loss": 0.2063, + "step": 1335 + }, + { + "epoch": 0.28058384962721833, + "grad_norm": 0.668892502784729, + "learning_rate": 0.00019558210958561939, + "loss": 0.1936, + "step": 1336 + }, + { + "epoch": 0.2807938674787357, + "grad_norm": 0.7482750415802002, + "learning_rate": 0.00019557189411212772, + "loss": 0.2001, + "step": 1337 + }, + { + "epoch": 0.28100388533025306, + "grad_norm": 0.8485071659088135, + "learning_rate": 0.00019556166710907452, + "loss": 0.2095, + "step": 1338 + }, + { + "epoch": 0.2812139031817705, + "grad_norm": 0.6988540887832642, + "learning_rate": 0.00019555142857769354, + "loss": 0.1574, + "step": 1339 + }, + { + "epoch": 0.28142392103328784, + "grad_norm": 0.7636557817459106, + "learning_rate": 0.00019554117851921992, + "loss": 0.2992, + "step": 1340 + }, + { + "epoch": 0.2816339388848052, + "grad_norm": 0.47566983103752136, + "learning_rate": 0.00019553091693489018, + "loss": 0.1391, + "step": 1341 + }, + { + "epoch": 0.28184395673632257, + "grad_norm": 0.7958580255508423, + "learning_rate": 0.00019552064382594232, + "loss": 0.2805, + "step": 1342 + }, + { + "epoch": 0.28205397458784, + "grad_norm": 0.6596616506576538, + "learning_rate": 0.0001955103591936156, + "loss": 0.196, + "step": 1343 + }, + { + "epoch": 0.28226399243935735, + "grad_norm": 0.42053359746932983, + "learning_rate": 0.0001955000630391508, + "loss": 0.159, + "step": 1344 + }, + { + "epoch": 0.2824740102908747, + "grad_norm": 0.5475199818611145, + "learning_rate": 0.00019548975536378996, + "loss": 0.155, + "step": 1345 + }, + { + "epoch": 0.28268402814239213, + "grad_norm": 0.6333903074264526, + "learning_rate": 0.00019547943616877658, + "loss": 0.1959, + "step": 1346 + }, + { + "epoch": 0.2828940459939095, + "grad_norm": 0.891545832157135, + "learning_rate": 0.00019546910545535558, + "loss": 0.2339, + "step": 1347 + }, + { + "epoch": 0.28310406384542686, + "grad_norm": 0.5310159921646118, + "learning_rate": 0.0001954587632247732, + "loss": 0.1912, + "step": 1348 + }, + { + "epoch": 0.2833140816969442, + "grad_norm": 0.6109257340431213, + "learning_rate": 0.0001954484094782771, + "loss": 0.1883, + "step": 1349 + }, + { + "epoch": 0.28352409954846164, + "grad_norm": 0.5322039723396301, + "learning_rate": 0.00019543804421711639, + "loss": 0.2055, + "step": 1350 + }, + { + "epoch": 0.283734117399979, + "grad_norm": 0.4660230576992035, + "learning_rate": 0.00019542766744254142, + "loss": 0.2831, + "step": 1351 + }, + { + "epoch": 0.28394413525149637, + "grad_norm": 0.6315797567367554, + "learning_rate": 0.00019541727915580408, + "loss": 0.2592, + "step": 1352 + }, + { + "epoch": 0.28415415310301373, + "grad_norm": 0.488646000623703, + "learning_rate": 0.00019540687935815754, + "loss": 0.1616, + "step": 1353 + }, + { + "epoch": 0.28436417095453115, + "grad_norm": 0.6158355474472046, + "learning_rate": 0.00019539646805085648, + "loss": 0.291, + "step": 1354 + }, + { + "epoch": 0.2845741888060485, + "grad_norm": 0.9165632724761963, + "learning_rate": 0.00019538604523515682, + "loss": 0.3223, + "step": 1355 + }, + { + "epoch": 0.2847842066575659, + "grad_norm": 0.5712313055992126, + "learning_rate": 0.00019537561091231598, + "loss": 0.2613, + "step": 1356 + }, + { + "epoch": 0.2849942245090833, + "grad_norm": 0.7423316240310669, + "learning_rate": 0.00019536516508359273, + "loss": 0.26, + "step": 1357 + }, + { + "epoch": 0.28520424236060066, + "grad_norm": 0.5036366581916809, + "learning_rate": 0.00019535470775024723, + "loss": 0.1588, + "step": 1358 + }, + { + "epoch": 0.285414260212118, + "grad_norm": 0.9148122668266296, + "learning_rate": 0.00019534423891354102, + "loss": 0.3, + "step": 1359 + }, + { + "epoch": 0.2856242780636354, + "grad_norm": 0.6107314825057983, + "learning_rate": 0.00019533375857473702, + "loss": 0.2617, + "step": 1360 + }, + { + "epoch": 0.2858342959151528, + "grad_norm": 0.658970832824707, + "learning_rate": 0.00019532326673509957, + "loss": 0.1758, + "step": 1361 + }, + { + "epoch": 0.28604431376667017, + "grad_norm": 0.47163254022598267, + "learning_rate": 0.00019531276339589438, + "loss": 0.1711, + "step": 1362 + }, + { + "epoch": 0.28625433161818753, + "grad_norm": 0.4909091293811798, + "learning_rate": 0.0001953022485583886, + "loss": 0.1717, + "step": 1363 + }, + { + "epoch": 0.28646434946970495, + "grad_norm": 0.7579825520515442, + "learning_rate": 0.00019529172222385063, + "loss": 0.2174, + "step": 1364 + }, + { + "epoch": 0.2866743673212223, + "grad_norm": 0.5118494033813477, + "learning_rate": 0.00019528118439355034, + "loss": 0.306, + "step": 1365 + }, + { + "epoch": 0.2868843851727397, + "grad_norm": 0.7731876969337463, + "learning_rate": 0.00019527063506875905, + "loss": 0.3721, + "step": 1366 + }, + { + "epoch": 0.28709440302425704, + "grad_norm": 0.7073164582252502, + "learning_rate": 0.00019526007425074937, + "loss": 0.1929, + "step": 1367 + }, + { + "epoch": 0.28730442087577446, + "grad_norm": 0.5947062969207764, + "learning_rate": 0.00019524950194079534, + "loss": 0.1888, + "step": 1368 + }, + { + "epoch": 0.2875144387272918, + "grad_norm": 0.6583393812179565, + "learning_rate": 0.00019523891814017237, + "loss": 0.1993, + "step": 1369 + }, + { + "epoch": 0.2877244565788092, + "grad_norm": 0.5850254893302917, + "learning_rate": 0.0001952283228501573, + "loss": 0.2447, + "step": 1370 + }, + { + "epoch": 0.2879344744303266, + "grad_norm": 0.6584855914115906, + "learning_rate": 0.00019521771607202822, + "loss": 0.2517, + "step": 1371 + }, + { + "epoch": 0.28814449228184397, + "grad_norm": 0.6195645332336426, + "learning_rate": 0.00019520709780706486, + "loss": 0.183, + "step": 1372 + }, + { + "epoch": 0.28835451013336133, + "grad_norm": 0.6261805295944214, + "learning_rate": 0.00019519646805654802, + "loss": 0.194, + "step": 1373 + }, + { + "epoch": 0.2885645279848787, + "grad_norm": 0.9010648727416992, + "learning_rate": 0.00019518582682176018, + "loss": 0.3978, + "step": 1374 + }, + { + "epoch": 0.2887745458363961, + "grad_norm": 0.7001438736915588, + "learning_rate": 0.00019517517410398501, + "loss": 0.1781, + "step": 1375 + }, + { + "epoch": 0.2889845636879135, + "grad_norm": 0.5351212024688721, + "learning_rate": 0.00019516450990450762, + "loss": 0.1933, + "step": 1376 + }, + { + "epoch": 0.28919458153943084, + "grad_norm": 0.5710474252700806, + "learning_rate": 0.00019515383422461454, + "loss": 0.2288, + "step": 1377 + }, + { + "epoch": 0.2894045993909482, + "grad_norm": 0.6244832873344421, + "learning_rate": 0.00019514314706559364, + "loss": 0.1796, + "step": 1378 + }, + { + "epoch": 0.2896146172424656, + "grad_norm": 0.8345138430595398, + "learning_rate": 0.0001951324484287342, + "loss": 0.2747, + "step": 1379 + }, + { + "epoch": 0.289824635093983, + "grad_norm": 0.9983725547790527, + "learning_rate": 0.0001951217383153269, + "loss": 0.2806, + "step": 1380 + }, + { + "epoch": 0.29003465294550035, + "grad_norm": 0.6748480200767517, + "learning_rate": 0.00019511101672666374, + "loss": 0.1839, + "step": 1381 + }, + { + "epoch": 0.29024467079701777, + "grad_norm": 0.5453517436981201, + "learning_rate": 0.0001951002836640382, + "loss": 0.143, + "step": 1382 + }, + { + "epoch": 0.29045468864853513, + "grad_norm": 0.4747224748134613, + "learning_rate": 0.00019508953912874503, + "loss": 0.1445, + "step": 1383 + }, + { + "epoch": 0.2906647065000525, + "grad_norm": 0.8038759231567383, + "learning_rate": 0.0001950787831220804, + "loss": 0.2043, + "step": 1384 + }, + { + "epoch": 0.29087472435156986, + "grad_norm": 0.7946329116821289, + "learning_rate": 0.000195068015645342, + "loss": 0.1471, + "step": 1385 + }, + { + "epoch": 0.2910847422030873, + "grad_norm": 0.7635067105293274, + "learning_rate": 0.0001950572366998287, + "loss": 0.193, + "step": 1386 + }, + { + "epoch": 0.29129476005460464, + "grad_norm": 0.8812301754951477, + "learning_rate": 0.0001950464462868409, + "loss": 0.3098, + "step": 1387 + }, + { + "epoch": 0.291504777906122, + "grad_norm": 0.6174623370170593, + "learning_rate": 0.00019503564440768033, + "loss": 0.22, + "step": 1388 + }, + { + "epoch": 0.2917147957576394, + "grad_norm": 0.6797242164611816, + "learning_rate": 0.00019502483106365005, + "loss": 0.1588, + "step": 1389 + }, + { + "epoch": 0.2919248136091568, + "grad_norm": 0.7031803727149963, + "learning_rate": 0.0001950140062560546, + "loss": 0.179, + "step": 1390 + }, + { + "epoch": 0.29213483146067415, + "grad_norm": 0.6158111095428467, + "learning_rate": 0.00019500316998619983, + "loss": 0.1637, + "step": 1391 + }, + { + "epoch": 0.2923448493121915, + "grad_norm": 0.5845907926559448, + "learning_rate": 0.000194992322255393, + "loss": 0.1435, + "step": 1392 + }, + { + "epoch": 0.29255486716370893, + "grad_norm": 0.5280815958976746, + "learning_rate": 0.00019498146306494283, + "loss": 0.1887, + "step": 1393 + }, + { + "epoch": 0.2927648850152263, + "grad_norm": 0.5925720930099487, + "learning_rate": 0.00019497059241615922, + "loss": 0.2006, + "step": 1394 + }, + { + "epoch": 0.29297490286674366, + "grad_norm": 0.6027230620384216, + "learning_rate": 0.00019495971031035367, + "loss": 0.2644, + "step": 1395 + }, + { + "epoch": 0.2931849207182611, + "grad_norm": 0.36043769121170044, + "learning_rate": 0.00019494881674883896, + "loss": 0.1382, + "step": 1396 + }, + { + "epoch": 0.29339493856977844, + "grad_norm": 0.7824574708938599, + "learning_rate": 0.00019493791173292923, + "loss": 0.3109, + "step": 1397 + }, + { + "epoch": 0.2936049564212958, + "grad_norm": 0.590056300163269, + "learning_rate": 0.00019492699526394005, + "loss": 0.1493, + "step": 1398 + }, + { + "epoch": 0.29381497427281317, + "grad_norm": 0.4534437954425812, + "learning_rate": 0.00019491606734318837, + "loss": 0.1195, + "step": 1399 + }, + { + "epoch": 0.2940249921243306, + "grad_norm": 0.6726170778274536, + "learning_rate": 0.0001949051279719925, + "loss": 0.2224, + "step": 1400 + }, + { + "epoch": 0.29423500997584795, + "grad_norm": 0.6782220602035522, + "learning_rate": 0.00019489417715167214, + "loss": 0.2842, + "step": 1401 + }, + { + "epoch": 0.2944450278273653, + "grad_norm": 0.6347521543502808, + "learning_rate": 0.00019488321488354834, + "loss": 0.3274, + "step": 1402 + }, + { + "epoch": 0.29465504567888273, + "grad_norm": 0.5405476689338684, + "learning_rate": 0.0001948722411689436, + "loss": 0.2027, + "step": 1403 + }, + { + "epoch": 0.2948650635304001, + "grad_norm": 0.7271124124526978, + "learning_rate": 0.00019486125600918177, + "loss": 0.3642, + "step": 1404 + }, + { + "epoch": 0.29507508138191746, + "grad_norm": 0.6853488683700562, + "learning_rate": 0.00019485025940558804, + "loss": 0.2335, + "step": 1405 + }, + { + "epoch": 0.2952850992334348, + "grad_norm": 0.9300238490104675, + "learning_rate": 0.00019483925135948903, + "loss": 0.3032, + "step": 1406 + }, + { + "epoch": 0.29549511708495224, + "grad_norm": 0.4893563985824585, + "learning_rate": 0.0001948282318722127, + "loss": 0.1537, + "step": 1407 + }, + { + "epoch": 0.2957051349364696, + "grad_norm": 0.4719460606575012, + "learning_rate": 0.00019481720094508847, + "loss": 0.208, + "step": 1408 + }, + { + "epoch": 0.29591515278798697, + "grad_norm": 0.5899675488471985, + "learning_rate": 0.00019480615857944705, + "loss": 0.339, + "step": 1409 + }, + { + "epoch": 0.29612517063950433, + "grad_norm": 0.7662889361381531, + "learning_rate": 0.00019479510477662053, + "loss": 0.2625, + "step": 1410 + }, + { + "epoch": 0.29633518849102175, + "grad_norm": 0.7491856813430786, + "learning_rate": 0.00019478403953794246, + "loss": 0.272, + "step": 1411 + }, + { + "epoch": 0.2965452063425391, + "grad_norm": 0.5673351287841797, + "learning_rate": 0.00019477296286474772, + "loss": 0.2619, + "step": 1412 + }, + { + "epoch": 0.2967552241940565, + "grad_norm": 0.5551652908325195, + "learning_rate": 0.00019476187475837256, + "loss": 0.1971, + "step": 1413 + }, + { + "epoch": 0.2969652420455739, + "grad_norm": 0.6211804747581482, + "learning_rate": 0.00019475077522015463, + "loss": 0.2488, + "step": 1414 + }, + { + "epoch": 0.29717525989709126, + "grad_norm": 0.5500017404556274, + "learning_rate": 0.00019473966425143292, + "loss": 0.1745, + "step": 1415 + }, + { + "epoch": 0.2973852777486086, + "grad_norm": 0.8525585532188416, + "learning_rate": 0.00019472854185354792, + "loss": 0.2419, + "step": 1416 + }, + { + "epoch": 0.297595295600126, + "grad_norm": 0.8762658834457397, + "learning_rate": 0.0001947174080278413, + "loss": 0.2314, + "step": 1417 + }, + { + "epoch": 0.2978053134516434, + "grad_norm": 0.6880355477333069, + "learning_rate": 0.00019470626277565627, + "loss": 0.1981, + "step": 1418 + }, + { + "epoch": 0.29801533130316077, + "grad_norm": 0.4452259838581085, + "learning_rate": 0.00019469510609833736, + "loss": 0.1546, + "step": 1419 + }, + { + "epoch": 0.29822534915467813, + "grad_norm": 0.6769298911094666, + "learning_rate": 0.0001946839379972305, + "loss": 0.2219, + "step": 1420 + }, + { + "epoch": 0.29843536700619555, + "grad_norm": 0.7594902515411377, + "learning_rate": 0.00019467275847368296, + "loss": 0.2559, + "step": 1421 + }, + { + "epoch": 0.2986453848577129, + "grad_norm": 0.7277812957763672, + "learning_rate": 0.00019466156752904343, + "loss": 0.1504, + "step": 1422 + }, + { + "epoch": 0.2988554027092303, + "grad_norm": 0.47229236364364624, + "learning_rate": 0.00019465036516466192, + "loss": 0.215, + "step": 1423 + }, + { + "epoch": 0.29906542056074764, + "grad_norm": 0.7107577919960022, + "learning_rate": 0.00019463915138188994, + "loss": 0.323, + "step": 1424 + }, + { + "epoch": 0.29927543841226506, + "grad_norm": 0.7239084243774414, + "learning_rate": 0.00019462792618208017, + "loss": 0.2366, + "step": 1425 + }, + { + "epoch": 0.2994854562637824, + "grad_norm": 0.5515862107276917, + "learning_rate": 0.0001946166895665869, + "loss": 0.1774, + "step": 1426 + }, + { + "epoch": 0.2996954741152998, + "grad_norm": 0.8459638357162476, + "learning_rate": 0.00019460544153676563, + "loss": 0.3038, + "step": 1427 + }, + { + "epoch": 0.2999054919668172, + "grad_norm": 0.7625037431716919, + "learning_rate": 0.0001945941820939733, + "loss": 0.226, + "step": 1428 + }, + { + "epoch": 0.30011550981833457, + "grad_norm": 0.6378094553947449, + "learning_rate": 0.00019458291123956823, + "loss": 0.2211, + "step": 1429 + }, + { + "epoch": 0.30032552766985193, + "grad_norm": 0.7584728002548218, + "learning_rate": 0.00019457162897491018, + "loss": 0.1805, + "step": 1430 + }, + { + "epoch": 0.3005355455213693, + "grad_norm": 0.7867018580436707, + "learning_rate": 0.0001945603353013601, + "loss": 0.2666, + "step": 1431 + }, + { + "epoch": 0.3007455633728867, + "grad_norm": 0.8175768852233887, + "learning_rate": 0.00019454903022028046, + "loss": 0.2373, + "step": 1432 + }, + { + "epoch": 0.3009555812244041, + "grad_norm": 0.4827120006084442, + "learning_rate": 0.0001945377137330351, + "loss": 0.1528, + "step": 1433 + }, + { + "epoch": 0.30116559907592144, + "grad_norm": 0.7759237885475159, + "learning_rate": 0.00019452638584098925, + "loss": 0.2113, + "step": 1434 + }, + { + "epoch": 0.3013756169274388, + "grad_norm": 0.4536662995815277, + "learning_rate": 0.00019451504654550937, + "loss": 0.1506, + "step": 1435 + }, + { + "epoch": 0.3015856347789562, + "grad_norm": 0.7473581433296204, + "learning_rate": 0.00019450369584796354, + "loss": 0.2763, + "step": 1436 + }, + { + "epoch": 0.3017956526304736, + "grad_norm": 0.4402840733528137, + "learning_rate": 0.00019449233374972097, + "loss": 0.1507, + "step": 1437 + }, + { + "epoch": 0.30200567048199095, + "grad_norm": 0.39426618814468384, + "learning_rate": 0.00019448096025215242, + "loss": 0.1298, + "step": 1438 + }, + { + "epoch": 0.30221568833350837, + "grad_norm": 0.5444751977920532, + "learning_rate": 0.00019446957535662992, + "loss": 0.2057, + "step": 1439 + }, + { + "epoch": 0.30242570618502573, + "grad_norm": 1.0035810470581055, + "learning_rate": 0.00019445817906452696, + "loss": 0.2256, + "step": 1440 + }, + { + "epoch": 0.3026357240365431, + "grad_norm": 0.6751492023468018, + "learning_rate": 0.00019444677137721834, + "loss": 0.2483, + "step": 1441 + }, + { + "epoch": 0.30284574188806046, + "grad_norm": 0.6625372767448425, + "learning_rate": 0.00019443535229608024, + "loss": 0.2556, + "step": 1442 + }, + { + "epoch": 0.3030557597395779, + "grad_norm": 0.5708040595054626, + "learning_rate": 0.00019442392182249024, + "loss": 0.2081, + "step": 1443 + }, + { + "epoch": 0.30326577759109524, + "grad_norm": 0.46071097254753113, + "learning_rate": 0.00019441247995782731, + "loss": 0.1115, + "step": 1444 + }, + { + "epoch": 0.3034757954426126, + "grad_norm": 0.7548226714134216, + "learning_rate": 0.00019440102670347176, + "loss": 0.2926, + "step": 1445 + }, + { + "epoch": 0.30368581329413, + "grad_norm": 0.4947049021720886, + "learning_rate": 0.00019438956206080526, + "loss": 0.1596, + "step": 1446 + }, + { + "epoch": 0.3038958311456474, + "grad_norm": 0.5163564085960388, + "learning_rate": 0.00019437808603121087, + "loss": 0.1452, + "step": 1447 + }, + { + "epoch": 0.30410584899716475, + "grad_norm": 0.7467600107192993, + "learning_rate": 0.00019436659861607304, + "loss": 0.2338, + "step": 1448 + }, + { + "epoch": 0.3043158668486821, + "grad_norm": 0.6491103172302246, + "learning_rate": 0.00019435509981677762, + "loss": 0.2451, + "step": 1449 + }, + { + "epoch": 0.30452588470019953, + "grad_norm": 0.5478000640869141, + "learning_rate": 0.00019434358963471175, + "loss": 0.206, + "step": 1450 + }, + { + "epoch": 0.3047359025517169, + "grad_norm": 0.6403598189353943, + "learning_rate": 0.000194332068071264, + "loss": 0.3738, + "step": 1451 + }, + { + "epoch": 0.30494592040323426, + "grad_norm": 0.7929242253303528, + "learning_rate": 0.00019432053512782435, + "loss": 0.3969, + "step": 1452 + }, + { + "epoch": 0.3051559382547517, + "grad_norm": 0.5820061564445496, + "learning_rate": 0.00019430899080578407, + "loss": 0.2117, + "step": 1453 + }, + { + "epoch": 0.30536595610626904, + "grad_norm": 0.5793547034263611, + "learning_rate": 0.0001942974351065358, + "loss": 0.2539, + "step": 1454 + }, + { + "epoch": 0.3055759739577864, + "grad_norm": 0.5798085331916809, + "learning_rate": 0.00019428586803147365, + "loss": 0.2523, + "step": 1455 + }, + { + "epoch": 0.30578599180930377, + "grad_norm": 0.5463792085647583, + "learning_rate": 0.00019427428958199302, + "loss": 0.2095, + "step": 1456 + }, + { + "epoch": 0.3059960096608212, + "grad_norm": 0.7590929865837097, + "learning_rate": 0.00019426269975949073, + "loss": 0.2232, + "step": 1457 + }, + { + "epoch": 0.30620602751233855, + "grad_norm": 0.6503840088844299, + "learning_rate": 0.0001942510985653649, + "loss": 0.2576, + "step": 1458 + }, + { + "epoch": 0.3064160453638559, + "grad_norm": 0.7579488158226013, + "learning_rate": 0.0001942394860010151, + "loss": 0.2439, + "step": 1459 + }, + { + "epoch": 0.3066260632153733, + "grad_norm": 0.6022104620933533, + "learning_rate": 0.00019422786206784224, + "loss": 0.2076, + "step": 1460 + }, + { + "epoch": 0.3068360810668907, + "grad_norm": 0.5977317094802856, + "learning_rate": 0.00019421622676724863, + "loss": 0.2094, + "step": 1461 + }, + { + "epoch": 0.30704609891840806, + "grad_norm": 0.6613571047782898, + "learning_rate": 0.00019420458010063787, + "loss": 0.3956, + "step": 1462 + }, + { + "epoch": 0.3072561167699254, + "grad_norm": 0.5474089980125427, + "learning_rate": 0.00019419292206941503, + "loss": 0.2094, + "step": 1463 + }, + { + "epoch": 0.30746613462144284, + "grad_norm": 0.6770557761192322, + "learning_rate": 0.0001941812526749865, + "loss": 0.2626, + "step": 1464 + }, + { + "epoch": 0.3076761524729602, + "grad_norm": 0.7406523823738098, + "learning_rate": 0.00019416957191876, + "loss": 0.1788, + "step": 1465 + }, + { + "epoch": 0.30788617032447757, + "grad_norm": 0.6642553210258484, + "learning_rate": 0.00019415787980214472, + "loss": 0.2695, + "step": 1466 + }, + { + "epoch": 0.30809618817599493, + "grad_norm": 0.5584815740585327, + "learning_rate": 0.00019414617632655115, + "loss": 0.1823, + "step": 1467 + }, + { + "epoch": 0.30830620602751235, + "grad_norm": 0.9342876076698303, + "learning_rate": 0.00019413446149339119, + "loss": 0.2435, + "step": 1468 + }, + { + "epoch": 0.3085162238790297, + "grad_norm": 0.9271596670150757, + "learning_rate": 0.00019412273530407804, + "loss": 0.2211, + "step": 1469 + }, + { + "epoch": 0.3087262417305471, + "grad_norm": 0.7659008502960205, + "learning_rate": 0.00019411099776002637, + "loss": 0.231, + "step": 1470 + }, + { + "epoch": 0.3089362595820645, + "grad_norm": 0.5303452014923096, + "learning_rate": 0.00019409924886265215, + "loss": 0.1846, + "step": 1471 + }, + { + "epoch": 0.30914627743358186, + "grad_norm": 0.6797159314155579, + "learning_rate": 0.00019408748861337273, + "loss": 0.1999, + "step": 1472 + }, + { + "epoch": 0.3093562952850992, + "grad_norm": 0.8978555798530579, + "learning_rate": 0.00019407571701360684, + "loss": 0.2137, + "step": 1473 + }, + { + "epoch": 0.3095663131366166, + "grad_norm": 0.7389368414878845, + "learning_rate": 0.0001940639340647746, + "loss": 0.21, + "step": 1474 + }, + { + "epoch": 0.309776330988134, + "grad_norm": 0.9239479899406433, + "learning_rate": 0.00019405213976829745, + "loss": 0.2784, + "step": 1475 + }, + { + "epoch": 0.30998634883965137, + "grad_norm": 0.5426455140113831, + "learning_rate": 0.00019404033412559826, + "loss": 0.1512, + "step": 1476 + }, + { + "epoch": 0.31019636669116873, + "grad_norm": 0.7385450005531311, + "learning_rate": 0.0001940285171381012, + "loss": 0.3286, + "step": 1477 + }, + { + "epoch": 0.31040638454268615, + "grad_norm": 1.2557915449142456, + "learning_rate": 0.00019401668880723183, + "loss": 0.2918, + "step": 1478 + }, + { + "epoch": 0.3106164023942035, + "grad_norm": 0.503313422203064, + "learning_rate": 0.0001940048491344171, + "loss": 0.1517, + "step": 1479 + }, + { + "epoch": 0.3108264202457209, + "grad_norm": 0.7646273374557495, + "learning_rate": 0.00019399299812108538, + "loss": 0.1937, + "step": 1480 + }, + { + "epoch": 0.31103643809723824, + "grad_norm": 0.7608271837234497, + "learning_rate": 0.00019398113576866627, + "loss": 0.2097, + "step": 1481 + }, + { + "epoch": 0.31124645594875566, + "grad_norm": 0.4755302667617798, + "learning_rate": 0.00019396926207859084, + "loss": 0.1295, + "step": 1482 + }, + { + "epoch": 0.311456473800273, + "grad_norm": 0.6126772165298462, + "learning_rate": 0.00019395737705229152, + "loss": 0.2683, + "step": 1483 + }, + { + "epoch": 0.3116664916517904, + "grad_norm": 0.5116230249404907, + "learning_rate": 0.0001939454806912021, + "loss": 0.1898, + "step": 1484 + }, + { + "epoch": 0.3118765095033078, + "grad_norm": 0.5085896849632263, + "learning_rate": 0.00019393357299675765, + "loss": 0.1396, + "step": 1485 + }, + { + "epoch": 0.31208652735482517, + "grad_norm": 0.6756225228309631, + "learning_rate": 0.0001939216539703948, + "loss": 0.246, + "step": 1486 + }, + { + "epoch": 0.31229654520634254, + "grad_norm": 0.7151501178741455, + "learning_rate": 0.00019390972361355132, + "loss": 0.2391, + "step": 1487 + }, + { + "epoch": 0.3125065630578599, + "grad_norm": 0.5057124495506287, + "learning_rate": 0.00019389778192766655, + "loss": 0.1646, + "step": 1488 + }, + { + "epoch": 0.3127165809093773, + "grad_norm": 0.4521079957485199, + "learning_rate": 0.0001938858289141811, + "loss": 0.1356, + "step": 1489 + }, + { + "epoch": 0.3129265987608947, + "grad_norm": 0.5103457570075989, + "learning_rate": 0.00019387386457453686, + "loss": 0.1519, + "step": 1490 + }, + { + "epoch": 0.31313661661241204, + "grad_norm": 0.6395041346549988, + "learning_rate": 0.0001938618889101773, + "loss": 0.2737, + "step": 1491 + }, + { + "epoch": 0.3133466344639294, + "grad_norm": 0.6011154055595398, + "learning_rate": 0.00019384990192254704, + "loss": 0.2148, + "step": 1492 + }, + { + "epoch": 0.3135566523154468, + "grad_norm": 0.554009735584259, + "learning_rate": 0.0001938379036130922, + "loss": 0.1939, + "step": 1493 + }, + { + "epoch": 0.3137666701669642, + "grad_norm": 0.6606349349021912, + "learning_rate": 0.00019382589398326023, + "loss": 0.2804, + "step": 1494 + }, + { + "epoch": 0.31397668801848155, + "grad_norm": 0.5309305787086487, + "learning_rate": 0.00019381387303449995, + "loss": 0.2004, + "step": 1495 + }, + { + "epoch": 0.314186705869999, + "grad_norm": 0.6982274055480957, + "learning_rate": 0.0001938018407682615, + "loss": 0.2301, + "step": 1496 + }, + { + "epoch": 0.31439672372151634, + "grad_norm": 0.6347817778587341, + "learning_rate": 0.00019378979718599645, + "loss": 0.2073, + "step": 1497 + }, + { + "epoch": 0.3146067415730337, + "grad_norm": 0.5214099884033203, + "learning_rate": 0.00019377774228915775, + "loss": 0.1831, + "step": 1498 + }, + { + "epoch": 0.31481675942455106, + "grad_norm": 0.5580305457115173, + "learning_rate": 0.0001937656760791996, + "loss": 0.2311, + "step": 1499 + }, + { + "epoch": 0.3150267772760685, + "grad_norm": 0.7240638732910156, + "learning_rate": 0.00019375359855757767, + "loss": 0.2134, + "step": 1500 + }, + { + "epoch": 0.31523679512758584, + "grad_norm": 0.681068480014801, + "learning_rate": 0.00019374150972574896, + "loss": 0.2452, + "step": 1501 + }, + { + "epoch": 0.3154468129791032, + "grad_norm": 0.723272442817688, + "learning_rate": 0.00019372940958517184, + "loss": 0.2387, + "step": 1502 + }, + { + "epoch": 0.3156568308306206, + "grad_norm": 0.5136252045631409, + "learning_rate": 0.00019371729813730606, + "loss": 0.161, + "step": 1503 + }, + { + "epoch": 0.315866848682138, + "grad_norm": 0.5376479625701904, + "learning_rate": 0.00019370517538361268, + "loss": 0.2566, + "step": 1504 + }, + { + "epoch": 0.31607686653365535, + "grad_norm": 0.4473995566368103, + "learning_rate": 0.00019369304132555416, + "loss": 0.2606, + "step": 1505 + }, + { + "epoch": 0.3162868843851727, + "grad_norm": 0.6067336797714233, + "learning_rate": 0.00019368089596459438, + "loss": 0.1801, + "step": 1506 + }, + { + "epoch": 0.31649690223669014, + "grad_norm": 0.654007077217102, + "learning_rate": 0.00019366873930219846, + "loss": 0.1709, + "step": 1507 + }, + { + "epoch": 0.3167069200882075, + "grad_norm": 0.803130567073822, + "learning_rate": 0.00019365657133983298, + "loss": 0.2791, + "step": 1508 + }, + { + "epoch": 0.31691693793972486, + "grad_norm": 0.6500551104545593, + "learning_rate": 0.00019364439207896584, + "loss": 0.2356, + "step": 1509 + }, + { + "epoch": 0.3171269557912423, + "grad_norm": 0.644919753074646, + "learning_rate": 0.00019363220152106636, + "loss": 0.3375, + "step": 1510 + }, + { + "epoch": 0.31733697364275965, + "grad_norm": 0.6162266135215759, + "learning_rate": 0.00019361999966760514, + "loss": 0.2503, + "step": 1511 + }, + { + "epoch": 0.317546991494277, + "grad_norm": 0.7240517139434814, + "learning_rate": 0.00019360778652005416, + "loss": 0.2265, + "step": 1512 + }, + { + "epoch": 0.3177570093457944, + "grad_norm": 0.6839559674263, + "learning_rate": 0.00019359556207988683, + "loss": 0.2455, + "step": 1513 + }, + { + "epoch": 0.3179670271973118, + "grad_norm": 0.6277573108673096, + "learning_rate": 0.00019358332634857787, + "loss": 0.1721, + "step": 1514 + }, + { + "epoch": 0.31817704504882915, + "grad_norm": 0.7677130103111267, + "learning_rate": 0.00019357107932760334, + "loss": 0.3157, + "step": 1515 + }, + { + "epoch": 0.3183870629003465, + "grad_norm": 0.8040608167648315, + "learning_rate": 0.00019355882101844074, + "loss": 0.2176, + "step": 1516 + }, + { + "epoch": 0.3185970807518639, + "grad_norm": 0.8311493396759033, + "learning_rate": 0.0001935465514225688, + "loss": 0.2872, + "step": 1517 + }, + { + "epoch": 0.3188070986033813, + "grad_norm": 0.5430156588554382, + "learning_rate": 0.0001935342705414678, + "loss": 0.14, + "step": 1518 + }, + { + "epoch": 0.31901711645489866, + "grad_norm": 0.6012011766433716, + "learning_rate": 0.00019352197837661922, + "loss": 0.2076, + "step": 1519 + }, + { + "epoch": 0.319227134306416, + "grad_norm": 0.6674286127090454, + "learning_rate": 0.0001935096749295059, + "loss": 0.2552, + "step": 1520 + }, + { + "epoch": 0.31943715215793345, + "grad_norm": 0.8329024910926819, + "learning_rate": 0.0001934973602016122, + "loss": 0.369, + "step": 1521 + }, + { + "epoch": 0.3196471700094508, + "grad_norm": 0.5887104868888855, + "learning_rate": 0.0001934850341944237, + "loss": 0.2622, + "step": 1522 + }, + { + "epoch": 0.3198571878609682, + "grad_norm": 0.5798863172531128, + "learning_rate": 0.00019347269690942736, + "loss": 0.1691, + "step": 1523 + }, + { + "epoch": 0.32006720571248554, + "grad_norm": 0.5610338449478149, + "learning_rate": 0.00019346034834811154, + "loss": 0.236, + "step": 1524 + }, + { + "epoch": 0.32027722356400296, + "grad_norm": 0.6147192120552063, + "learning_rate": 0.00019344798851196596, + "loss": 0.3255, + "step": 1525 + }, + { + "epoch": 0.3204872414155203, + "grad_norm": 0.5580636262893677, + "learning_rate": 0.00019343561740248165, + "loss": 0.2891, + "step": 1526 + }, + { + "epoch": 0.3206972592670377, + "grad_norm": 0.6528330445289612, + "learning_rate": 0.00019342323502115103, + "loss": 0.2725, + "step": 1527 + }, + { + "epoch": 0.3209072771185551, + "grad_norm": 0.641440212726593, + "learning_rate": 0.00019341084136946786, + "loss": 0.2535, + "step": 1528 + }, + { + "epoch": 0.32111729497007246, + "grad_norm": 0.5117334127426147, + "learning_rate": 0.00019339843644892735, + "loss": 0.2805, + "step": 1529 + }, + { + "epoch": 0.32132731282158983, + "grad_norm": 0.6867427825927734, + "learning_rate": 0.00019338602026102594, + "loss": 0.2848, + "step": 1530 + }, + { + "epoch": 0.3215373306731072, + "grad_norm": 0.635741651058197, + "learning_rate": 0.0001933735928072615, + "loss": 0.2608, + "step": 1531 + }, + { + "epoch": 0.3217473485246246, + "grad_norm": 0.745174765586853, + "learning_rate": 0.00019336115408913327, + "loss": 0.1958, + "step": 1532 + }, + { + "epoch": 0.321957366376142, + "grad_norm": 0.5438898205757141, + "learning_rate": 0.0001933487041081418, + "loss": 0.1649, + "step": 1533 + }, + { + "epoch": 0.32216738422765934, + "grad_norm": 0.6228841543197632, + "learning_rate": 0.00019333624286578904, + "loss": 0.2266, + "step": 1534 + }, + { + "epoch": 0.32237740207917676, + "grad_norm": 0.9263353943824768, + "learning_rate": 0.00019332377036357826, + "loss": 0.2374, + "step": 1535 + }, + { + "epoch": 0.3225874199306941, + "grad_norm": 0.5164491534233093, + "learning_rate": 0.00019331128660301418, + "loss": 0.1553, + "step": 1536 + }, + { + "epoch": 0.3227974377822115, + "grad_norm": 0.6456015110015869, + "learning_rate": 0.00019329879158560274, + "loss": 0.2299, + "step": 1537 + }, + { + "epoch": 0.32300745563372885, + "grad_norm": 0.735962450504303, + "learning_rate": 0.00019328628531285134, + "loss": 0.1944, + "step": 1538 + }, + { + "epoch": 0.32321747348524626, + "grad_norm": 0.5499973893165588, + "learning_rate": 0.0001932737677862687, + "loss": 0.2006, + "step": 1539 + }, + { + "epoch": 0.32342749133676363, + "grad_norm": 0.5221630334854126, + "learning_rate": 0.0001932612390073649, + "loss": 0.1536, + "step": 1540 + }, + { + "epoch": 0.323637509188281, + "grad_norm": 0.6822124123573303, + "learning_rate": 0.00019324869897765137, + "loss": 0.2061, + "step": 1541 + }, + { + "epoch": 0.32384752703979836, + "grad_norm": 0.6536619663238525, + "learning_rate": 0.00019323614769864095, + "loss": 0.1727, + "step": 1542 + }, + { + "epoch": 0.3240575448913158, + "grad_norm": 0.5224418640136719, + "learning_rate": 0.00019322358517184774, + "loss": 0.1519, + "step": 1543 + }, + { + "epoch": 0.32426756274283314, + "grad_norm": 0.6163767576217651, + "learning_rate": 0.00019321101139878729, + "loss": 0.1736, + "step": 1544 + }, + { + "epoch": 0.3244775805943505, + "grad_norm": 0.7845442295074463, + "learning_rate": 0.00019319842638097648, + "loss": 0.3157, + "step": 1545 + }, + { + "epoch": 0.3246875984458679, + "grad_norm": 0.5813778638839722, + "learning_rate": 0.0001931858301199335, + "loss": 0.1722, + "step": 1546 + }, + { + "epoch": 0.3248976162973853, + "grad_norm": 0.6266569495201111, + "learning_rate": 0.00019317322261717794, + "loss": 0.1653, + "step": 1547 + }, + { + "epoch": 0.32510763414890265, + "grad_norm": 0.6247707009315491, + "learning_rate": 0.00019316060387423076, + "loss": 0.2562, + "step": 1548 + }, + { + "epoch": 0.32531765200042, + "grad_norm": 0.777686357498169, + "learning_rate": 0.00019314797389261424, + "loss": 0.1978, + "step": 1549 + }, + { + "epoch": 0.32552766985193743, + "grad_norm": 0.5679663419723511, + "learning_rate": 0.00019313533267385205, + "loss": 0.134, + "step": 1550 + }, + { + "epoch": 0.3257376877034548, + "grad_norm": 0.4159435033798218, + "learning_rate": 0.00019312268021946918, + "loss": 0.2976, + "step": 1551 + }, + { + "epoch": 0.32594770555497216, + "grad_norm": 0.737212061882019, + "learning_rate": 0.00019311001653099193, + "loss": 0.2798, + "step": 1552 + }, + { + "epoch": 0.3261577234064896, + "grad_norm": 0.7335833311080933, + "learning_rate": 0.00019309734160994816, + "loss": 0.2681, + "step": 1553 + }, + { + "epoch": 0.32636774125800694, + "grad_norm": 0.5905042290687561, + "learning_rate": 0.00019308465545786683, + "loss": 0.2378, + "step": 1554 + }, + { + "epoch": 0.3265777591095243, + "grad_norm": 0.5820741057395935, + "learning_rate": 0.00019307195807627837, + "loss": 0.2429, + "step": 1555 + }, + { + "epoch": 0.32678777696104166, + "grad_norm": 0.6299948692321777, + "learning_rate": 0.00019305924946671463, + "loss": 0.1792, + "step": 1556 + }, + { + "epoch": 0.3269977948125591, + "grad_norm": 0.6246170997619629, + "learning_rate": 0.0001930465296307087, + "loss": 0.203, + "step": 1557 + }, + { + "epoch": 0.32720781266407645, + "grad_norm": 0.5834625363349915, + "learning_rate": 0.00019303379856979501, + "loss": 0.2687, + "step": 1558 + }, + { + "epoch": 0.3274178305155938, + "grad_norm": 0.648408830165863, + "learning_rate": 0.00019302105628550952, + "loss": 0.2195, + "step": 1559 + }, + { + "epoch": 0.32762784836711123, + "grad_norm": 0.7197003960609436, + "learning_rate": 0.00019300830277938936, + "loss": 0.2717, + "step": 1560 + }, + { + "epoch": 0.3278378662186286, + "grad_norm": 0.6736263036727905, + "learning_rate": 0.0001929955380529731, + "loss": 0.4104, + "step": 1561 + }, + { + "epoch": 0.32804788407014596, + "grad_norm": 0.5383338332176208, + "learning_rate": 0.00019298276210780068, + "loss": 0.2272, + "step": 1562 + }, + { + "epoch": 0.3282579019216633, + "grad_norm": 0.5980930328369141, + "learning_rate": 0.0001929699749454133, + "loss": 0.237, + "step": 1563 + }, + { + "epoch": 0.32846791977318074, + "grad_norm": 0.5906263589859009, + "learning_rate": 0.00019295717656735357, + "loss": 0.2028, + "step": 1564 + }, + { + "epoch": 0.3286779376246981, + "grad_norm": 0.5565531849861145, + "learning_rate": 0.0001929443669751655, + "loss": 0.1398, + "step": 1565 + }, + { + "epoch": 0.32888795547621547, + "grad_norm": 0.5388147830963135, + "learning_rate": 0.00019293154617039437, + "loss": 0.1712, + "step": 1566 + }, + { + "epoch": 0.3290979733277329, + "grad_norm": 0.5639932751655579, + "learning_rate": 0.00019291871415458688, + "loss": 0.1461, + "step": 1567 + }, + { + "epoch": 0.32930799117925025, + "grad_norm": 0.8730794787406921, + "learning_rate": 0.00019290587092929106, + "loss": 0.3195, + "step": 1568 + }, + { + "epoch": 0.3295180090307676, + "grad_norm": 0.8476155400276184, + "learning_rate": 0.00019289301649605625, + "loss": 0.2727, + "step": 1569 + }, + { + "epoch": 0.329728026882285, + "grad_norm": 0.8710721135139465, + "learning_rate": 0.0001928801508564332, + "loss": 0.2103, + "step": 1570 + }, + { + "epoch": 0.3299380447338024, + "grad_norm": 0.8265402913093567, + "learning_rate": 0.000192867274011974, + "loss": 0.3057, + "step": 1571 + }, + { + "epoch": 0.33014806258531976, + "grad_norm": 0.7343376278877258, + "learning_rate": 0.00019285438596423204, + "loss": 0.3153, + "step": 1572 + }, + { + "epoch": 0.3303580804368371, + "grad_norm": 0.6627643704414368, + "learning_rate": 0.00019284148671476215, + "loss": 0.2521, + "step": 1573 + }, + { + "epoch": 0.3305680982883545, + "grad_norm": 0.6346861124038696, + "learning_rate": 0.0001928285762651204, + "loss": 0.1335, + "step": 1574 + }, + { + "epoch": 0.3307781161398719, + "grad_norm": 0.7667282223701477, + "learning_rate": 0.00019281565461686437, + "loss": 0.3119, + "step": 1575 + }, + { + "epoch": 0.33098813399138927, + "grad_norm": 0.6494827270507812, + "learning_rate": 0.00019280272177155282, + "loss": 0.2105, + "step": 1576 + }, + { + "epoch": 0.33119815184290663, + "grad_norm": 0.7444926500320435, + "learning_rate": 0.00019278977773074595, + "loss": 0.2244, + "step": 1577 + }, + { + "epoch": 0.33140816969442405, + "grad_norm": 0.5099835991859436, + "learning_rate": 0.00019277682249600536, + "loss": 0.1396, + "step": 1578 + }, + { + "epoch": 0.3316181875459414, + "grad_norm": 0.7752482295036316, + "learning_rate": 0.00019276385606889384, + "loss": 0.2423, + "step": 1579 + }, + { + "epoch": 0.3318282053974588, + "grad_norm": 0.8107450008392334, + "learning_rate": 0.0001927508784509757, + "loss": 0.348, + "step": 1580 + }, + { + "epoch": 0.33203822324897614, + "grad_norm": 0.7489921450614929, + "learning_rate": 0.00019273788964381647, + "loss": 0.2613, + "step": 1581 + }, + { + "epoch": 0.33224824110049356, + "grad_norm": 0.7400534152984619, + "learning_rate": 0.00019272488964898316, + "loss": 0.2607, + "step": 1582 + }, + { + "epoch": 0.3324582589520109, + "grad_norm": 0.5672757029533386, + "learning_rate": 0.00019271187846804403, + "loss": 0.2251, + "step": 1583 + }, + { + "epoch": 0.3326682768035283, + "grad_norm": 0.7830666303634644, + "learning_rate": 0.00019269885610256865, + "loss": 0.2471, + "step": 1584 + }, + { + "epoch": 0.3328782946550457, + "grad_norm": 0.753364622592926, + "learning_rate": 0.00019268582255412814, + "loss": 0.2339, + "step": 1585 + }, + { + "epoch": 0.33308831250656307, + "grad_norm": 0.6402462124824524, + "learning_rate": 0.0001926727778242947, + "loss": 0.1535, + "step": 1586 + }, + { + "epoch": 0.33329833035808043, + "grad_norm": 0.9237805008888245, + "learning_rate": 0.00019265972191464213, + "loss": 0.1999, + "step": 1587 + }, + { + "epoch": 0.3335083482095978, + "grad_norm": 0.8045313954353333, + "learning_rate": 0.00019264665482674536, + "loss": 0.2769, + "step": 1588 + }, + { + "epoch": 0.3337183660611152, + "grad_norm": 0.6476468443870544, + "learning_rate": 0.0001926335765621808, + "loss": 0.2729, + "step": 1589 + }, + { + "epoch": 0.3339283839126326, + "grad_norm": 0.5111613869667053, + "learning_rate": 0.00019262048712252624, + "loss": 0.1721, + "step": 1590 + }, + { + "epoch": 0.33413840176414994, + "grad_norm": 0.6063307523727417, + "learning_rate": 0.00019260738650936073, + "loss": 0.1926, + "step": 1591 + }, + { + "epoch": 0.33434841961566736, + "grad_norm": 0.5078137516975403, + "learning_rate": 0.00019259427472426467, + "loss": 0.2166, + "step": 1592 + }, + { + "epoch": 0.3345584374671847, + "grad_norm": 0.5363006591796875, + "learning_rate": 0.00019258115176881986, + "loss": 0.233, + "step": 1593 + }, + { + "epoch": 0.3347684553187021, + "grad_norm": 0.4900319576263428, + "learning_rate": 0.00019256801764460936, + "loss": 0.1666, + "step": 1594 + }, + { + "epoch": 0.33497847317021945, + "grad_norm": 0.5940386056900024, + "learning_rate": 0.00019255487235321774, + "loss": 0.1563, + "step": 1595 + }, + { + "epoch": 0.33518849102173687, + "grad_norm": 0.6830379366874695, + "learning_rate": 0.00019254171589623076, + "loss": 0.2101, + "step": 1596 + }, + { + "epoch": 0.33539850887325423, + "grad_norm": 0.7788860201835632, + "learning_rate": 0.00019252854827523557, + "loss": 0.2819, + "step": 1597 + }, + { + "epoch": 0.3356085267247716, + "grad_norm": 0.4878160357475281, + "learning_rate": 0.0001925153694918207, + "loss": 0.173, + "step": 1598 + }, + { + "epoch": 0.33581854457628896, + "grad_norm": 0.643084704875946, + "learning_rate": 0.00019250217954757602, + "loss": 0.1881, + "step": 1599 + }, + { + "epoch": 0.3360285624278064, + "grad_norm": 0.5981395244598389, + "learning_rate": 0.0001924889784440927, + "loss": 0.2883, + "step": 1600 + }, + { + "epoch": 0.33623858027932374, + "grad_norm": 0.4534986615180969, + "learning_rate": 0.0001924757661829633, + "loss": 0.3139, + "step": 1601 + }, + { + "epoch": 0.3364485981308411, + "grad_norm": 0.5547178387641907, + "learning_rate": 0.00019246254276578174, + "loss": 0.3448, + "step": 1602 + }, + { + "epoch": 0.3366586159823585, + "grad_norm": 0.5542036890983582, + "learning_rate": 0.00019244930819414325, + "loss": 0.3081, + "step": 1603 + }, + { + "epoch": 0.3368686338338759, + "grad_norm": 0.5334186553955078, + "learning_rate": 0.00019243606246964438, + "loss": 0.2045, + "step": 1604 + }, + { + "epoch": 0.33707865168539325, + "grad_norm": 0.4146350026130676, + "learning_rate": 0.00019242280559388311, + "loss": 0.2225, + "step": 1605 + }, + { + "epoch": 0.3372886695369106, + "grad_norm": 0.726662814617157, + "learning_rate": 0.0001924095375684587, + "loss": 0.2042, + "step": 1606 + }, + { + "epoch": 0.33749868738842803, + "grad_norm": 0.6827337741851807, + "learning_rate": 0.00019239625839497174, + "loss": 0.2285, + "step": 1607 + }, + { + "epoch": 0.3377087052399454, + "grad_norm": 0.4500672519207001, + "learning_rate": 0.00019238296807502428, + "loss": 0.1484, + "step": 1608 + }, + { + "epoch": 0.33791872309146276, + "grad_norm": 0.6254449486732483, + "learning_rate": 0.00019236966661021954, + "loss": 0.1773, + "step": 1609 + }, + { + "epoch": 0.3381287409429802, + "grad_norm": 0.5149936676025391, + "learning_rate": 0.00019235635400216222, + "loss": 0.1525, + "step": 1610 + }, + { + "epoch": 0.33833875879449754, + "grad_norm": 0.4187399744987488, + "learning_rate": 0.00019234303025245835, + "loss": 0.1808, + "step": 1611 + }, + { + "epoch": 0.3385487766460149, + "grad_norm": 0.6186479330062866, + "learning_rate": 0.00019232969536271522, + "loss": 0.3883, + "step": 1612 + }, + { + "epoch": 0.33875879449753227, + "grad_norm": 0.5121021270751953, + "learning_rate": 0.00019231634933454154, + "loss": 0.1468, + "step": 1613 + }, + { + "epoch": 0.3389688123490497, + "grad_norm": 0.5343567132949829, + "learning_rate": 0.00019230299216954736, + "loss": 0.1794, + "step": 1614 + }, + { + "epoch": 0.33917883020056705, + "grad_norm": 0.933382511138916, + "learning_rate": 0.000192289623869344, + "loss": 0.3237, + "step": 1615 + }, + { + "epoch": 0.3393888480520844, + "grad_norm": 1.0121095180511475, + "learning_rate": 0.00019227624443554425, + "loss": 0.2953, + "step": 1616 + }, + { + "epoch": 0.33959886590360183, + "grad_norm": 0.6155586242675781, + "learning_rate": 0.00019226285386976212, + "loss": 0.1835, + "step": 1617 + }, + { + "epoch": 0.3398088837551192, + "grad_norm": 0.5891084671020508, + "learning_rate": 0.00019224945217361306, + "loss": 0.2457, + "step": 1618 + }, + { + "epoch": 0.34001890160663656, + "grad_norm": 1.027543067932129, + "learning_rate": 0.0001922360393487138, + "loss": 0.2847, + "step": 1619 + }, + { + "epoch": 0.3402289194581539, + "grad_norm": 0.5866810083389282, + "learning_rate": 0.0001922226153966824, + "loss": 0.2284, + "step": 1620 + }, + { + "epoch": 0.34043893730967134, + "grad_norm": 0.7704328298568726, + "learning_rate": 0.00019220918031913833, + "loss": 0.2465, + "step": 1621 + }, + { + "epoch": 0.3406489551611887, + "grad_norm": 0.7415633797645569, + "learning_rate": 0.00019219573411770235, + "loss": 0.1966, + "step": 1622 + }, + { + "epoch": 0.34085897301270607, + "grad_norm": 0.8156745433807373, + "learning_rate": 0.00019218227679399657, + "loss": 0.2097, + "step": 1623 + }, + { + "epoch": 0.34106899086422343, + "grad_norm": 0.7245871424674988, + "learning_rate": 0.00019216880834964448, + "loss": 0.2093, + "step": 1624 + }, + { + "epoch": 0.34127900871574085, + "grad_norm": 0.6942924857139587, + "learning_rate": 0.00019215532878627084, + "loss": 0.2203, + "step": 1625 + }, + { + "epoch": 0.3414890265672582, + "grad_norm": 0.7590915560722351, + "learning_rate": 0.00019214183810550183, + "loss": 0.209, + "step": 1626 + }, + { + "epoch": 0.3416990444187756, + "grad_norm": 0.6332980990409851, + "learning_rate": 0.0001921283363089649, + "loss": 0.2052, + "step": 1627 + }, + { + "epoch": 0.341909062270293, + "grad_norm": 0.7253382802009583, + "learning_rate": 0.00019211482339828893, + "loss": 0.2334, + "step": 1628 + }, + { + "epoch": 0.34211908012181036, + "grad_norm": 0.6251415014266968, + "learning_rate": 0.000192101299375104, + "loss": 0.1544, + "step": 1629 + }, + { + "epoch": 0.3423290979733277, + "grad_norm": 0.5384700894355774, + "learning_rate": 0.00019208776424104165, + "loss": 0.1295, + "step": 1630 + }, + { + "epoch": 0.3425391158248451, + "grad_norm": 0.788681149482727, + "learning_rate": 0.00019207421799773475, + "loss": 0.2974, + "step": 1631 + }, + { + "epoch": 0.3427491336763625, + "grad_norm": 0.48648181557655334, + "learning_rate": 0.0001920606606468175, + "loss": 0.1941, + "step": 1632 + }, + { + "epoch": 0.34295915152787987, + "grad_norm": 1.0241416692733765, + "learning_rate": 0.00019204709218992536, + "loss": 0.195, + "step": 1633 + }, + { + "epoch": 0.34316916937939723, + "grad_norm": 0.45205456018447876, + "learning_rate": 0.00019203351262869525, + "loss": 0.1465, + "step": 1634 + }, + { + "epoch": 0.34337918723091465, + "grad_norm": 0.5081611275672913, + "learning_rate": 0.00019201992196476533, + "loss": 0.1738, + "step": 1635 + }, + { + "epoch": 0.343589205082432, + "grad_norm": 0.8194214701652527, + "learning_rate": 0.00019200632019977521, + "loss": 0.2898, + "step": 1636 + }, + { + "epoch": 0.3437992229339494, + "grad_norm": 0.28592121601104736, + "learning_rate": 0.00019199270733536572, + "loss": 0.1141, + "step": 1637 + }, + { + "epoch": 0.34400924078546674, + "grad_norm": 0.7359632849693298, + "learning_rate": 0.0001919790833731791, + "loss": 0.3353, + "step": 1638 + }, + { + "epoch": 0.34421925863698416, + "grad_norm": 1.307955026626587, + "learning_rate": 0.00019196544831485892, + "loss": 0.2808, + "step": 1639 + }, + { + "epoch": 0.3444292764885015, + "grad_norm": 0.7657724618911743, + "learning_rate": 0.00019195180216205007, + "loss": 0.2771, + "step": 1640 + }, + { + "epoch": 0.3446392943400189, + "grad_norm": 0.4949481785297394, + "learning_rate": 0.0001919381449163988, + "loss": 0.1887, + "step": 1641 + }, + { + "epoch": 0.3448493121915363, + "grad_norm": 0.5673692226409912, + "learning_rate": 0.00019192447657955262, + "loss": 0.1795, + "step": 1642 + }, + { + "epoch": 0.34505933004305367, + "grad_norm": 0.6279990673065186, + "learning_rate": 0.00019191079715316056, + "loss": 0.1995, + "step": 1643 + }, + { + "epoch": 0.34526934789457103, + "grad_norm": 0.7078985571861267, + "learning_rate": 0.0001918971066388728, + "loss": 0.2012, + "step": 1644 + }, + { + "epoch": 0.3454793657460884, + "grad_norm": 0.47900810837745667, + "learning_rate": 0.00019188340503834095, + "loss": 0.2149, + "step": 1645 + }, + { + "epoch": 0.3456893835976058, + "grad_norm": 0.9682589769363403, + "learning_rate": 0.0001918696923532179, + "loss": 0.2038, + "step": 1646 + }, + { + "epoch": 0.3458994014491232, + "grad_norm": 0.6839099526405334, + "learning_rate": 0.000191855968585158, + "loss": 0.192, + "step": 1647 + }, + { + "epoch": 0.34610941930064054, + "grad_norm": 0.44559231400489807, + "learning_rate": 0.0001918422337358168, + "loss": 0.1752, + "step": 1648 + }, + { + "epoch": 0.34631943715215796, + "grad_norm": 0.48147323727607727, + "learning_rate": 0.00019182848780685115, + "loss": 0.1978, + "step": 1649 + }, + { + "epoch": 0.3465294550036753, + "grad_norm": 0.4760332405567169, + "learning_rate": 0.0001918147307999195, + "loss": 0.197, + "step": 1650 + }, + { + "epoch": 0.3467394728551927, + "grad_norm": 0.46297112107276917, + "learning_rate": 0.00019180096271668138, + "loss": 0.1567, + "step": 1651 + }, + { + "epoch": 0.34694949070671005, + "grad_norm": 0.5012452602386475, + "learning_rate": 0.0001917871835587977, + "loss": 0.218, + "step": 1652 + }, + { + "epoch": 0.34715950855822747, + "grad_norm": 0.5509222149848938, + "learning_rate": 0.00019177339332793078, + "loss": 0.2306, + "step": 1653 + }, + { + "epoch": 0.34736952640974483, + "grad_norm": 0.6989178657531738, + "learning_rate": 0.00019175959202574427, + "loss": 0.3619, + "step": 1654 + }, + { + "epoch": 0.3475795442612622, + "grad_norm": 0.6114615201950073, + "learning_rate": 0.00019174577965390304, + "loss": 0.2084, + "step": 1655 + }, + { + "epoch": 0.34778956211277956, + "grad_norm": 0.600771963596344, + "learning_rate": 0.0001917319562140735, + "loss": 0.1736, + "step": 1656 + }, + { + "epoch": 0.347999579964297, + "grad_norm": 0.8332004547119141, + "learning_rate": 0.00019171812170792318, + "loss": 0.2071, + "step": 1657 + }, + { + "epoch": 0.34820959781581434, + "grad_norm": 0.4657239317893982, + "learning_rate": 0.0001917042761371211, + "loss": 0.1708, + "step": 1658 + }, + { + "epoch": 0.3484196156673317, + "grad_norm": 0.6832460761070251, + "learning_rate": 0.0001916904195033375, + "loss": 0.2495, + "step": 1659 + }, + { + "epoch": 0.3486296335188491, + "grad_norm": 0.7702248692512512, + "learning_rate": 0.00019167655180824404, + "loss": 0.1839, + "step": 1660 + }, + { + "epoch": 0.3488396513703665, + "grad_norm": 0.6671183705329895, + "learning_rate": 0.0001916626730535137, + "loss": 0.3184, + "step": 1661 + }, + { + "epoch": 0.34904966922188385, + "grad_norm": 0.6315720081329346, + "learning_rate": 0.00019164878324082074, + "loss": 0.204, + "step": 1662 + }, + { + "epoch": 0.3492596870734012, + "grad_norm": 0.7643944621086121, + "learning_rate": 0.00019163488237184084, + "loss": 0.1952, + "step": 1663 + }, + { + "epoch": 0.34946970492491863, + "grad_norm": 0.6874391436576843, + "learning_rate": 0.00019162097044825096, + "loss": 0.157, + "step": 1664 + }, + { + "epoch": 0.349679722776436, + "grad_norm": 0.7709307074546814, + "learning_rate": 0.00019160704747172934, + "loss": 0.3383, + "step": 1665 + }, + { + "epoch": 0.34988974062795336, + "grad_norm": 0.4958166182041168, + "learning_rate": 0.0001915931134439557, + "loss": 0.1265, + "step": 1666 + }, + { + "epoch": 0.3500997584794708, + "grad_norm": 0.7482470273971558, + "learning_rate": 0.00019157916836661095, + "loss": 0.2388, + "step": 1667 + }, + { + "epoch": 0.35030977633098814, + "grad_norm": 0.7006984353065491, + "learning_rate": 0.00019156521224137743, + "loss": 0.1407, + "step": 1668 + }, + { + "epoch": 0.3505197941825055, + "grad_norm": 0.9646068811416626, + "learning_rate": 0.00019155124506993874, + "loss": 0.2785, + "step": 1669 + }, + { + "epoch": 0.35072981203402287, + "grad_norm": 0.5841155052185059, + "learning_rate": 0.00019153726685397984, + "loss": 0.2303, + "step": 1670 + }, + { + "epoch": 0.3509398298855403, + "grad_norm": 0.8856375813484192, + "learning_rate": 0.00019152327759518705, + "loss": 0.2264, + "step": 1671 + }, + { + "epoch": 0.35114984773705765, + "grad_norm": 0.7846825122833252, + "learning_rate": 0.000191509277295248, + "loss": 0.2239, + "step": 1672 + }, + { + "epoch": 0.351359865588575, + "grad_norm": 1.0474859476089478, + "learning_rate": 0.00019149526595585163, + "loss": 0.3247, + "step": 1673 + }, + { + "epoch": 0.35156988344009243, + "grad_norm": 0.8523626923561096, + "learning_rate": 0.0001914812435786883, + "loss": 0.2623, + "step": 1674 + }, + { + "epoch": 0.3517799012916098, + "grad_norm": 0.8021546006202698, + "learning_rate": 0.00019146721016544954, + "loss": 0.1953, + "step": 1675 + }, + { + "epoch": 0.35198991914312716, + "grad_norm": 0.9674374461174011, + "learning_rate": 0.00019145316571782836, + "loss": 0.2123, + "step": 1676 + }, + { + "epoch": 0.3521999369946445, + "grad_norm": 0.7059686183929443, + "learning_rate": 0.00019143911023751907, + "loss": 0.2011, + "step": 1677 + }, + { + "epoch": 0.35240995484616194, + "grad_norm": 0.5109952092170715, + "learning_rate": 0.00019142504372621723, + "loss": 0.1854, + "step": 1678 + }, + { + "epoch": 0.3526199726976793, + "grad_norm": 0.6696085333824158, + "learning_rate": 0.00019141096618561983, + "loss": 0.3242, + "step": 1679 + }, + { + "epoch": 0.35282999054919667, + "grad_norm": 0.6039811968803406, + "learning_rate": 0.00019139687761742514, + "loss": 0.1479, + "step": 1680 + }, + { + "epoch": 0.35304000840071403, + "grad_norm": 0.7796909213066101, + "learning_rate": 0.00019138277802333278, + "loss": 0.2724, + "step": 1681 + }, + { + "epoch": 0.35325002625223145, + "grad_norm": 0.740599513053894, + "learning_rate": 0.00019136866740504367, + "loss": 0.2449, + "step": 1682 + }, + { + "epoch": 0.3534600441037488, + "grad_norm": 0.5968409180641174, + "learning_rate": 0.0001913545457642601, + "loss": 0.145, + "step": 1683 + }, + { + "epoch": 0.3536700619552662, + "grad_norm": 0.5513572096824646, + "learning_rate": 0.00019134041310268568, + "loss": 0.166, + "step": 1684 + }, + { + "epoch": 0.3538800798067836, + "grad_norm": 0.6873581409454346, + "learning_rate": 0.0001913262694220253, + "loss": 0.3052, + "step": 1685 + }, + { + "epoch": 0.35409009765830096, + "grad_norm": 0.7691472768783569, + "learning_rate": 0.00019131211472398524, + "loss": 0.1708, + "step": 1686 + }, + { + "epoch": 0.3543001155098183, + "grad_norm": 0.4396502673625946, + "learning_rate": 0.00019129794901027308, + "loss": 0.1501, + "step": 1687 + }, + { + "epoch": 0.3545101333613357, + "grad_norm": 0.5293841361999512, + "learning_rate": 0.0001912837722825978, + "loss": 0.2255, + "step": 1688 + }, + { + "epoch": 0.3547201512128531, + "grad_norm": 0.43543753027915955, + "learning_rate": 0.00019126958454266957, + "loss": 0.1803, + "step": 1689 + }, + { + "epoch": 0.35493016906437047, + "grad_norm": 0.6785498261451721, + "learning_rate": 0.00019125538579219998, + "loss": 0.2687, + "step": 1690 + }, + { + "epoch": 0.35514018691588783, + "grad_norm": 0.42633846402168274, + "learning_rate": 0.00019124117603290194, + "loss": 0.1214, + "step": 1691 + }, + { + "epoch": 0.35535020476740525, + "grad_norm": 0.5481752157211304, + "learning_rate": 0.00019122695526648968, + "loss": 0.2199, + "step": 1692 + }, + { + "epoch": 0.3555602226189226, + "grad_norm": 0.8807294964790344, + "learning_rate": 0.00019121272349467878, + "loss": 0.1997, + "step": 1693 + }, + { + "epoch": 0.35577024047044, + "grad_norm": 0.8869441151618958, + "learning_rate": 0.0001911984807191861, + "loss": 0.1732, + "step": 1694 + }, + { + "epoch": 0.35598025832195734, + "grad_norm": 0.5254248976707458, + "learning_rate": 0.00019118422694172987, + "loss": 0.1422, + "step": 1695 + }, + { + "epoch": 0.35619027617347476, + "grad_norm": 0.5750237107276917, + "learning_rate": 0.00019116996216402961, + "loss": 0.1809, + "step": 1696 + }, + { + "epoch": 0.3564002940249921, + "grad_norm": 0.5350000262260437, + "learning_rate": 0.00019115568638780622, + "loss": 0.2012, + "step": 1697 + }, + { + "epoch": 0.3566103118765095, + "grad_norm": 0.7510976195335388, + "learning_rate": 0.00019114139961478186, + "loss": 0.3177, + "step": 1698 + }, + { + "epoch": 0.3568203297280269, + "grad_norm": 0.8712655901908875, + "learning_rate": 0.00019112710184668007, + "loss": 0.3399, + "step": 1699 + }, + { + "epoch": 0.35703034757954427, + "grad_norm": 0.4693390130996704, + "learning_rate": 0.0001911127930852257, + "loss": 0.1943, + "step": 1700 + }, + { + "epoch": 0.35724036543106163, + "grad_norm": 0.4523710012435913, + "learning_rate": 0.0001910984733321449, + "loss": 0.2226, + "step": 1701 + }, + { + "epoch": 0.357450383282579, + "grad_norm": 0.5822139978408813, + "learning_rate": 0.00019108414258916522, + "loss": 0.2993, + "step": 1702 + }, + { + "epoch": 0.3576604011340964, + "grad_norm": 0.5016921758651733, + "learning_rate": 0.00019106980085801544, + "loss": 0.1991, + "step": 1703 + }, + { + "epoch": 0.3578704189856138, + "grad_norm": 0.45039233565330505, + "learning_rate": 0.00019105544814042576, + "loss": 0.1539, + "step": 1704 + }, + { + "epoch": 0.35808043683713114, + "grad_norm": 0.5696535110473633, + "learning_rate": 0.00019104108443812758, + "loss": 0.2083, + "step": 1705 + }, + { + "epoch": 0.3582904546886485, + "grad_norm": 0.46891167759895325, + "learning_rate": 0.0001910267097528538, + "loss": 0.2401, + "step": 1706 + }, + { + "epoch": 0.3585004725401659, + "grad_norm": 0.5347768068313599, + "learning_rate": 0.00019101232408633845, + "loss": 0.2143, + "step": 1707 + }, + { + "epoch": 0.3587104903916833, + "grad_norm": 0.49566882848739624, + "learning_rate": 0.00019099792744031705, + "loss": 0.2299, + "step": 1708 + }, + { + "epoch": 0.35892050824320065, + "grad_norm": 0.39414703845977783, + "learning_rate": 0.00019098351981652634, + "loss": 0.2156, + "step": 1709 + }, + { + "epoch": 0.35913052609471807, + "grad_norm": 0.5206068754196167, + "learning_rate": 0.00019096910121670443, + "loss": 0.1837, + "step": 1710 + }, + { + "epoch": 0.35934054394623544, + "grad_norm": 0.527384340763092, + "learning_rate": 0.0001909546716425908, + "loss": 0.2735, + "step": 1711 + }, + { + "epoch": 0.3595505617977528, + "grad_norm": 0.5682224631309509, + "learning_rate": 0.00019094023109592607, + "loss": 0.1809, + "step": 1712 + }, + { + "epoch": 0.35976057964927016, + "grad_norm": 0.7277165055274963, + "learning_rate": 0.00019092577957845243, + "loss": 0.2018, + "step": 1713 + }, + { + "epoch": 0.3599705975007876, + "grad_norm": 0.5524745583534241, + "learning_rate": 0.00019091131709191324, + "loss": 0.222, + "step": 1714 + }, + { + "epoch": 0.36018061535230494, + "grad_norm": 0.7268317341804504, + "learning_rate": 0.0001908968436380532, + "loss": 0.2371, + "step": 1715 + }, + { + "epoch": 0.3603906332038223, + "grad_norm": 0.8419438600540161, + "learning_rate": 0.00019088235921861839, + "loss": 0.3045, + "step": 1716 + }, + { + "epoch": 0.3606006510553397, + "grad_norm": 0.6292913556098938, + "learning_rate": 0.00019086786383535614, + "loss": 0.1484, + "step": 1717 + }, + { + "epoch": 0.3608106689068571, + "grad_norm": 0.6693284511566162, + "learning_rate": 0.00019085335749001515, + "loss": 0.2263, + "step": 1718 + }, + { + "epoch": 0.36102068675837445, + "grad_norm": 1.0132428407669067, + "learning_rate": 0.00019083884018434547, + "loss": 0.2318, + "step": 1719 + }, + { + "epoch": 0.3612307046098918, + "grad_norm": 0.4503970146179199, + "learning_rate": 0.00019082431192009834, + "loss": 0.2717, + "step": 1720 + }, + { + "epoch": 0.36144072246140924, + "grad_norm": 0.5765817165374756, + "learning_rate": 0.0001908097726990265, + "loss": 0.1652, + "step": 1721 + }, + { + "epoch": 0.3616507403129266, + "grad_norm": 0.6819046139717102, + "learning_rate": 0.00019079522252288386, + "loss": 0.2853, + "step": 1722 + }, + { + "epoch": 0.36186075816444396, + "grad_norm": 0.7787238359451294, + "learning_rate": 0.0001907806613934258, + "loss": 0.2279, + "step": 1723 + }, + { + "epoch": 0.3620707760159614, + "grad_norm": 0.6114417314529419, + "learning_rate": 0.00019076608931240885, + "loss": 0.1959, + "step": 1724 + }, + { + "epoch": 0.36228079386747875, + "grad_norm": 0.7302160263061523, + "learning_rate": 0.00019075150628159105, + "loss": 0.2769, + "step": 1725 + }, + { + "epoch": 0.3624908117189961, + "grad_norm": 0.5485323667526245, + "learning_rate": 0.00019073691230273154, + "loss": 0.1907, + "step": 1726 + }, + { + "epoch": 0.36270082957051347, + "grad_norm": 0.4471105635166168, + "learning_rate": 0.000190722307377591, + "loss": 0.1456, + "step": 1727 + }, + { + "epoch": 0.3629108474220309, + "grad_norm": 0.5556613206863403, + "learning_rate": 0.0001907076915079313, + "loss": 0.1888, + "step": 1728 + }, + { + "epoch": 0.36312086527354825, + "grad_norm": 0.4751920998096466, + "learning_rate": 0.00019069306469551565, + "loss": 0.1423, + "step": 1729 + }, + { + "epoch": 0.3633308831250656, + "grad_norm": 0.6774847507476807, + "learning_rate": 0.0001906784269421086, + "loss": 0.2371, + "step": 1730 + }, + { + "epoch": 0.36354090097658304, + "grad_norm": 0.5875648260116577, + "learning_rate": 0.00019066377824947605, + "loss": 0.1678, + "step": 1731 + }, + { + "epoch": 0.3637509188281004, + "grad_norm": 0.864040732383728, + "learning_rate": 0.00019064911861938513, + "loss": 0.247, + "step": 1732 + }, + { + "epoch": 0.36396093667961776, + "grad_norm": 0.528397798538208, + "learning_rate": 0.00019063444805360438, + "loss": 0.2097, + "step": 1733 + }, + { + "epoch": 0.3641709545311351, + "grad_norm": 0.5226781368255615, + "learning_rate": 0.00019061976655390358, + "loss": 0.187, + "step": 1734 + }, + { + "epoch": 0.36438097238265255, + "grad_norm": 0.6590418219566345, + "learning_rate": 0.0001906050741220539, + "loss": 0.2919, + "step": 1735 + }, + { + "epoch": 0.3645909902341699, + "grad_norm": 0.6287481784820557, + "learning_rate": 0.00019059037075982782, + "loss": 0.1835, + "step": 1736 + }, + { + "epoch": 0.3648010080856873, + "grad_norm": 0.44010448455810547, + "learning_rate": 0.00019057565646899907, + "loss": 0.1346, + "step": 1737 + }, + { + "epoch": 0.36501102593720464, + "grad_norm": 0.44869789481163025, + "learning_rate": 0.00019056093125134277, + "loss": 0.117, + "step": 1738 + }, + { + "epoch": 0.36522104378872206, + "grad_norm": 0.5611539483070374, + "learning_rate": 0.00019054619510863534, + "loss": 0.1521, + "step": 1739 + }, + { + "epoch": 0.3654310616402394, + "grad_norm": 0.7520772218704224, + "learning_rate": 0.00019053144804265451, + "loss": 0.2429, + "step": 1740 + }, + { + "epoch": 0.3656410794917568, + "grad_norm": 0.6511863470077515, + "learning_rate": 0.00019051669005517932, + "loss": 0.1681, + "step": 1741 + }, + { + "epoch": 0.3658510973432742, + "grad_norm": 0.5798068046569824, + "learning_rate": 0.00019050192114799014, + "loss": 0.1843, + "step": 1742 + }, + { + "epoch": 0.36606111519479156, + "grad_norm": 1.0162572860717773, + "learning_rate": 0.0001904871413228687, + "loss": 0.1762, + "step": 1743 + }, + { + "epoch": 0.3662711330463089, + "grad_norm": 0.6126002073287964, + "learning_rate": 0.00019047235058159792, + "loss": 0.2216, + "step": 1744 + }, + { + "epoch": 0.3664811508978263, + "grad_norm": 0.593628466129303, + "learning_rate": 0.00019045754892596216, + "loss": 0.1877, + "step": 1745 + }, + { + "epoch": 0.3666911687493437, + "grad_norm": 0.8319538831710815, + "learning_rate": 0.00019044273635774705, + "loss": 0.2641, + "step": 1746 + }, + { + "epoch": 0.3669011866008611, + "grad_norm": 0.42954209446907043, + "learning_rate": 0.00019042791287873957, + "loss": 0.139, + "step": 1747 + }, + { + "epoch": 0.36711120445237844, + "grad_norm": 0.5830192565917969, + "learning_rate": 0.00019041307849072798, + "loss": 0.1926, + "step": 1748 + }, + { + "epoch": 0.36732122230389586, + "grad_norm": 0.7822489142417908, + "learning_rate": 0.00019039823319550182, + "loss": 0.2662, + "step": 1749 + }, + { + "epoch": 0.3675312401554132, + "grad_norm": 0.6861689686775208, + "learning_rate": 0.00019038337699485208, + "loss": 0.1516, + "step": 1750 + }, + { + "epoch": 0.3677412580069306, + "grad_norm": 0.5381961464881897, + "learning_rate": 0.00019036850989057088, + "loss": 0.2698, + "step": 1751 + }, + { + "epoch": 0.36795127585844795, + "grad_norm": 0.5581590533256531, + "learning_rate": 0.00019035363188445178, + "loss": 0.2409, + "step": 1752 + }, + { + "epoch": 0.36816129370996536, + "grad_norm": 0.5967221260070801, + "learning_rate": 0.00019033874297828964, + "loss": 0.2265, + "step": 1753 + }, + { + "epoch": 0.36837131156148273, + "grad_norm": 0.5475670695304871, + "learning_rate": 0.00019032384317388062, + "loss": 0.2678, + "step": 1754 + }, + { + "epoch": 0.3685813294130001, + "grad_norm": 0.8465176224708557, + "learning_rate": 0.0001903089324730222, + "loss": 0.3007, + "step": 1755 + }, + { + "epoch": 0.3687913472645175, + "grad_norm": 0.5701761841773987, + "learning_rate": 0.00019029401087751317, + "loss": 0.3469, + "step": 1756 + }, + { + "epoch": 0.3690013651160349, + "grad_norm": 0.47910815477371216, + "learning_rate": 0.00019027907838915363, + "loss": 0.1803, + "step": 1757 + }, + { + "epoch": 0.36921138296755224, + "grad_norm": 0.5992498397827148, + "learning_rate": 0.000190264135009745, + "loss": 0.303, + "step": 1758 + }, + { + "epoch": 0.3694214008190696, + "grad_norm": 0.662324845790863, + "learning_rate": 0.00019024918074109, + "loss": 0.3612, + "step": 1759 + }, + { + "epoch": 0.369631418670587, + "grad_norm": 0.6369922161102295, + "learning_rate": 0.00019023421558499274, + "loss": 0.174, + "step": 1760 + }, + { + "epoch": 0.3698414365221044, + "grad_norm": 0.5556310415267944, + "learning_rate": 0.00019021923954325845, + "loss": 0.1904, + "step": 1761 + }, + { + "epoch": 0.37005145437362175, + "grad_norm": 0.6958837509155273, + "learning_rate": 0.00019020425261769393, + "loss": 0.3308, + "step": 1762 + }, + { + "epoch": 0.3702614722251391, + "grad_norm": 0.6934865117073059, + "learning_rate": 0.00019018925481010713, + "loss": 0.2886, + "step": 1763 + }, + { + "epoch": 0.37047149007665653, + "grad_norm": 0.6797325015068054, + "learning_rate": 0.0001901742461223073, + "loss": 0.2314, + "step": 1764 + }, + { + "epoch": 0.3706815079281739, + "grad_norm": 0.680874764919281, + "learning_rate": 0.00019015922655610511, + "loss": 0.252, + "step": 1765 + }, + { + "epoch": 0.37089152577969126, + "grad_norm": 0.6662421822547913, + "learning_rate": 0.0001901441961133125, + "loss": 0.2375, + "step": 1766 + }, + { + "epoch": 0.3711015436312087, + "grad_norm": 0.6213268637657166, + "learning_rate": 0.00019012915479574264, + "loss": 0.2062, + "step": 1767 + }, + { + "epoch": 0.37131156148272604, + "grad_norm": 0.523102343082428, + "learning_rate": 0.00019011410260521007, + "loss": 0.1552, + "step": 1768 + }, + { + "epoch": 0.3715215793342434, + "grad_norm": 0.6789844036102295, + "learning_rate": 0.00019009903954353075, + "loss": 0.2338, + "step": 1769 + }, + { + "epoch": 0.37173159718576076, + "grad_norm": 0.5860502123832703, + "learning_rate": 0.00019008396561252173, + "loss": 0.2464, + "step": 1770 + }, + { + "epoch": 0.3719416150372782, + "grad_norm": 0.6294893622398376, + "learning_rate": 0.0001900688808140016, + "loss": 0.2327, + "step": 1771 + }, + { + "epoch": 0.37215163288879555, + "grad_norm": 0.6527291536331177, + "learning_rate": 0.00019005378514979008, + "loss": 0.1679, + "step": 1772 + }, + { + "epoch": 0.3723616507403129, + "grad_norm": 0.6352584958076477, + "learning_rate": 0.00019003867862170832, + "loss": 0.2133, + "step": 1773 + }, + { + "epoch": 0.37257166859183033, + "grad_norm": 0.3638416528701782, + "learning_rate": 0.0001900235612315787, + "loss": 0.1247, + "step": 1774 + }, + { + "epoch": 0.3727816864433477, + "grad_norm": 0.5455829501152039, + "learning_rate": 0.00019000843298122496, + "loss": 0.1433, + "step": 1775 + }, + { + "epoch": 0.37299170429486506, + "grad_norm": 0.7970212697982788, + "learning_rate": 0.00018999329387247216, + "loss": 0.2801, + "step": 1776 + }, + { + "epoch": 0.3732017221463824, + "grad_norm": 0.6363832354545593, + "learning_rate": 0.00018997814390714658, + "loss": 0.2533, + "step": 1777 + }, + { + "epoch": 0.37341173999789984, + "grad_norm": 0.6059479713439941, + "learning_rate": 0.00018996298308707595, + "loss": 0.1884, + "step": 1778 + }, + { + "epoch": 0.3736217578494172, + "grad_norm": 0.6873182654380798, + "learning_rate": 0.0001899478114140892, + "loss": 0.1553, + "step": 1779 + }, + { + "epoch": 0.37383177570093457, + "grad_norm": 0.6038809418678284, + "learning_rate": 0.00018993262889001658, + "loss": 0.152, + "step": 1780 + }, + { + "epoch": 0.374041793552452, + "grad_norm": 0.685796320438385, + "learning_rate": 0.00018991743551668972, + "loss": 0.1444, + "step": 1781 + }, + { + "epoch": 0.37425181140396935, + "grad_norm": 0.7347593903541565, + "learning_rate": 0.00018990223129594148, + "loss": 0.2191, + "step": 1782 + }, + { + "epoch": 0.3744618292554867, + "grad_norm": 0.6126787066459656, + "learning_rate": 0.00018988701622960606, + "loss": 0.1944, + "step": 1783 + }, + { + "epoch": 0.3746718471070041, + "grad_norm": 0.5741068124771118, + "learning_rate": 0.000189871790319519, + "loss": 0.1662, + "step": 1784 + }, + { + "epoch": 0.3748818649585215, + "grad_norm": 0.7055535912513733, + "learning_rate": 0.00018985655356751707, + "loss": 0.1707, + "step": 1785 + }, + { + "epoch": 0.37509188281003886, + "grad_norm": 0.7289325594902039, + "learning_rate": 0.00018984130597543843, + "loss": 0.2198, + "step": 1786 + }, + { + "epoch": 0.3753019006615562, + "grad_norm": 0.3809838891029358, + "learning_rate": 0.0001898260475451225, + "loss": 0.124, + "step": 1787 + }, + { + "epoch": 0.3755119185130736, + "grad_norm": 0.6106488108634949, + "learning_rate": 0.00018981077827841003, + "loss": 0.177, + "step": 1788 + }, + { + "epoch": 0.375721936364591, + "grad_norm": 0.48331376910209656, + "learning_rate": 0.00018979549817714305, + "loss": 0.1342, + "step": 1789 + }, + { + "epoch": 0.37593195421610837, + "grad_norm": 0.7540026307106018, + "learning_rate": 0.00018978020724316492, + "loss": 0.2452, + "step": 1790 + }, + { + "epoch": 0.37614197206762573, + "grad_norm": 0.6960353851318359, + "learning_rate": 0.00018976490547832034, + "loss": 0.3285, + "step": 1791 + }, + { + "epoch": 0.37635198991914315, + "grad_norm": 0.5434857606887817, + "learning_rate": 0.00018974959288445522, + "loss": 0.1576, + "step": 1792 + }, + { + "epoch": 0.3765620077706605, + "grad_norm": 0.5403981804847717, + "learning_rate": 0.00018973426946341683, + "loss": 0.128, + "step": 1793 + }, + { + "epoch": 0.3767720256221779, + "grad_norm": 0.5323323011398315, + "learning_rate": 0.00018971893521705383, + "loss": 0.2257, + "step": 1794 + }, + { + "epoch": 0.37698204347369524, + "grad_norm": 0.8335232138633728, + "learning_rate": 0.000189703590147216, + "loss": 0.1963, + "step": 1795 + }, + { + "epoch": 0.37719206132521266, + "grad_norm": 0.655535101890564, + "learning_rate": 0.0001896882342557546, + "loss": 0.1997, + "step": 1796 + }, + { + "epoch": 0.37740207917673, + "grad_norm": 0.5367481112480164, + "learning_rate": 0.00018967286754452214, + "loss": 0.2294, + "step": 1797 + }, + { + "epoch": 0.3776120970282474, + "grad_norm": 0.9579022526741028, + "learning_rate": 0.00018965749001537238, + "loss": 0.2051, + "step": 1798 + }, + { + "epoch": 0.3778221148797648, + "grad_norm": 0.6223967671394348, + "learning_rate": 0.00018964210167016045, + "loss": 0.3153, + "step": 1799 + }, + { + "epoch": 0.37803213273128217, + "grad_norm": 0.7840321660041809, + "learning_rate": 0.00018962670251074275, + "loss": 0.2971, + "step": 1800 + }, + { + "epoch": 0.37824215058279953, + "grad_norm": 0.531063437461853, + "learning_rate": 0.000189611292538977, + "loss": 0.3475, + "step": 1801 + }, + { + "epoch": 0.3784521684343169, + "grad_norm": 0.5457683801651001, + "learning_rate": 0.00018959587175672223, + "loss": 0.2136, + "step": 1802 + }, + { + "epoch": 0.3786621862858343, + "grad_norm": 0.513944685459137, + "learning_rate": 0.00018958044016583877, + "loss": 0.1691, + "step": 1803 + }, + { + "epoch": 0.3788722041373517, + "grad_norm": 0.5060524344444275, + "learning_rate": 0.00018956499776818822, + "loss": 0.1869, + "step": 1804 + }, + { + "epoch": 0.37908222198886904, + "grad_norm": 0.6220847964286804, + "learning_rate": 0.00018954954456563356, + "loss": 0.2979, + "step": 1805 + }, + { + "epoch": 0.37929223984038646, + "grad_norm": 0.515264630317688, + "learning_rate": 0.000189534080560039, + "loss": 0.1507, + "step": 1806 + }, + { + "epoch": 0.3795022576919038, + "grad_norm": 0.4674825668334961, + "learning_rate": 0.00018951860575327006, + "loss": 0.1454, + "step": 1807 + }, + { + "epoch": 0.3797122755434212, + "grad_norm": 0.5598218441009521, + "learning_rate": 0.0001895031201471936, + "loss": 0.2971, + "step": 1808 + }, + { + "epoch": 0.37992229339493855, + "grad_norm": 0.42762941122055054, + "learning_rate": 0.00018948762374367778, + "loss": 0.163, + "step": 1809 + }, + { + "epoch": 0.38013231124645597, + "grad_norm": 0.5744110345840454, + "learning_rate": 0.00018947211654459208, + "loss": 0.2604, + "step": 1810 + }, + { + "epoch": 0.38034232909797333, + "grad_norm": 0.5260441303253174, + "learning_rate": 0.00018945659855180714, + "loss": 0.1798, + "step": 1811 + }, + { + "epoch": 0.3805523469494907, + "grad_norm": 0.5875288844108582, + "learning_rate": 0.00018944106976719513, + "loss": 0.231, + "step": 1812 + }, + { + "epoch": 0.3807623648010081, + "grad_norm": 0.8182070851325989, + "learning_rate": 0.00018942553019262937, + "loss": 0.2722, + "step": 1813 + }, + { + "epoch": 0.3809723826525255, + "grad_norm": 0.5589549541473389, + "learning_rate": 0.00018940997982998446, + "loss": 0.2102, + "step": 1814 + }, + { + "epoch": 0.38118240050404284, + "grad_norm": 0.7185777425765991, + "learning_rate": 0.0001893944186811364, + "loss": 0.1846, + "step": 1815 + }, + { + "epoch": 0.3813924183555602, + "grad_norm": 0.9046144485473633, + "learning_rate": 0.0001893788467479625, + "loss": 0.2614, + "step": 1816 + }, + { + "epoch": 0.3816024362070776, + "grad_norm": 0.5875299572944641, + "learning_rate": 0.00018936326403234125, + "loss": 0.3383, + "step": 1817 + }, + { + "epoch": 0.381812454058595, + "grad_norm": 0.7503211498260498, + "learning_rate": 0.0001893476705361525, + "loss": 0.1901, + "step": 1818 + }, + { + "epoch": 0.38202247191011235, + "grad_norm": 0.6497657895088196, + "learning_rate": 0.00018933206626127748, + "loss": 0.2323, + "step": 1819 + }, + { + "epoch": 0.3822324897616297, + "grad_norm": 0.6288959383964539, + "learning_rate": 0.00018931645120959863, + "loss": 0.2134, + "step": 1820 + }, + { + "epoch": 0.38244250761314713, + "grad_norm": 0.6646990776062012, + "learning_rate": 0.00018930082538299968, + "loss": 0.1965, + "step": 1821 + }, + { + "epoch": 0.3826525254646645, + "grad_norm": 0.741087794303894, + "learning_rate": 0.0001892851887833657, + "loss": 0.2197, + "step": 1822 + }, + { + "epoch": 0.38286254331618186, + "grad_norm": 0.9484357833862305, + "learning_rate": 0.00018926954141258305, + "loss": 0.2848, + "step": 1823 + }, + { + "epoch": 0.3830725611676993, + "grad_norm": 0.4895227551460266, + "learning_rate": 0.00018925388327253943, + "loss": 0.1414, + "step": 1824 + }, + { + "epoch": 0.38328257901921664, + "grad_norm": 0.41706594824790955, + "learning_rate": 0.00018923821436512376, + "loss": 0.1251, + "step": 1825 + }, + { + "epoch": 0.383492596870734, + "grad_norm": 0.7507045269012451, + "learning_rate": 0.00018922253469222633, + "loss": 0.2518, + "step": 1826 + }, + { + "epoch": 0.38370261472225137, + "grad_norm": 0.521273672580719, + "learning_rate": 0.00018920684425573865, + "loss": 0.2102, + "step": 1827 + }, + { + "epoch": 0.3839126325737688, + "grad_norm": 0.7842849493026733, + "learning_rate": 0.00018919114305755364, + "loss": 0.208, + "step": 1828 + }, + { + "epoch": 0.38412265042528615, + "grad_norm": 0.39324215054512024, + "learning_rate": 0.0001891754310995654, + "loss": 0.1627, + "step": 1829 + }, + { + "epoch": 0.3843326682768035, + "grad_norm": 0.4770127534866333, + "learning_rate": 0.0001891597083836694, + "loss": 0.1703, + "step": 1830 + }, + { + "epoch": 0.38454268612832093, + "grad_norm": 0.6963898539543152, + "learning_rate": 0.00018914397491176242, + "loss": 0.2574, + "step": 1831 + }, + { + "epoch": 0.3847527039798383, + "grad_norm": 0.42212000489234924, + "learning_rate": 0.00018912823068574242, + "loss": 0.1968, + "step": 1832 + }, + { + "epoch": 0.38496272183135566, + "grad_norm": 0.695845365524292, + "learning_rate": 0.00018911247570750885, + "loss": 0.1894, + "step": 1833 + }, + { + "epoch": 0.385172739682873, + "grad_norm": 0.7652266621589661, + "learning_rate": 0.00018909670997896232, + "loss": 0.273, + "step": 1834 + }, + { + "epoch": 0.38538275753439044, + "grad_norm": 0.4933221638202667, + "learning_rate": 0.00018908093350200473, + "loss": 0.1563, + "step": 1835 + }, + { + "epoch": 0.3855927753859078, + "grad_norm": 0.6543354392051697, + "learning_rate": 0.00018906514627853936, + "loss": 0.1746, + "step": 1836 + }, + { + "epoch": 0.38580279323742517, + "grad_norm": 0.7568399310112, + "learning_rate": 0.00018904934831047072, + "loss": 0.2761, + "step": 1837 + }, + { + "epoch": 0.3860128110889426, + "grad_norm": 0.7224242091178894, + "learning_rate": 0.00018903353959970462, + "loss": 0.2429, + "step": 1838 + }, + { + "epoch": 0.38622282894045995, + "grad_norm": 0.9286659359931946, + "learning_rate": 0.00018901772014814824, + "loss": 0.2213, + "step": 1839 + }, + { + "epoch": 0.3864328467919773, + "grad_norm": 0.35347288846969604, + "learning_rate": 0.00018900188995770996, + "loss": 0.1102, + "step": 1840 + }, + { + "epoch": 0.3866428646434947, + "grad_norm": 0.5769889950752258, + "learning_rate": 0.00018898604903029952, + "loss": 0.2866, + "step": 1841 + }, + { + "epoch": 0.3868528824950121, + "grad_norm": 0.8280730843544006, + "learning_rate": 0.0001889701973678279, + "loss": 0.135, + "step": 1842 + }, + { + "epoch": 0.38706290034652946, + "grad_norm": 0.6738591194152832, + "learning_rate": 0.00018895433497220744, + "loss": 0.2106, + "step": 1843 + }, + { + "epoch": 0.3872729181980468, + "grad_norm": 0.5238274335861206, + "learning_rate": 0.0001889384618453517, + "loss": 0.1742, + "step": 1844 + }, + { + "epoch": 0.3874829360495642, + "grad_norm": 0.4777994155883789, + "learning_rate": 0.0001889225779891756, + "loss": 0.2191, + "step": 1845 + }, + { + "epoch": 0.3876929539010816, + "grad_norm": 0.7966942191123962, + "learning_rate": 0.00018890668340559535, + "loss": 0.2583, + "step": 1846 + }, + { + "epoch": 0.38790297175259897, + "grad_norm": 0.7737426161766052, + "learning_rate": 0.0001888907780965284, + "loss": 0.225, + "step": 1847 + }, + { + "epoch": 0.38811298960411633, + "grad_norm": 0.712505578994751, + "learning_rate": 0.0001888748620638935, + "loss": 0.2436, + "step": 1848 + }, + { + "epoch": 0.38832300745563375, + "grad_norm": 0.5878315567970276, + "learning_rate": 0.00018885893530961082, + "loss": 0.1689, + "step": 1849 + }, + { + "epoch": 0.3885330253071511, + "grad_norm": 0.8706987500190735, + "learning_rate": 0.00018884299783560165, + "loss": 0.3054, + "step": 1850 + }, + { + "epoch": 0.3887430431586685, + "grad_norm": 0.5303587913513184, + "learning_rate": 0.00018882704964378867, + "loss": 0.2341, + "step": 1851 + }, + { + "epoch": 0.38895306101018584, + "grad_norm": 0.5766577124595642, + "learning_rate": 0.00018881109073609582, + "loss": 0.1952, + "step": 1852 + }, + { + "epoch": 0.38916307886170326, + "grad_norm": 0.5528069138526917, + "learning_rate": 0.00018879512111444834, + "loss": 0.2278, + "step": 1853 + }, + { + "epoch": 0.3893730967132206, + "grad_norm": 0.7019896507263184, + "learning_rate": 0.0001887791407807728, + "loss": 0.2488, + "step": 1854 + }, + { + "epoch": 0.389583114564738, + "grad_norm": 0.6992369294166565, + "learning_rate": 0.000188763149736997, + "loss": 0.3551, + "step": 1855 + }, + { + "epoch": 0.3897931324162554, + "grad_norm": 0.5621406435966492, + "learning_rate": 0.00018874714798505004, + "loss": 0.2692, + "step": 1856 + }, + { + "epoch": 0.39000315026777277, + "grad_norm": 0.7229904532432556, + "learning_rate": 0.0001887311355268624, + "loss": 0.3149, + "step": 1857 + }, + { + "epoch": 0.39021316811929013, + "grad_norm": 0.7131726741790771, + "learning_rate": 0.0001887151123643657, + "loss": 0.2333, + "step": 1858 + }, + { + "epoch": 0.3904231859708075, + "grad_norm": 0.8763433694839478, + "learning_rate": 0.000188699078499493, + "loss": 0.3252, + "step": 1859 + }, + { + "epoch": 0.3906332038223249, + "grad_norm": 0.7169297337532043, + "learning_rate": 0.00018868303393417856, + "loss": 0.3808, + "step": 1860 + }, + { + "epoch": 0.3908432216738423, + "grad_norm": 0.47738391160964966, + "learning_rate": 0.00018866697867035796, + "loss": 0.2403, + "step": 1861 + }, + { + "epoch": 0.39105323952535964, + "grad_norm": 0.6296969056129456, + "learning_rate": 0.00018865091270996807, + "loss": 0.2048, + "step": 1862 + }, + { + "epoch": 0.39126325737687706, + "grad_norm": 0.5875237584114075, + "learning_rate": 0.00018863483605494709, + "loss": 0.175, + "step": 1863 + }, + { + "epoch": 0.3914732752283944, + "grad_norm": 0.5673640370368958, + "learning_rate": 0.00018861874870723438, + "loss": 0.1633, + "step": 1864 + }, + { + "epoch": 0.3916832930799118, + "grad_norm": 0.6922441720962524, + "learning_rate": 0.00018860265066877074, + "loss": 0.263, + "step": 1865 + }, + { + "epoch": 0.39189331093142915, + "grad_norm": 0.5726366639137268, + "learning_rate": 0.0001885865419414982, + "loss": 0.2623, + "step": 1866 + }, + { + "epoch": 0.39210332878294657, + "grad_norm": 0.5620572566986084, + "learning_rate": 0.00018857042252736004, + "loss": 0.2096, + "step": 1867 + }, + { + "epoch": 0.39231334663446393, + "grad_norm": 0.5184766054153442, + "learning_rate": 0.0001885542924283009, + "loss": 0.1827, + "step": 1868 + }, + { + "epoch": 0.3925233644859813, + "grad_norm": 0.41842037439346313, + "learning_rate": 0.00018853815164626667, + "loss": 0.1194, + "step": 1869 + }, + { + "epoch": 0.39273338233749866, + "grad_norm": 0.7123748064041138, + "learning_rate": 0.00018852200018320452, + "loss": 0.2417, + "step": 1870 + }, + { + "epoch": 0.3929434001890161, + "grad_norm": 0.6199332475662231, + "learning_rate": 0.00018850583804106292, + "loss": 0.1418, + "step": 1871 + }, + { + "epoch": 0.39315341804053344, + "grad_norm": 0.6634207963943481, + "learning_rate": 0.00018848966522179168, + "loss": 0.3139, + "step": 1872 + }, + { + "epoch": 0.3933634358920508, + "grad_norm": 0.5441175699234009, + "learning_rate": 0.00018847348172734178, + "loss": 0.1824, + "step": 1873 + }, + { + "epoch": 0.3935734537435682, + "grad_norm": 0.794145405292511, + "learning_rate": 0.00018845728755966564, + "loss": 0.2092, + "step": 1874 + }, + { + "epoch": 0.3937834715950856, + "grad_norm": 0.43816208839416504, + "learning_rate": 0.0001884410827207168, + "loss": 0.1461, + "step": 1875 + }, + { + "epoch": 0.39399348944660295, + "grad_norm": 0.5776886343955994, + "learning_rate": 0.00018842486721245023, + "loss": 0.1782, + "step": 1876 + }, + { + "epoch": 0.3942035072981203, + "grad_norm": 0.5448632836341858, + "learning_rate": 0.00018840864103682208, + "loss": 0.1693, + "step": 1877 + }, + { + "epoch": 0.39441352514963773, + "grad_norm": 0.9205145239830017, + "learning_rate": 0.00018839240419578988, + "loss": 0.2517, + "step": 1878 + }, + { + "epoch": 0.3946235430011551, + "grad_norm": 0.6252496838569641, + "learning_rate": 0.00018837615669131238, + "loss": 0.2044, + "step": 1879 + }, + { + "epoch": 0.39483356085267246, + "grad_norm": 0.7013902068138123, + "learning_rate": 0.00018835989852534967, + "loss": 0.2082, + "step": 1880 + }, + { + "epoch": 0.3950435787041899, + "grad_norm": 0.7002107501029968, + "learning_rate": 0.00018834362969986308, + "loss": 0.2845, + "step": 1881 + }, + { + "epoch": 0.39525359655570724, + "grad_norm": 0.5892364382743835, + "learning_rate": 0.00018832735021681523, + "loss": 0.2595, + "step": 1882 + }, + { + "epoch": 0.3954636144072246, + "grad_norm": 0.6822951436042786, + "learning_rate": 0.00018831106007817004, + "loss": 0.2269, + "step": 1883 + }, + { + "epoch": 0.39567363225874197, + "grad_norm": 0.5758324265480042, + "learning_rate": 0.00018829475928589271, + "loss": 0.1774, + "step": 1884 + }, + { + "epoch": 0.3958836501102594, + "grad_norm": 0.6938642859458923, + "learning_rate": 0.00018827844784194975, + "loss": 0.2571, + "step": 1885 + }, + { + "epoch": 0.39609366796177675, + "grad_norm": 0.8935677409172058, + "learning_rate": 0.0001882621257483089, + "loss": 0.221, + "step": 1886 + }, + { + "epoch": 0.3963036858132941, + "grad_norm": 0.6208258271217346, + "learning_rate": 0.00018824579300693922, + "loss": 0.3372, + "step": 1887 + }, + { + "epoch": 0.39651370366481153, + "grad_norm": 0.6998998522758484, + "learning_rate": 0.0001882294496198111, + "loss": 0.2326, + "step": 1888 + }, + { + "epoch": 0.3967237215163289, + "grad_norm": 1.004351019859314, + "learning_rate": 0.0001882130955888961, + "loss": 0.2532, + "step": 1889 + }, + { + "epoch": 0.39693373936784626, + "grad_norm": 0.8813514113426208, + "learning_rate": 0.0001881967309161672, + "loss": 0.2859, + "step": 1890 + }, + { + "epoch": 0.3971437572193636, + "grad_norm": 0.48337891697883606, + "learning_rate": 0.00018818035560359855, + "loss": 0.1619, + "step": 1891 + }, + { + "epoch": 0.39735377507088104, + "grad_norm": 0.5266842246055603, + "learning_rate": 0.00018816396965316563, + "loss": 0.1676, + "step": 1892 + }, + { + "epoch": 0.3975637929223984, + "grad_norm": 0.48444923758506775, + "learning_rate": 0.00018814757306684522, + "loss": 0.149, + "step": 1893 + }, + { + "epoch": 0.39777381077391577, + "grad_norm": 0.651096522808075, + "learning_rate": 0.00018813116584661535, + "loss": 0.1735, + "step": 1894 + }, + { + "epoch": 0.3979838286254332, + "grad_norm": 0.5188713669776917, + "learning_rate": 0.00018811474799445535, + "loss": 0.2133, + "step": 1895 + }, + { + "epoch": 0.39819384647695055, + "grad_norm": 0.376615971326828, + "learning_rate": 0.00018809831951234582, + "loss": 0.1414, + "step": 1896 + }, + { + "epoch": 0.3984038643284679, + "grad_norm": 0.38462603092193604, + "learning_rate": 0.00018808188040226868, + "loss": 0.1363, + "step": 1897 + }, + { + "epoch": 0.3986138821799853, + "grad_norm": 0.6370077133178711, + "learning_rate": 0.00018806543066620708, + "loss": 0.1403, + "step": 1898 + }, + { + "epoch": 0.3988239000315027, + "grad_norm": 0.6808513402938843, + "learning_rate": 0.00018804897030614548, + "loss": 0.2621, + "step": 1899 + }, + { + "epoch": 0.39903391788302006, + "grad_norm": 0.4564354121685028, + "learning_rate": 0.00018803249932406962, + "loss": 0.1761, + "step": 1900 + }, + { + "epoch": 0.3992439357345374, + "grad_norm": 0.46451982855796814, + "learning_rate": 0.0001880160177219665, + "loss": 0.1802, + "step": 1901 + }, + { + "epoch": 0.3994539535860548, + "grad_norm": 0.46525609493255615, + "learning_rate": 0.00018799952550182446, + "loss": 0.2133, + "step": 1902 + }, + { + "epoch": 0.3996639714375722, + "grad_norm": 0.49694451689720154, + "learning_rate": 0.0001879830226656331, + "loss": 0.1612, + "step": 1903 + }, + { + "epoch": 0.39987398928908957, + "grad_norm": 0.7554435133934021, + "learning_rate": 0.00018796650921538318, + "loss": 0.4251, + "step": 1904 + }, + { + "epoch": 0.40008400714060693, + "grad_norm": 0.5206797122955322, + "learning_rate": 0.00018794998515306692, + "loss": 0.1765, + "step": 1905 + }, + { + "epoch": 0.40029402499212435, + "grad_norm": 0.5828625559806824, + "learning_rate": 0.00018793345048067774, + "loss": 0.278, + "step": 1906 + }, + { + "epoch": 0.4005040428436417, + "grad_norm": 0.48372307419776917, + "learning_rate": 0.00018791690520021036, + "loss": 0.1755, + "step": 1907 + }, + { + "epoch": 0.4007140606951591, + "grad_norm": 0.70604008436203, + "learning_rate": 0.00018790034931366072, + "loss": 0.2469, + "step": 1908 + }, + { + "epoch": 0.40092407854667644, + "grad_norm": 0.6227346658706665, + "learning_rate": 0.00018788378282302606, + "loss": 0.242, + "step": 1909 + }, + { + "epoch": 0.40113409639819386, + "grad_norm": 0.6886958479881287, + "learning_rate": 0.000187867205730305, + "loss": 0.1898, + "step": 1910 + }, + { + "epoch": 0.4013441142497112, + "grad_norm": 0.6775689125061035, + "learning_rate": 0.0001878506180374973, + "loss": 0.182, + "step": 1911 + }, + { + "epoch": 0.4015541321012286, + "grad_norm": 0.7775040864944458, + "learning_rate": 0.0001878340197466041, + "loss": 0.2411, + "step": 1912 + }, + { + "epoch": 0.401764149952746, + "grad_norm": 0.7616844177246094, + "learning_rate": 0.00018781741085962774, + "loss": 0.3289, + "step": 1913 + }, + { + "epoch": 0.40197416780426337, + "grad_norm": 0.7016698718070984, + "learning_rate": 0.0001878007913785719, + "loss": 0.1935, + "step": 1914 + }, + { + "epoch": 0.40218418565578073, + "grad_norm": 0.5730949640274048, + "learning_rate": 0.0001877841613054415, + "loss": 0.1898, + "step": 1915 + }, + { + "epoch": 0.4023942035072981, + "grad_norm": 0.7912655472755432, + "learning_rate": 0.00018776752064224273, + "loss": 0.2357, + "step": 1916 + }, + { + "epoch": 0.4026042213588155, + "grad_norm": 0.6684666872024536, + "learning_rate": 0.00018775086939098312, + "loss": 0.1697, + "step": 1917 + }, + { + "epoch": 0.4028142392103329, + "grad_norm": 0.7139754891395569, + "learning_rate": 0.00018773420755367144, + "loss": 0.3489, + "step": 1918 + }, + { + "epoch": 0.40302425706185024, + "grad_norm": 0.8503203392028809, + "learning_rate": 0.00018771753513231772, + "loss": 0.2465, + "step": 1919 + }, + { + "epoch": 0.40323427491336766, + "grad_norm": 0.7805736660957336, + "learning_rate": 0.00018770085212893326, + "loss": 0.1745, + "step": 1920 + }, + { + "epoch": 0.403444292764885, + "grad_norm": 0.996834397315979, + "learning_rate": 0.00018768415854553068, + "loss": 0.293, + "step": 1921 + }, + { + "epoch": 0.4036543106164024, + "grad_norm": 0.9461717009544373, + "learning_rate": 0.00018766745438412384, + "loss": 0.1942, + "step": 1922 + }, + { + "epoch": 0.40386432846791975, + "grad_norm": 0.4621502757072449, + "learning_rate": 0.0001876507396467279, + "loss": 0.1086, + "step": 1923 + }, + { + "epoch": 0.40407434631943717, + "grad_norm": 0.7377074360847473, + "learning_rate": 0.0001876340143353593, + "loss": 0.2052, + "step": 1924 + }, + { + "epoch": 0.40428436417095454, + "grad_norm": 0.6381657123565674, + "learning_rate": 0.00018761727845203568, + "loss": 0.2239, + "step": 1925 + }, + { + "epoch": 0.4044943820224719, + "grad_norm": 0.6710989475250244, + "learning_rate": 0.0001876005319987761, + "loss": 0.241, + "step": 1926 + }, + { + "epoch": 0.40470439987398926, + "grad_norm": 0.6272912621498108, + "learning_rate": 0.00018758377497760074, + "loss": 0.1398, + "step": 1927 + }, + { + "epoch": 0.4049144177255067, + "grad_norm": 0.7110686302185059, + "learning_rate": 0.00018756700739053117, + "loss": 0.1825, + "step": 1928 + }, + { + "epoch": 0.40512443557702404, + "grad_norm": 0.38758787512779236, + "learning_rate": 0.00018755022923959018, + "loss": 0.1484, + "step": 1929 + }, + { + "epoch": 0.4053344534285414, + "grad_norm": 0.6288896799087524, + "learning_rate": 0.00018753344052680184, + "loss": 0.2315, + "step": 1930 + }, + { + "epoch": 0.4055444712800588, + "grad_norm": 0.5400965809822083, + "learning_rate": 0.00018751664125419152, + "loss": 0.1389, + "step": 1931 + }, + { + "epoch": 0.4057544891315762, + "grad_norm": 0.754919707775116, + "learning_rate": 0.00018749983142378582, + "loss": 0.238, + "step": 1932 + }, + { + "epoch": 0.40596450698309355, + "grad_norm": 0.5651653409004211, + "learning_rate": 0.00018748301103761264, + "loss": 0.1729, + "step": 1933 + }, + { + "epoch": 0.4061745248346109, + "grad_norm": 0.6878918409347534, + "learning_rate": 0.00018746618009770117, + "loss": 0.1804, + "step": 1934 + }, + { + "epoch": 0.40638454268612834, + "grad_norm": 0.46758532524108887, + "learning_rate": 0.00018744933860608183, + "loss": 0.1225, + "step": 1935 + }, + { + "epoch": 0.4065945605376457, + "grad_norm": 0.5796334147453308, + "learning_rate": 0.00018743248656478634, + "loss": 0.1805, + "step": 1936 + }, + { + "epoch": 0.40680457838916306, + "grad_norm": 0.6088680028915405, + "learning_rate": 0.00018741562397584768, + "loss": 0.1575, + "step": 1937 + }, + { + "epoch": 0.4070145962406805, + "grad_norm": 0.5850851535797119, + "learning_rate": 0.00018739875084130013, + "loss": 0.1373, + "step": 1938 + }, + { + "epoch": 0.40722461409219785, + "grad_norm": 0.5626951456069946, + "learning_rate": 0.00018738186716317924, + "loss": 0.1653, + "step": 1939 + }, + { + "epoch": 0.4074346319437152, + "grad_norm": 0.5403121113777161, + "learning_rate": 0.0001873649729435218, + "loss": 0.253, + "step": 1940 + }, + { + "epoch": 0.40764464979523257, + "grad_norm": 0.5512644052505493, + "learning_rate": 0.00018734806818436584, + "loss": 0.1745, + "step": 1941 + }, + { + "epoch": 0.40785466764675, + "grad_norm": 0.4115297794342041, + "learning_rate": 0.00018733115288775077, + "loss": 0.1222, + "step": 1942 + }, + { + "epoch": 0.40806468549826735, + "grad_norm": 0.43221354484558105, + "learning_rate": 0.00018731422705571725, + "loss": 0.1172, + "step": 1943 + }, + { + "epoch": 0.4082747033497847, + "grad_norm": 0.5241899490356445, + "learning_rate": 0.00018729729069030704, + "loss": 0.1785, + "step": 1944 + }, + { + "epoch": 0.40848472120130214, + "grad_norm": 0.8017915487289429, + "learning_rate": 0.00018728034379356342, + "loss": 0.1877, + "step": 1945 + }, + { + "epoch": 0.4086947390528195, + "grad_norm": 0.5832831263542175, + "learning_rate": 0.00018726338636753078, + "loss": 0.1465, + "step": 1946 + }, + { + "epoch": 0.40890475690433686, + "grad_norm": 0.6607633829116821, + "learning_rate": 0.00018724641841425478, + "loss": 0.1821, + "step": 1947 + }, + { + "epoch": 0.4091147747558542, + "grad_norm": 0.8016804456710815, + "learning_rate": 0.00018722943993578248, + "loss": 0.2567, + "step": 1948 + }, + { + "epoch": 0.40932479260737165, + "grad_norm": 0.5984741449356079, + "learning_rate": 0.00018721245093416208, + "loss": 0.2095, + "step": 1949 + }, + { + "epoch": 0.409534810458889, + "grad_norm": 0.7901354432106018, + "learning_rate": 0.00018719545141144308, + "loss": 0.1916, + "step": 1950 + }, + { + "epoch": 0.4097448283104064, + "grad_norm": 0.5149995684623718, + "learning_rate": 0.00018717844136967624, + "loss": 0.227, + "step": 1951 + }, + { + "epoch": 0.40995484616192374, + "grad_norm": 0.6096331477165222, + "learning_rate": 0.00018716142081091368, + "loss": 0.2901, + "step": 1952 + }, + { + "epoch": 0.41016486401344115, + "grad_norm": 0.5911889672279358, + "learning_rate": 0.00018714438973720866, + "loss": 0.3882, + "step": 1953 + }, + { + "epoch": 0.4103748818649585, + "grad_norm": 0.5551436543464661, + "learning_rate": 0.0001871273481506158, + "loss": 0.1997, + "step": 1954 + }, + { + "epoch": 0.4105848997164759, + "grad_norm": 0.5714811682701111, + "learning_rate": 0.00018711029605319093, + "loss": 0.2507, + "step": 1955 + }, + { + "epoch": 0.4107949175679933, + "grad_norm": 0.6794240474700928, + "learning_rate": 0.00018709323344699117, + "loss": 0.2722, + "step": 1956 + }, + { + "epoch": 0.41100493541951066, + "grad_norm": 0.6706187725067139, + "learning_rate": 0.00018707616033407498, + "loss": 0.2413, + "step": 1957 + }, + { + "epoch": 0.411214953271028, + "grad_norm": 0.7831113934516907, + "learning_rate": 0.0001870590767165019, + "loss": 0.2789, + "step": 1958 + }, + { + "epoch": 0.4114249711225454, + "grad_norm": 0.8539621829986572, + "learning_rate": 0.00018704198259633298, + "loss": 0.3216, + "step": 1959 + }, + { + "epoch": 0.4116349889740628, + "grad_norm": 0.7270777821540833, + "learning_rate": 0.00018702487797563034, + "loss": 0.1851, + "step": 1960 + }, + { + "epoch": 0.4118450068255802, + "grad_norm": 0.49483752250671387, + "learning_rate": 0.00018700776285645744, + "loss": 0.2089, + "step": 1961 + }, + { + "epoch": 0.41205502467709754, + "grad_norm": 0.5273351073265076, + "learning_rate": 0.00018699063724087904, + "loss": 0.1737, + "step": 1962 + }, + { + "epoch": 0.41226504252861496, + "grad_norm": 0.5160996317863464, + "learning_rate": 0.0001869735011309611, + "loss": 0.142, + "step": 1963 + }, + { + "epoch": 0.4124750603801323, + "grad_norm": 0.5540682077407837, + "learning_rate": 0.00018695635452877093, + "loss": 0.3374, + "step": 1964 + }, + { + "epoch": 0.4126850782316497, + "grad_norm": 0.6244357228279114, + "learning_rate": 0.000186939197436377, + "loss": 0.1763, + "step": 1965 + }, + { + "epoch": 0.41289509608316705, + "grad_norm": 0.6462785005569458, + "learning_rate": 0.00018692202985584909, + "loss": 0.1546, + "step": 1966 + }, + { + "epoch": 0.41310511393468446, + "grad_norm": 0.6570327877998352, + "learning_rate": 0.0001869048517892583, + "loss": 0.1957, + "step": 1967 + }, + { + "epoch": 0.41331513178620183, + "grad_norm": 0.45998433232307434, + "learning_rate": 0.00018688766323867695, + "loss": 0.1414, + "step": 1968 + }, + { + "epoch": 0.4135251496377192, + "grad_norm": 0.6960569024085999, + "learning_rate": 0.0001868704642061786, + "loss": 0.1752, + "step": 1969 + }, + { + "epoch": 0.4137351674892366, + "grad_norm": 0.5571821928024292, + "learning_rate": 0.0001868532546938381, + "loss": 0.1502, + "step": 1970 + }, + { + "epoch": 0.413945185340754, + "grad_norm": 0.3070127069950104, + "learning_rate": 0.0001868360347037316, + "loss": 0.0917, + "step": 1971 + }, + { + "epoch": 0.41415520319227134, + "grad_norm": 0.5906655192375183, + "learning_rate": 0.00018681880423793642, + "loss": 0.1616, + "step": 1972 + }, + { + "epoch": 0.4143652210437887, + "grad_norm": 0.49494755268096924, + "learning_rate": 0.00018680156329853125, + "loss": 0.1839, + "step": 1973 + }, + { + "epoch": 0.4145752388953061, + "grad_norm": 0.48421719670295715, + "learning_rate": 0.00018678431188759594, + "loss": 0.1576, + "step": 1974 + }, + { + "epoch": 0.4147852567468235, + "grad_norm": 0.5349069833755493, + "learning_rate": 0.00018676705000721176, + "loss": 0.2045, + "step": 1975 + }, + { + "epoch": 0.41499527459834085, + "grad_norm": 0.49466922879219055, + "learning_rate": 0.00018674977765946105, + "loss": 0.1873, + "step": 1976 + }, + { + "epoch": 0.41520529244985827, + "grad_norm": 0.8801465034484863, + "learning_rate": 0.0001867324948464275, + "loss": 0.2954, + "step": 1977 + }, + { + "epoch": 0.41541531030137563, + "grad_norm": 0.7728480100631714, + "learning_rate": 0.00018671520157019614, + "loss": 0.1919, + "step": 1978 + }, + { + "epoch": 0.415625328152893, + "grad_norm": 0.4381806254386902, + "learning_rate": 0.0001866978978328531, + "loss": 0.1868, + "step": 1979 + }, + { + "epoch": 0.41583534600441036, + "grad_norm": 0.7297528386116028, + "learning_rate": 0.00018668058363648598, + "loss": 0.2358, + "step": 1980 + }, + { + "epoch": 0.4160453638559278, + "grad_norm": 0.9243682622909546, + "learning_rate": 0.00018666325898318342, + "loss": 0.3351, + "step": 1981 + }, + { + "epoch": 0.41625538170744514, + "grad_norm": 0.6876251101493835, + "learning_rate": 0.00018664592387503543, + "loss": 0.1835, + "step": 1982 + }, + { + "epoch": 0.4164653995589625, + "grad_norm": 0.6412570476531982, + "learning_rate": 0.0001866285783141333, + "loss": 0.1726, + "step": 1983 + }, + { + "epoch": 0.41667541741047986, + "grad_norm": 0.7733229994773865, + "learning_rate": 0.0001866112223025696, + "loss": 0.2133, + "step": 1984 + }, + { + "epoch": 0.4168854352619973, + "grad_norm": 0.6993643045425415, + "learning_rate": 0.00018659385584243805, + "loss": 0.1651, + "step": 1985 + }, + { + "epoch": 0.41709545311351465, + "grad_norm": 0.438504695892334, + "learning_rate": 0.0001865764789358337, + "loss": 0.1346, + "step": 1986 + }, + { + "epoch": 0.417305470965032, + "grad_norm": 0.8238644599914551, + "learning_rate": 0.0001865590915848529, + "loss": 0.2273, + "step": 1987 + }, + { + "epoch": 0.41751548881654943, + "grad_norm": 0.5988185405731201, + "learning_rate": 0.0001865416937915932, + "loss": 0.2296, + "step": 1988 + }, + { + "epoch": 0.4177255066680668, + "grad_norm": 0.3924911618232727, + "learning_rate": 0.0001865242855581534, + "loss": 0.1437, + "step": 1989 + }, + { + "epoch": 0.41793552451958416, + "grad_norm": 0.3869493901729584, + "learning_rate": 0.00018650686688663362, + "loss": 0.1312, + "step": 1990 + }, + { + "epoch": 0.4181455423711015, + "grad_norm": 0.708566427230835, + "learning_rate": 0.0001864894377791352, + "loss": 0.1798, + "step": 1991 + }, + { + "epoch": 0.41835556022261894, + "grad_norm": 0.4604390859603882, + "learning_rate": 0.00018647199823776075, + "loss": 0.1269, + "step": 1992 + }, + { + "epoch": 0.4185655780741363, + "grad_norm": 0.6290651559829712, + "learning_rate": 0.00018645454826461414, + "loss": 0.306, + "step": 1993 + }, + { + "epoch": 0.41877559592565367, + "grad_norm": 0.7372380495071411, + "learning_rate": 0.00018643708786180048, + "loss": 0.1536, + "step": 1994 + }, + { + "epoch": 0.4189856137771711, + "grad_norm": 0.699030339717865, + "learning_rate": 0.00018641961703142612, + "loss": 0.2426, + "step": 1995 + }, + { + "epoch": 0.41919563162868845, + "grad_norm": 0.6631273627281189, + "learning_rate": 0.00018640213577559873, + "loss": 0.1994, + "step": 1996 + }, + { + "epoch": 0.4194056494802058, + "grad_norm": 0.6958602070808411, + "learning_rate": 0.00018638464409642723, + "loss": 0.2799, + "step": 1997 + }, + { + "epoch": 0.4196156673317232, + "grad_norm": 0.8655551075935364, + "learning_rate": 0.00018636714199602174, + "loss": 0.2509, + "step": 1998 + }, + { + "epoch": 0.4198256851832406, + "grad_norm": 0.6360480189323425, + "learning_rate": 0.00018634962947649368, + "loss": 0.1622, + "step": 1999 + }, + { + "epoch": 0.42003570303475796, + "grad_norm": 0.6162030100822449, + "learning_rate": 0.00018633210653995572, + "loss": 0.2662, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 9522, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.958982538786406e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}