diff --git "a/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-1000/trainer_state.json" "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Qwen2.5-Coder-7B-Instruct-MathQA/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21001785151737898, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021001785151737897, + "grad_norm": 0.5090809464454651, + "learning_rate": 4.19287211740042e-07, + "loss": 1.1155, + "step": 1 + }, + { + "epoch": 0.00042003570303475793, + "grad_norm": 0.6277585625648499, + "learning_rate": 8.38574423480084e-07, + "loss": 1.3855, + "step": 2 + }, + { + "epoch": 0.000630053554552137, + "grad_norm": 0.7032777070999146, + "learning_rate": 1.257861635220126e-06, + "loss": 1.3905, + "step": 3 + }, + { + "epoch": 0.0008400714060695159, + "grad_norm": 0.745212197303772, + "learning_rate": 1.677148846960168e-06, + "loss": 1.4668, + "step": 4 + }, + { + "epoch": 0.0010500892575868949, + "grad_norm": 0.7577304840087891, + "learning_rate": 2.09643605870021e-06, + "loss": 1.473, + "step": 5 + }, + { + "epoch": 0.001260107109104274, + "grad_norm": 0.7788395881652832, + "learning_rate": 2.515723270440252e-06, + "loss": 1.485, + "step": 6 + }, + { + "epoch": 0.001470124960621653, + "grad_norm": 0.7430889010429382, + "learning_rate": 2.935010482180294e-06, + "loss": 1.4338, + "step": 7 + }, + { + "epoch": 0.0016801428121390317, + "grad_norm": 0.8291558623313904, + "learning_rate": 3.354297693920336e-06, + "loss": 1.4768, + "step": 8 + }, + { + "epoch": 0.0018901606636564107, + "grad_norm": 0.7731107473373413, + "learning_rate": 3.7735849056603773e-06, + "loss": 1.5853, + "step": 9 + }, + { + "epoch": 0.0021001785151737898, + "grad_norm": 0.8241227269172668, + "learning_rate": 4.19287211740042e-06, + "loss": 1.7314, + "step": 10 + }, + { + "epoch": 0.002310196366691169, + "grad_norm": 0.8158630728721619, + "learning_rate": 4.612159329140462e-06, + "loss": 1.5916, + "step": 11 + }, + { + "epoch": 0.002520214218208548, + "grad_norm": 0.8860861659049988, + "learning_rate": 5.031446540880504e-06, + "loss": 1.5115, + "step": 12 + }, + { + "epoch": 0.002730232069725927, + "grad_norm": 0.868651270866394, + "learning_rate": 5.4507337526205454e-06, + "loss": 1.7514, + "step": 13 + }, + { + "epoch": 0.002940249921243306, + "grad_norm": 0.9116117358207703, + "learning_rate": 5.870020964360588e-06, + "loss": 1.5645, + "step": 14 + }, + { + "epoch": 0.003150267772760685, + "grad_norm": 0.8694919347763062, + "learning_rate": 6.289308176100629e-06, + "loss": 1.6326, + "step": 15 + }, + { + "epoch": 0.0033602856242780635, + "grad_norm": 0.8614499568939209, + "learning_rate": 6.708595387840672e-06, + "loss": 1.6224, + "step": 16 + }, + { + "epoch": 0.0035703034757954425, + "grad_norm": 0.8713967800140381, + "learning_rate": 7.127882599580712e-06, + "loss": 1.5923, + "step": 17 + }, + { + "epoch": 0.0037803213273128215, + "grad_norm": 0.8446964025497437, + "learning_rate": 7.547169811320755e-06, + "loss": 1.5843, + "step": 18 + }, + { + "epoch": 0.0039903391788302005, + "grad_norm": 0.8920742869377136, + "learning_rate": 7.966457023060797e-06, + "loss": 1.5485, + "step": 19 + }, + { + "epoch": 0.0042003570303475795, + "grad_norm": 0.9501891136169434, + "learning_rate": 8.38574423480084e-06, + "loss": 1.663, + "step": 20 + }, + { + "epoch": 0.0044103748818649586, + "grad_norm": 0.9179856181144714, + "learning_rate": 8.80503144654088e-06, + "loss": 1.6163, + "step": 21 + }, + { + "epoch": 0.004620392733382338, + "grad_norm": 0.8716169595718384, + "learning_rate": 9.224318658280923e-06, + "loss": 1.5311, + "step": 22 + }, + { + "epoch": 0.004830410584899717, + "grad_norm": 0.9034018516540527, + "learning_rate": 9.643605870020965e-06, + "loss": 1.5315, + "step": 23 + }, + { + "epoch": 0.005040428436417096, + "grad_norm": 0.9811834692955017, + "learning_rate": 1.0062893081761008e-05, + "loss": 1.5356, + "step": 24 + }, + { + "epoch": 0.005250446287934475, + "grad_norm": 0.8846603035926819, + "learning_rate": 1.0482180293501048e-05, + "loss": 1.5815, + "step": 25 + }, + { + "epoch": 0.005460464139451854, + "grad_norm": 0.8842517137527466, + "learning_rate": 1.0901467505241091e-05, + "loss": 1.5628, + "step": 26 + }, + { + "epoch": 0.005670481990969233, + "grad_norm": 0.9207525253295898, + "learning_rate": 1.1320754716981132e-05, + "loss": 1.5623, + "step": 27 + }, + { + "epoch": 0.005880499842486612, + "grad_norm": 0.9082942605018616, + "learning_rate": 1.1740041928721176e-05, + "loss": 1.4599, + "step": 28 + }, + { + "epoch": 0.006090517694003991, + "grad_norm": 0.8724138736724854, + "learning_rate": 1.2159329140461215e-05, + "loss": 1.5282, + "step": 29 + }, + { + "epoch": 0.00630053554552137, + "grad_norm": 0.8738006353378296, + "learning_rate": 1.2578616352201259e-05, + "loss": 1.4782, + "step": 30 + }, + { + "epoch": 0.006510553397038748, + "grad_norm": 0.9410291910171509, + "learning_rate": 1.29979035639413e-05, + "loss": 1.3856, + "step": 31 + }, + { + "epoch": 0.006720571248556127, + "grad_norm": 0.9309423565864563, + "learning_rate": 1.3417190775681343e-05, + "loss": 1.4267, + "step": 32 + }, + { + "epoch": 0.006930589100073506, + "grad_norm": 0.9442999362945557, + "learning_rate": 1.3836477987421385e-05, + "loss": 1.3706, + "step": 33 + }, + { + "epoch": 0.007140606951590885, + "grad_norm": 0.9511269927024841, + "learning_rate": 1.4255765199161425e-05, + "loss": 1.26, + "step": 34 + }, + { + "epoch": 0.007350624803108264, + "grad_norm": 1.0389297008514404, + "learning_rate": 1.467505241090147e-05, + "loss": 1.235, + "step": 35 + }, + { + "epoch": 0.007560642654625643, + "grad_norm": 1.0033001899719238, + "learning_rate": 1.509433962264151e-05, + "loss": 1.2687, + "step": 36 + }, + { + "epoch": 0.007770660506143022, + "grad_norm": 1.075852632522583, + "learning_rate": 1.5513626834381552e-05, + "loss": 1.2762, + "step": 37 + }, + { + "epoch": 0.007980678357660401, + "grad_norm": 1.0721476078033447, + "learning_rate": 1.5932914046121594e-05, + "loss": 1.1935, + "step": 38 + }, + { + "epoch": 0.00819069620917778, + "grad_norm": 1.0784581899642944, + "learning_rate": 1.6352201257861635e-05, + "loss": 1.1119, + "step": 39 + }, + { + "epoch": 0.008400714060695159, + "grad_norm": 1.1390137672424316, + "learning_rate": 1.677148846960168e-05, + "loss": 1.0373, + "step": 40 + }, + { + "epoch": 0.008610731912212538, + "grad_norm": 1.3073922395706177, + "learning_rate": 1.719077568134172e-05, + "loss": 1.1536, + "step": 41 + }, + { + "epoch": 0.008820749763729917, + "grad_norm": 1.3248019218444824, + "learning_rate": 1.761006289308176e-05, + "loss": 0.9316, + "step": 42 + }, + { + "epoch": 0.009030767615247296, + "grad_norm": 1.3569798469543457, + "learning_rate": 1.8029350104821805e-05, + "loss": 0.8881, + "step": 43 + }, + { + "epoch": 0.009240785466764675, + "grad_norm": 1.3192838430404663, + "learning_rate": 1.8448637316561846e-05, + "loss": 0.8825, + "step": 44 + }, + { + "epoch": 0.009450803318282054, + "grad_norm": 1.1947859525680542, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.9415, + "step": 45 + }, + { + "epoch": 0.009660821169799433, + "grad_norm": 1.1684753894805908, + "learning_rate": 1.928721174004193e-05, + "loss": 0.83, + "step": 46 + }, + { + "epoch": 0.009870839021316812, + "grad_norm": 1.1097474098205566, + "learning_rate": 1.970649895178197e-05, + "loss": 0.7248, + "step": 47 + }, + { + "epoch": 0.010080856872834191, + "grad_norm": 1.0564842224121094, + "learning_rate": 2.0125786163522016e-05, + "loss": 0.7427, + "step": 48 + }, + { + "epoch": 0.01029087472435157, + "grad_norm": 0.9865881204605103, + "learning_rate": 2.0545073375262054e-05, + "loss": 0.877, + "step": 49 + }, + { + "epoch": 0.01050089257586895, + "grad_norm": 1.072039246559143, + "learning_rate": 2.0964360587002095e-05, + "loss": 0.8473, + "step": 50 + }, + { + "epoch": 0.010710910427386328, + "grad_norm": 0.5596430897712708, + "learning_rate": 2.138364779874214e-05, + "loss": 0.4599, + "step": 51 + }, + { + "epoch": 0.010920928278903707, + "grad_norm": 0.6180581450462341, + "learning_rate": 2.1802935010482182e-05, + "loss": 0.5215, + "step": 52 + }, + { + "epoch": 0.011130946130421086, + "grad_norm": 0.6805194616317749, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.6148, + "step": 53 + }, + { + "epoch": 0.011340963981938465, + "grad_norm": 0.7125585079193115, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.5615, + "step": 54 + }, + { + "epoch": 0.011550981833455844, + "grad_norm": 0.6816964745521545, + "learning_rate": 2.3060796645702306e-05, + "loss": 0.478, + "step": 55 + }, + { + "epoch": 0.011760999684973223, + "grad_norm": 0.5821985602378845, + "learning_rate": 2.348008385744235e-05, + "loss": 0.5604, + "step": 56 + }, + { + "epoch": 0.011971017536490602, + "grad_norm": 0.642721951007843, + "learning_rate": 2.3899371069182393e-05, + "loss": 0.4068, + "step": 57 + }, + { + "epoch": 0.012181035388007981, + "grad_norm": 0.5806999206542969, + "learning_rate": 2.431865828092243e-05, + "loss": 0.4711, + "step": 58 + }, + { + "epoch": 0.01239105323952536, + "grad_norm": 0.6702911257743835, + "learning_rate": 2.4737945492662476e-05, + "loss": 0.4601, + "step": 59 + }, + { + "epoch": 0.01260107109104274, + "grad_norm": 0.6345894932746887, + "learning_rate": 2.5157232704402517e-05, + "loss": 0.4172, + "step": 60 + }, + { + "epoch": 0.012811088942560117, + "grad_norm": 0.6444422602653503, + "learning_rate": 2.5576519916142562e-05, + "loss": 0.4635, + "step": 61 + }, + { + "epoch": 0.013021106794077496, + "grad_norm": 0.6568206548690796, + "learning_rate": 2.59958071278826e-05, + "loss": 0.4327, + "step": 62 + }, + { + "epoch": 0.013231124645594875, + "grad_norm": 0.6627638936042786, + "learning_rate": 2.641509433962264e-05, + "loss": 0.4935, + "step": 63 + }, + { + "epoch": 0.013441142497112254, + "grad_norm": 0.6746403574943542, + "learning_rate": 2.6834381551362687e-05, + "loss": 0.4013, + "step": 64 + }, + { + "epoch": 0.013651160348629633, + "grad_norm": 0.7141286134719849, + "learning_rate": 2.7253668763102725e-05, + "loss": 0.5162, + "step": 65 + }, + { + "epoch": 0.013861178200147012, + "grad_norm": 0.779960572719574, + "learning_rate": 2.767295597484277e-05, + "loss": 0.4126, + "step": 66 + }, + { + "epoch": 0.014071196051664391, + "grad_norm": 0.6626395583152771, + "learning_rate": 2.809224318658281e-05, + "loss": 0.3925, + "step": 67 + }, + { + "epoch": 0.01428121390318177, + "grad_norm": 0.6545393466949463, + "learning_rate": 2.851153039832285e-05, + "loss": 0.3152, + "step": 68 + }, + { + "epoch": 0.014491231754699149, + "grad_norm": 0.7004114389419556, + "learning_rate": 2.8930817610062894e-05, + "loss": 0.4223, + "step": 69 + }, + { + "epoch": 0.014701249606216528, + "grad_norm": 0.6912452578544617, + "learning_rate": 2.935010482180294e-05, + "loss": 0.3089, + "step": 70 + }, + { + "epoch": 0.014911267457733907, + "grad_norm": 0.7729060649871826, + "learning_rate": 2.976939203354298e-05, + "loss": 0.4045, + "step": 71 + }, + { + "epoch": 0.015121285309251286, + "grad_norm": 0.7606898546218872, + "learning_rate": 3.018867924528302e-05, + "loss": 0.3079, + "step": 72 + }, + { + "epoch": 0.015331303160768665, + "grad_norm": 0.6202028393745422, + "learning_rate": 3.060796645702306e-05, + "loss": 0.3833, + "step": 73 + }, + { + "epoch": 0.015541321012286044, + "grad_norm": 0.6014758348464966, + "learning_rate": 3.1027253668763105e-05, + "loss": 0.3815, + "step": 74 + }, + { + "epoch": 0.015751338863803425, + "grad_norm": 0.6792122721672058, + "learning_rate": 3.144654088050314e-05, + "loss": 0.3383, + "step": 75 + }, + { + "epoch": 0.015961356715320802, + "grad_norm": 0.7135879993438721, + "learning_rate": 3.186582809224319e-05, + "loss": 0.3744, + "step": 76 + }, + { + "epoch": 0.016171374566838183, + "grad_norm": 0.6972818374633789, + "learning_rate": 3.228511530398323e-05, + "loss": 0.3256, + "step": 77 + }, + { + "epoch": 0.01638139241835556, + "grad_norm": 0.5925168395042419, + "learning_rate": 3.270440251572327e-05, + "loss": 0.3309, + "step": 78 + }, + { + "epoch": 0.01659141026987294, + "grad_norm": 0.7750416994094849, + "learning_rate": 3.3123689727463316e-05, + "loss": 0.4142, + "step": 79 + }, + { + "epoch": 0.016801428121390318, + "grad_norm": 0.7466484904289246, + "learning_rate": 3.354297693920336e-05, + "loss": 0.2527, + "step": 80 + }, + { + "epoch": 0.0170114459729077, + "grad_norm": 0.7709718942642212, + "learning_rate": 3.39622641509434e-05, + "loss": 0.3717, + "step": 81 + }, + { + "epoch": 0.017221463824425076, + "grad_norm": 0.6134454607963562, + "learning_rate": 3.438155136268344e-05, + "loss": 0.2969, + "step": 82 + }, + { + "epoch": 0.017431481675942453, + "grad_norm": 0.6442283391952515, + "learning_rate": 3.480083857442348e-05, + "loss": 0.3009, + "step": 83 + }, + { + "epoch": 0.017641499527459834, + "grad_norm": 0.6788150072097778, + "learning_rate": 3.522012578616352e-05, + "loss": 0.309, + "step": 84 + }, + { + "epoch": 0.01785151737897721, + "grad_norm": 0.7172322869300842, + "learning_rate": 3.5639412997903565e-05, + "loss": 0.3602, + "step": 85 + }, + { + "epoch": 0.018061535230494592, + "grad_norm": 0.7475742697715759, + "learning_rate": 3.605870020964361e-05, + "loss": 0.1889, + "step": 86 + }, + { + "epoch": 0.01827155308201197, + "grad_norm": 0.7164073586463928, + "learning_rate": 3.647798742138365e-05, + "loss": 0.2062, + "step": 87 + }, + { + "epoch": 0.01848157093352935, + "grad_norm": 0.7514247298240662, + "learning_rate": 3.689727463312369e-05, + "loss": 0.2426, + "step": 88 + }, + { + "epoch": 0.018691588785046728, + "grad_norm": 0.8898234963417053, + "learning_rate": 3.731656184486374e-05, + "loss": 0.3759, + "step": 89 + }, + { + "epoch": 0.01890160663656411, + "grad_norm": 0.8034729361534119, + "learning_rate": 3.7735849056603776e-05, + "loss": 0.2547, + "step": 90 + }, + { + "epoch": 0.019111624488081486, + "grad_norm": 0.771716296672821, + "learning_rate": 3.8155136268343814e-05, + "loss": 0.2125, + "step": 91 + }, + { + "epoch": 0.019321642339598866, + "grad_norm": 0.811174213886261, + "learning_rate": 3.857442348008386e-05, + "loss": 0.3535, + "step": 92 + }, + { + "epoch": 0.019531660191116244, + "grad_norm": 1.0474952459335327, + "learning_rate": 3.8993710691823904e-05, + "loss": 0.3278, + "step": 93 + }, + { + "epoch": 0.019741678042633624, + "grad_norm": 0.752088725566864, + "learning_rate": 3.941299790356394e-05, + "loss": 0.2574, + "step": 94 + }, + { + "epoch": 0.019951695894151, + "grad_norm": 0.9202740788459778, + "learning_rate": 3.983228511530399e-05, + "loss": 0.2618, + "step": 95 + }, + { + "epoch": 0.020161713745668382, + "grad_norm": 0.663686990737915, + "learning_rate": 4.025157232704403e-05, + "loss": 0.1981, + "step": 96 + }, + { + "epoch": 0.02037173159718576, + "grad_norm": 0.7075244784355164, + "learning_rate": 4.067085953878407e-05, + "loss": 0.195, + "step": 97 + }, + { + "epoch": 0.02058174944870314, + "grad_norm": 0.8226995468139648, + "learning_rate": 4.109014675052411e-05, + "loss": 0.3464, + "step": 98 + }, + { + "epoch": 0.020791767300220518, + "grad_norm": 0.826926589012146, + "learning_rate": 4.150943396226415e-05, + "loss": 0.241, + "step": 99 + }, + { + "epoch": 0.0210017851517379, + "grad_norm": 0.8767513632774353, + "learning_rate": 4.192872117400419e-05, + "loss": 0.33, + "step": 100 + }, + { + "epoch": 0.021211803003255276, + "grad_norm": 0.9166819453239441, + "learning_rate": 4.2348008385744236e-05, + "loss": 0.3528, + "step": 101 + }, + { + "epoch": 0.021421820854772657, + "grad_norm": 0.6607112288475037, + "learning_rate": 4.276729559748428e-05, + "loss": 0.3294, + "step": 102 + }, + { + "epoch": 0.021631838706290034, + "grad_norm": 0.5891725420951843, + "learning_rate": 4.318658280922432e-05, + "loss": 0.2523, + "step": 103 + }, + { + "epoch": 0.021841856557807415, + "grad_norm": 0.5484351515769958, + "learning_rate": 4.3605870020964364e-05, + "loss": 0.3563, + "step": 104 + }, + { + "epoch": 0.022051874409324792, + "grad_norm": 0.6384206414222717, + "learning_rate": 4.402515723270441e-05, + "loss": 0.5014, + "step": 105 + }, + { + "epoch": 0.022261892260842173, + "grad_norm": 0.6228074431419373, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.297, + "step": 106 + }, + { + "epoch": 0.02247191011235955, + "grad_norm": 0.6993734240531921, + "learning_rate": 4.4863731656184485e-05, + "loss": 0.3416, + "step": 107 + }, + { + "epoch": 0.02268192796387693, + "grad_norm": 0.5191211104393005, + "learning_rate": 4.528301886792453e-05, + "loss": 0.2403, + "step": 108 + }, + { + "epoch": 0.022891945815394308, + "grad_norm": 0.5719013214111328, + "learning_rate": 4.570230607966457e-05, + "loss": 0.2226, + "step": 109 + }, + { + "epoch": 0.02310196366691169, + "grad_norm": 0.5222904682159424, + "learning_rate": 4.612159329140461e-05, + "loss": 0.2119, + "step": 110 + }, + { + "epoch": 0.023311981518429066, + "grad_norm": 0.4741697609424591, + "learning_rate": 4.654088050314466e-05, + "loss": 0.2076, + "step": 111 + }, + { + "epoch": 0.023521999369946447, + "grad_norm": 0.5350250005722046, + "learning_rate": 4.69601677148847e-05, + "loss": 0.2342, + "step": 112 + }, + { + "epoch": 0.023732017221463824, + "grad_norm": 0.6532084345817566, + "learning_rate": 4.737945492662474e-05, + "loss": 0.3541, + "step": 113 + }, + { + "epoch": 0.023942035072981205, + "grad_norm": 0.6158542633056641, + "learning_rate": 4.7798742138364785e-05, + "loss": 0.2586, + "step": 114 + }, + { + "epoch": 0.024152052924498582, + "grad_norm": 0.7820281982421875, + "learning_rate": 4.8218029350104823e-05, + "loss": 0.472, + "step": 115 + }, + { + "epoch": 0.024362070776015963, + "grad_norm": 0.6096176505088806, + "learning_rate": 4.863731656184486e-05, + "loss": 0.3014, + "step": 116 + }, + { + "epoch": 0.02457208862753334, + "grad_norm": 0.5152641534805298, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.2032, + "step": 117 + }, + { + "epoch": 0.02478210647905072, + "grad_norm": 0.6049755215644836, + "learning_rate": 4.947589098532495e-05, + "loss": 0.2932, + "step": 118 + }, + { + "epoch": 0.024992124330568098, + "grad_norm": 0.5203216075897217, + "learning_rate": 4.989517819706499e-05, + "loss": 0.1978, + "step": 119 + }, + { + "epoch": 0.02520214218208548, + "grad_norm": 0.6881438493728638, + "learning_rate": 5.0314465408805034e-05, + "loss": 0.3873, + "step": 120 + }, + { + "epoch": 0.025412160033602856, + "grad_norm": 0.602206289768219, + "learning_rate": 5.073375262054507e-05, + "loss": 0.2508, + "step": 121 + }, + { + "epoch": 0.025622177885120234, + "grad_norm": 0.7059246897697449, + "learning_rate": 5.1153039832285124e-05, + "loss": 0.2661, + "step": 122 + }, + { + "epoch": 0.025832195736637614, + "grad_norm": 0.44054412841796875, + "learning_rate": 5.157232704402516e-05, + "loss": 0.177, + "step": 123 + }, + { + "epoch": 0.02604221358815499, + "grad_norm": 0.6321287155151367, + "learning_rate": 5.19916142557652e-05, + "loss": 0.2849, + "step": 124 + }, + { + "epoch": 0.026252231439672372, + "grad_norm": 0.7430282235145569, + "learning_rate": 5.2410901467505245e-05, + "loss": 0.3353, + "step": 125 + }, + { + "epoch": 0.02646224929118975, + "grad_norm": 0.6884610056877136, + "learning_rate": 5.283018867924528e-05, + "loss": 0.2935, + "step": 126 + }, + { + "epoch": 0.02667226714270713, + "grad_norm": 0.8021960854530334, + "learning_rate": 5.324947589098532e-05, + "loss": 0.2609, + "step": 127 + }, + { + "epoch": 0.026882284994224508, + "grad_norm": 0.5545848608016968, + "learning_rate": 5.366876310272537e-05, + "loss": 0.1972, + "step": 128 + }, + { + "epoch": 0.02709230284574189, + "grad_norm": 0.6628248691558838, + "learning_rate": 5.408805031446541e-05, + "loss": 0.2184, + "step": 129 + }, + { + "epoch": 0.027302320697259266, + "grad_norm": 0.5908805131912231, + "learning_rate": 5.450733752620545e-05, + "loss": 0.2029, + "step": 130 + }, + { + "epoch": 0.027512338548776646, + "grad_norm": 0.6377450823783875, + "learning_rate": 5.49266247379455e-05, + "loss": 0.2382, + "step": 131 + }, + { + "epoch": 0.027722356400294024, + "grad_norm": 0.7006211876869202, + "learning_rate": 5.534591194968554e-05, + "loss": 0.1936, + "step": 132 + }, + { + "epoch": 0.027932374251811404, + "grad_norm": 0.5962005257606506, + "learning_rate": 5.576519916142558e-05, + "loss": 0.2922, + "step": 133 + }, + { + "epoch": 0.028142392103328782, + "grad_norm": 0.6030206084251404, + "learning_rate": 5.618448637316562e-05, + "loss": 0.1629, + "step": 134 + }, + { + "epoch": 0.028352409954846162, + "grad_norm": 0.7888013124465942, + "learning_rate": 5.660377358490566e-05, + "loss": 0.2866, + "step": 135 + }, + { + "epoch": 0.02856242780636354, + "grad_norm": 0.5116386413574219, + "learning_rate": 5.70230607966457e-05, + "loss": 0.1963, + "step": 136 + }, + { + "epoch": 0.02877244565788092, + "grad_norm": 0.6759427785873413, + "learning_rate": 5.744234800838575e-05, + "loss": 0.2412, + "step": 137 + }, + { + "epoch": 0.028982463509398298, + "grad_norm": 0.8643584847450256, + "learning_rate": 5.786163522012579e-05, + "loss": 0.2277, + "step": 138 + }, + { + "epoch": 0.02919248136091568, + "grad_norm": 0.639639139175415, + "learning_rate": 5.8280922431865826e-05, + "loss": 0.2286, + "step": 139 + }, + { + "epoch": 0.029402499212433056, + "grad_norm": 0.6094908714294434, + "learning_rate": 5.870020964360588e-05, + "loss": 0.1656, + "step": 140 + }, + { + "epoch": 0.029612517063950437, + "grad_norm": 0.7927185297012329, + "learning_rate": 5.9119496855345916e-05, + "loss": 0.2436, + "step": 141 + }, + { + "epoch": 0.029822534915467814, + "grad_norm": 0.8780869841575623, + "learning_rate": 5.953878406708596e-05, + "loss": 0.2614, + "step": 142 + }, + { + "epoch": 0.030032552766985195, + "grad_norm": 0.5985304117202759, + "learning_rate": 5.9958071278826e-05, + "loss": 0.2268, + "step": 143 + }, + { + "epoch": 0.030242570618502572, + "grad_norm": 0.6452706456184387, + "learning_rate": 6.037735849056604e-05, + "loss": 0.211, + "step": 144 + }, + { + "epoch": 0.030452588470019953, + "grad_norm": 0.8015931844711304, + "learning_rate": 6.079664570230609e-05, + "loss": 0.3532, + "step": 145 + }, + { + "epoch": 0.03066260632153733, + "grad_norm": 0.667226254940033, + "learning_rate": 6.121593291404612e-05, + "loss": 0.2051, + "step": 146 + }, + { + "epoch": 0.03087262417305471, + "grad_norm": 0.6942270398139954, + "learning_rate": 6.163522012578616e-05, + "loss": 0.2516, + "step": 147 + }, + { + "epoch": 0.031082642024572088, + "grad_norm": 0.845588743686676, + "learning_rate": 6.205450733752621e-05, + "loss": 0.257, + "step": 148 + }, + { + "epoch": 0.031292659876089465, + "grad_norm": 0.6104562878608704, + "learning_rate": 6.247379454926625e-05, + "loss": 0.246, + "step": 149 + }, + { + "epoch": 0.03150267772760685, + "grad_norm": 0.7243993282318115, + "learning_rate": 6.289308176100629e-05, + "loss": 0.2623, + "step": 150 + }, + { + "epoch": 0.03171269557912423, + "grad_norm": 0.6479102373123169, + "learning_rate": 6.331236897274634e-05, + "loss": 0.3154, + "step": 151 + }, + { + "epoch": 0.031922713430641604, + "grad_norm": 0.6088507175445557, + "learning_rate": 6.373165618448638e-05, + "loss": 0.3639, + "step": 152 + }, + { + "epoch": 0.03213273128215898, + "grad_norm": 0.5590083599090576, + "learning_rate": 6.415094339622641e-05, + "loss": 0.2753, + "step": 153 + }, + { + "epoch": 0.032342749133676366, + "grad_norm": 0.6644802093505859, + "learning_rate": 6.457023060796647e-05, + "loss": 0.2722, + "step": 154 + }, + { + "epoch": 0.03255276698519374, + "grad_norm": 0.6034846901893616, + "learning_rate": 6.49895178197065e-05, + "loss": 0.3084, + "step": 155 + }, + { + "epoch": 0.03276278483671112, + "grad_norm": 0.897366464138031, + "learning_rate": 6.540880503144654e-05, + "loss": 0.2834, + "step": 156 + }, + { + "epoch": 0.0329728026882285, + "grad_norm": 0.7516223788261414, + "learning_rate": 6.58280922431866e-05, + "loss": 0.2515, + "step": 157 + }, + { + "epoch": 0.03318282053974588, + "grad_norm": 0.712957501411438, + "learning_rate": 6.624737945492663e-05, + "loss": 0.217, + "step": 158 + }, + { + "epoch": 0.03339283839126326, + "grad_norm": 0.6373322010040283, + "learning_rate": 6.666666666666667e-05, + "loss": 0.3091, + "step": 159 + }, + { + "epoch": 0.033602856242780636, + "grad_norm": 0.6305301189422607, + "learning_rate": 6.708595387840672e-05, + "loss": 0.1965, + "step": 160 + }, + { + "epoch": 0.033812874094298014, + "grad_norm": 0.6340491771697998, + "learning_rate": 6.750524109014676e-05, + "loss": 0.2316, + "step": 161 + }, + { + "epoch": 0.0340228919458154, + "grad_norm": 0.6992335915565491, + "learning_rate": 6.79245283018868e-05, + "loss": 0.321, + "step": 162 + }, + { + "epoch": 0.034232909797332775, + "grad_norm": 0.723899245262146, + "learning_rate": 6.834381551362684e-05, + "loss": 0.2057, + "step": 163 + }, + { + "epoch": 0.03444292764885015, + "grad_norm": 0.6245738863945007, + "learning_rate": 6.876310272536687e-05, + "loss": 0.2367, + "step": 164 + }, + { + "epoch": 0.03465294550036753, + "grad_norm": 0.716299295425415, + "learning_rate": 6.918238993710691e-05, + "loss": 0.3107, + "step": 165 + }, + { + "epoch": 0.03486296335188491, + "grad_norm": 0.8374738097190857, + "learning_rate": 6.960167714884696e-05, + "loss": 0.4097, + "step": 166 + }, + { + "epoch": 0.03507298120340229, + "grad_norm": 0.7812545299530029, + "learning_rate": 7.0020964360587e-05, + "loss": 0.389, + "step": 167 + }, + { + "epoch": 0.03528299905491967, + "grad_norm": 0.516504168510437, + "learning_rate": 7.044025157232704e-05, + "loss": 0.2321, + "step": 168 + }, + { + "epoch": 0.035493016906437046, + "grad_norm": 0.5948511958122253, + "learning_rate": 7.085953878406709e-05, + "loss": 0.2075, + "step": 169 + }, + { + "epoch": 0.03570303475795442, + "grad_norm": 0.5658239126205444, + "learning_rate": 7.127882599580713e-05, + "loss": 0.2366, + "step": 170 + }, + { + "epoch": 0.03591305260947181, + "grad_norm": 0.44888898730278015, + "learning_rate": 7.169811320754717e-05, + "loss": 0.226, + "step": 171 + }, + { + "epoch": 0.036123070460989185, + "grad_norm": 0.5403774380683899, + "learning_rate": 7.211740041928722e-05, + "loss": 0.2887, + "step": 172 + }, + { + "epoch": 0.03633308831250656, + "grad_norm": 0.5742720365524292, + "learning_rate": 7.253668763102726e-05, + "loss": 0.1841, + "step": 173 + }, + { + "epoch": 0.03654310616402394, + "grad_norm": 0.7217287421226501, + "learning_rate": 7.29559748427673e-05, + "loss": 0.283, + "step": 174 + }, + { + "epoch": 0.03675312401554132, + "grad_norm": 0.6517660021781921, + "learning_rate": 7.337526205450735e-05, + "loss": 0.277, + "step": 175 + }, + { + "epoch": 0.0369631418670587, + "grad_norm": 0.5237565040588379, + "learning_rate": 7.379454926624739e-05, + "loss": 0.2764, + "step": 176 + }, + { + "epoch": 0.03717315971857608, + "grad_norm": 0.5715314745903015, + "learning_rate": 7.421383647798742e-05, + "loss": 0.2947, + "step": 177 + }, + { + "epoch": 0.037383177570093455, + "grad_norm": 0.39689743518829346, + "learning_rate": 7.463312368972748e-05, + "loss": 0.1478, + "step": 178 + }, + { + "epoch": 0.03759319542161084, + "grad_norm": 0.62773197889328, + "learning_rate": 7.505241090146751e-05, + "loss": 0.2688, + "step": 179 + }, + { + "epoch": 0.03780321327312822, + "grad_norm": 0.5422549247741699, + "learning_rate": 7.547169811320755e-05, + "loss": 0.2852, + "step": 180 + }, + { + "epoch": 0.038013231124645594, + "grad_norm": 0.7973243594169617, + "learning_rate": 7.589098532494759e-05, + "loss": 0.2414, + "step": 181 + }, + { + "epoch": 0.03822324897616297, + "grad_norm": 0.596788227558136, + "learning_rate": 7.631027253668763e-05, + "loss": 0.2979, + "step": 182 + }, + { + "epoch": 0.038433266827680355, + "grad_norm": 0.7164194583892822, + "learning_rate": 7.672955974842768e-05, + "loss": 0.3195, + "step": 183 + }, + { + "epoch": 0.03864328467919773, + "grad_norm": 0.6374505758285522, + "learning_rate": 7.714884696016772e-05, + "loss": 0.2244, + "step": 184 + }, + { + "epoch": 0.03885330253071511, + "grad_norm": 0.7066443562507629, + "learning_rate": 7.756813417190776e-05, + "loss": 0.3328, + "step": 185 + }, + { + "epoch": 0.03906332038223249, + "grad_norm": 0.5930470824241638, + "learning_rate": 7.798742138364781e-05, + "loss": 0.208, + "step": 186 + }, + { + "epoch": 0.03927333823374987, + "grad_norm": 0.7578011155128479, + "learning_rate": 7.840670859538785e-05, + "loss": 0.3468, + "step": 187 + }, + { + "epoch": 0.03948335608526725, + "grad_norm": 0.5424745678901672, + "learning_rate": 7.882599580712788e-05, + "loss": 0.159, + "step": 188 + }, + { + "epoch": 0.039693373936784626, + "grad_norm": 0.6554968953132629, + "learning_rate": 7.924528301886794e-05, + "loss": 0.1718, + "step": 189 + }, + { + "epoch": 0.039903391788302, + "grad_norm": 0.596862256526947, + "learning_rate": 7.966457023060797e-05, + "loss": 0.2238, + "step": 190 + }, + { + "epoch": 0.04011340963981939, + "grad_norm": 0.7238299250602722, + "learning_rate": 8.008385744234801e-05, + "loss": 0.2322, + "step": 191 + }, + { + "epoch": 0.040323427491336765, + "grad_norm": 0.6230559349060059, + "learning_rate": 8.050314465408806e-05, + "loss": 0.166, + "step": 192 + }, + { + "epoch": 0.04053344534285414, + "grad_norm": 0.63409823179245, + "learning_rate": 8.09224318658281e-05, + "loss": 0.2431, + "step": 193 + }, + { + "epoch": 0.04074346319437152, + "grad_norm": 0.43581536412239075, + "learning_rate": 8.134171907756814e-05, + "loss": 0.1758, + "step": 194 + }, + { + "epoch": 0.040953481045888904, + "grad_norm": 0.5425090789794922, + "learning_rate": 8.176100628930818e-05, + "loss": 0.1214, + "step": 195 + }, + { + "epoch": 0.04116349889740628, + "grad_norm": 0.44201138615608215, + "learning_rate": 8.218029350104822e-05, + "loss": 0.148, + "step": 196 + }, + { + "epoch": 0.04137351674892366, + "grad_norm": 0.8185025453567505, + "learning_rate": 8.259958071278825e-05, + "loss": 0.2788, + "step": 197 + }, + { + "epoch": 0.041583534600441036, + "grad_norm": 0.5838762521743774, + "learning_rate": 8.30188679245283e-05, + "loss": 0.2547, + "step": 198 + }, + { + "epoch": 0.04179355245195842, + "grad_norm": 0.6128750443458557, + "learning_rate": 8.343815513626834e-05, + "loss": 0.2107, + "step": 199 + }, + { + "epoch": 0.0420035703034758, + "grad_norm": 0.6906862854957581, + "learning_rate": 8.385744234800838e-05, + "loss": 0.2066, + "step": 200 + }, + { + "epoch": 0.042213588154993174, + "grad_norm": 0.46028971672058105, + "learning_rate": 8.427672955974843e-05, + "loss": 0.3119, + "step": 201 + }, + { + "epoch": 0.04242360600651055, + "grad_norm": 0.45515576004981995, + "learning_rate": 8.469601677148847e-05, + "loss": 0.3191, + "step": 202 + }, + { + "epoch": 0.04263362385802793, + "grad_norm": 0.5129204392433167, + "learning_rate": 8.511530398322851e-05, + "loss": 0.2978, + "step": 203 + }, + { + "epoch": 0.04284364170954531, + "grad_norm": 0.5413036942481995, + "learning_rate": 8.553459119496856e-05, + "loss": 0.2489, + "step": 204 + }, + { + "epoch": 0.04305365956106269, + "grad_norm": 0.5111532211303711, + "learning_rate": 8.59538784067086e-05, + "loss": 0.2322, + "step": 205 + }, + { + "epoch": 0.04326367741258007, + "grad_norm": 0.50229412317276, + "learning_rate": 8.637316561844864e-05, + "loss": 0.2858, + "step": 206 + }, + { + "epoch": 0.043473695264097445, + "grad_norm": 0.5821024179458618, + "learning_rate": 8.679245283018869e-05, + "loss": 0.3118, + "step": 207 + }, + { + "epoch": 0.04368371311561483, + "grad_norm": 0.5523480176925659, + "learning_rate": 8.721174004192873e-05, + "loss": 0.3757, + "step": 208 + }, + { + "epoch": 0.043893730967132207, + "grad_norm": 0.721248209476471, + "learning_rate": 8.763102725366877e-05, + "loss": 0.3196, + "step": 209 + }, + { + "epoch": 0.044103748818649584, + "grad_norm": 0.47591283917427063, + "learning_rate": 8.805031446540882e-05, + "loss": 0.233, + "step": 210 + }, + { + "epoch": 0.04431376667016696, + "grad_norm": 0.5817727446556091, + "learning_rate": 8.846960167714886e-05, + "loss": 0.2924, + "step": 211 + }, + { + "epoch": 0.044523784521684345, + "grad_norm": 0.5370981097221375, + "learning_rate": 8.888888888888889e-05, + "loss": 0.2487, + "step": 212 + }, + { + "epoch": 0.04473380237320172, + "grad_norm": 0.47605571150779724, + "learning_rate": 8.930817610062893e-05, + "loss": 0.2249, + "step": 213 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 0.6315035223960876, + "learning_rate": 8.972746331236897e-05, + "loss": 0.2805, + "step": 214 + }, + { + "epoch": 0.04515383807623648, + "grad_norm": 0.7511045932769775, + "learning_rate": 9.014675052410901e-05, + "loss": 0.2235, + "step": 215 + }, + { + "epoch": 0.04536385592775386, + "grad_norm": 0.6990196704864502, + "learning_rate": 9.056603773584906e-05, + "loss": 0.2548, + "step": 216 + }, + { + "epoch": 0.04557387377927124, + "grad_norm": 0.6954676508903503, + "learning_rate": 9.09853249475891e-05, + "loss": 0.2232, + "step": 217 + }, + { + "epoch": 0.045783891630788616, + "grad_norm": 0.6898313164710999, + "learning_rate": 9.140461215932914e-05, + "loss": 0.2241, + "step": 218 + }, + { + "epoch": 0.04599390948230599, + "grad_norm": 0.649360716342926, + "learning_rate": 9.182389937106919e-05, + "loss": 0.3236, + "step": 219 + }, + { + "epoch": 0.04620392733382338, + "grad_norm": 0.6722248792648315, + "learning_rate": 9.224318658280923e-05, + "loss": 0.4025, + "step": 220 + }, + { + "epoch": 0.046413945185340755, + "grad_norm": 0.8752652406692505, + "learning_rate": 9.266247379454928e-05, + "loss": 0.2937, + "step": 221 + }, + { + "epoch": 0.04662396303685813, + "grad_norm": 0.6925809979438782, + "learning_rate": 9.308176100628931e-05, + "loss": 0.2719, + "step": 222 + }, + { + "epoch": 0.04683398088837551, + "grad_norm": 0.6006962656974792, + "learning_rate": 9.350104821802935e-05, + "loss": 0.3025, + "step": 223 + }, + { + "epoch": 0.047043998739892894, + "grad_norm": 0.7841734290122986, + "learning_rate": 9.39203354297694e-05, + "loss": 0.267, + "step": 224 + }, + { + "epoch": 0.04725401659141027, + "grad_norm": 0.6895375847816467, + "learning_rate": 9.433962264150944e-05, + "loss": 0.185, + "step": 225 + }, + { + "epoch": 0.04746403444292765, + "grad_norm": 0.4874520003795624, + "learning_rate": 9.475890985324948e-05, + "loss": 0.1967, + "step": 226 + }, + { + "epoch": 0.047674052294445025, + "grad_norm": 0.5807353258132935, + "learning_rate": 9.517819706498953e-05, + "loss": 0.2649, + "step": 227 + }, + { + "epoch": 0.04788407014596241, + "grad_norm": 0.5833479166030884, + "learning_rate": 9.559748427672957e-05, + "loss": 0.2506, + "step": 228 + }, + { + "epoch": 0.04809408799747979, + "grad_norm": 0.5458546280860901, + "learning_rate": 9.601677148846961e-05, + "loss": 0.2338, + "step": 229 + }, + { + "epoch": 0.048304105848997164, + "grad_norm": 0.543692409992218, + "learning_rate": 9.643605870020965e-05, + "loss": 0.2058, + "step": 230 + }, + { + "epoch": 0.04851412370051454, + "grad_norm": 0.8071588277816772, + "learning_rate": 9.685534591194969e-05, + "loss": 0.2658, + "step": 231 + }, + { + "epoch": 0.048724141552031926, + "grad_norm": 0.6677277088165283, + "learning_rate": 9.727463312368972e-05, + "loss": 0.3603, + "step": 232 + }, + { + "epoch": 0.0489341594035493, + "grad_norm": 0.6699347496032715, + "learning_rate": 9.769392033542977e-05, + "loss": 0.2474, + "step": 233 + }, + { + "epoch": 0.04914417725506668, + "grad_norm": 0.47732773423194885, + "learning_rate": 9.811320754716981e-05, + "loss": 0.1856, + "step": 234 + }, + { + "epoch": 0.04935419510658406, + "grad_norm": 0.738476037979126, + "learning_rate": 9.853249475890985e-05, + "loss": 0.2443, + "step": 235 + }, + { + "epoch": 0.04956421295810144, + "grad_norm": 0.6604114174842834, + "learning_rate": 9.89517819706499e-05, + "loss": 0.2656, + "step": 236 + }, + { + "epoch": 0.04977423080961882, + "grad_norm": 0.6403035521507263, + "learning_rate": 9.937106918238994e-05, + "loss": 0.2248, + "step": 237 + }, + { + "epoch": 0.049984248661136196, + "grad_norm": 0.7960561513900757, + "learning_rate": 9.979035639412998e-05, + "loss": 0.5721, + "step": 238 + }, + { + "epoch": 0.050194266512653574, + "grad_norm": 0.7372507452964783, + "learning_rate": 0.00010020964360587002, + "loss": 0.263, + "step": 239 + }, + { + "epoch": 0.05040428436417096, + "grad_norm": 0.5040899515151978, + "learning_rate": 0.00010062893081761007, + "loss": 0.2005, + "step": 240 + }, + { + "epoch": 0.050614302215688335, + "grad_norm": 0.5214698314666748, + "learning_rate": 0.00010104821802935012, + "loss": 0.1686, + "step": 241 + }, + { + "epoch": 0.05082432006720571, + "grad_norm": 0.5759347677230835, + "learning_rate": 0.00010146750524109014, + "loss": 0.213, + "step": 242 + }, + { + "epoch": 0.05103433791872309, + "grad_norm": 0.5980076789855957, + "learning_rate": 0.0001018867924528302, + "loss": 0.2073, + "step": 243 + }, + { + "epoch": 0.05124435577024047, + "grad_norm": 0.4733896851539612, + "learning_rate": 0.00010230607966457025, + "loss": 0.2275, + "step": 244 + }, + { + "epoch": 0.05145437362175785, + "grad_norm": 0.597186803817749, + "learning_rate": 0.00010272536687631027, + "loss": 0.2205, + "step": 245 + }, + { + "epoch": 0.05166439147327523, + "grad_norm": 0.7414665818214417, + "learning_rate": 0.00010314465408805032, + "loss": 0.2796, + "step": 246 + }, + { + "epoch": 0.051874409324792606, + "grad_norm": 0.5712472200393677, + "learning_rate": 0.00010356394129979036, + "loss": 0.2343, + "step": 247 + }, + { + "epoch": 0.05208442717630998, + "grad_norm": 0.7146592140197754, + "learning_rate": 0.0001039832285115304, + "loss": 0.2339, + "step": 248 + }, + { + "epoch": 0.05229444502782737, + "grad_norm": 0.5384387373924255, + "learning_rate": 0.00010440251572327044, + "loss": 0.3013, + "step": 249 + }, + { + "epoch": 0.052504462879344745, + "grad_norm": 0.6523765921592712, + "learning_rate": 0.00010482180293501049, + "loss": 0.2295, + "step": 250 + }, + { + "epoch": 0.05271448073086212, + "grad_norm": 0.5877966284751892, + "learning_rate": 0.00010524109014675052, + "loss": 0.3045, + "step": 251 + }, + { + "epoch": 0.0529244985823795, + "grad_norm": 0.5458659529685974, + "learning_rate": 0.00010566037735849057, + "loss": 0.3458, + "step": 252 + }, + { + "epoch": 0.05313451643389688, + "grad_norm": 0.6964541673660278, + "learning_rate": 0.00010607966457023062, + "loss": 0.3715, + "step": 253 + }, + { + "epoch": 0.05334453428541426, + "grad_norm": 0.4040902853012085, + "learning_rate": 0.00010649895178197064, + "loss": 0.1554, + "step": 254 + }, + { + "epoch": 0.05355455213693164, + "grad_norm": 0.7013939023017883, + "learning_rate": 0.0001069182389937107, + "loss": 0.4111, + "step": 255 + }, + { + "epoch": 0.053764569988449015, + "grad_norm": 0.622279942035675, + "learning_rate": 0.00010733752620545075, + "loss": 0.2715, + "step": 256 + }, + { + "epoch": 0.0539745878399664, + "grad_norm": 0.6869280934333801, + "learning_rate": 0.00010775681341719077, + "loss": 0.3444, + "step": 257 + }, + { + "epoch": 0.05418460569148378, + "grad_norm": 0.6003340482711792, + "learning_rate": 0.00010817610062893082, + "loss": 0.261, + "step": 258 + }, + { + "epoch": 0.054394623543001154, + "grad_norm": 0.4238702356815338, + "learning_rate": 0.00010859538784067087, + "loss": 0.2615, + "step": 259 + }, + { + "epoch": 0.05460464139451853, + "grad_norm": 0.5799490809440613, + "learning_rate": 0.0001090146750524109, + "loss": 0.3293, + "step": 260 + }, + { + "epoch": 0.054814659246035916, + "grad_norm": 0.6378821134567261, + "learning_rate": 0.00010943396226415095, + "loss": 0.3184, + "step": 261 + }, + { + "epoch": 0.05502467709755329, + "grad_norm": 0.540050745010376, + "learning_rate": 0.000109853249475891, + "loss": 0.262, + "step": 262 + }, + { + "epoch": 0.05523469494907067, + "grad_norm": 0.551968514919281, + "learning_rate": 0.00011027253668763103, + "loss": 0.3183, + "step": 263 + }, + { + "epoch": 0.05544471280058805, + "grad_norm": 0.6024414896965027, + "learning_rate": 0.00011069182389937108, + "loss": 0.2883, + "step": 264 + }, + { + "epoch": 0.05565473065210543, + "grad_norm": 0.6821377277374268, + "learning_rate": 0.00011111111111111112, + "loss": 0.3178, + "step": 265 + }, + { + "epoch": 0.05586474850362281, + "grad_norm": 0.6535030603408813, + "learning_rate": 0.00011153039832285115, + "loss": 0.2091, + "step": 266 + }, + { + "epoch": 0.056074766355140186, + "grad_norm": 0.5255767703056335, + "learning_rate": 0.00011194968553459119, + "loss": 0.1802, + "step": 267 + }, + { + "epoch": 0.056284784206657563, + "grad_norm": 0.59894859790802, + "learning_rate": 0.00011236897274633124, + "loss": 0.2526, + "step": 268 + }, + { + "epoch": 0.05649480205817495, + "grad_norm": 0.6401522159576416, + "learning_rate": 0.00011278825995807127, + "loss": 0.2462, + "step": 269 + }, + { + "epoch": 0.056704819909692325, + "grad_norm": 0.6755663752555847, + "learning_rate": 0.00011320754716981132, + "loss": 0.1973, + "step": 270 + }, + { + "epoch": 0.0569148377612097, + "grad_norm": 0.5885902643203735, + "learning_rate": 0.00011362683438155137, + "loss": 0.2111, + "step": 271 + }, + { + "epoch": 0.05712485561272708, + "grad_norm": 0.48771098256111145, + "learning_rate": 0.0001140461215932914, + "loss": 0.2409, + "step": 272 + }, + { + "epoch": 0.057334873464244464, + "grad_norm": 0.5513054132461548, + "learning_rate": 0.00011446540880503145, + "loss": 0.1883, + "step": 273 + }, + { + "epoch": 0.05754489131576184, + "grad_norm": 0.43761372566223145, + "learning_rate": 0.0001148846960167715, + "loss": 0.228, + "step": 274 + }, + { + "epoch": 0.05775490916727922, + "grad_norm": 0.5232881307601929, + "learning_rate": 0.00011530398322851152, + "loss": 0.1592, + "step": 275 + }, + { + "epoch": 0.057964927018796596, + "grad_norm": 0.5873312950134277, + "learning_rate": 0.00011572327044025158, + "loss": 0.1972, + "step": 276 + }, + { + "epoch": 0.05817494487031398, + "grad_norm": 0.5464483499526978, + "learning_rate": 0.00011614255765199163, + "loss": 0.1502, + "step": 277 + }, + { + "epoch": 0.05838496272183136, + "grad_norm": 0.6480989456176758, + "learning_rate": 0.00011656184486373165, + "loss": 0.2349, + "step": 278 + }, + { + "epoch": 0.058594980573348734, + "grad_norm": 0.41417571902275085, + "learning_rate": 0.0001169811320754717, + "loss": 0.2097, + "step": 279 + }, + { + "epoch": 0.05880499842486611, + "grad_norm": 0.8272523880004883, + "learning_rate": 0.00011740041928721176, + "loss": 0.2676, + "step": 280 + }, + { + "epoch": 0.059015016276383496, + "grad_norm": 0.6363915205001831, + "learning_rate": 0.0001178197064989518, + "loss": 0.3084, + "step": 281 + }, + { + "epoch": 0.05922503412790087, + "grad_norm": 0.6411394476890564, + "learning_rate": 0.00011823899371069183, + "loss": 0.244, + "step": 282 + }, + { + "epoch": 0.05943505197941825, + "grad_norm": 0.9145995378494263, + "learning_rate": 0.00011865828092243187, + "loss": 0.298, + "step": 283 + }, + { + "epoch": 0.05964506983093563, + "grad_norm": 0.7248232960700989, + "learning_rate": 0.00011907756813417192, + "loss": 0.2438, + "step": 284 + }, + { + "epoch": 0.059855087682453005, + "grad_norm": 0.4901827573776245, + "learning_rate": 0.00011949685534591195, + "loss": 0.1903, + "step": 285 + }, + { + "epoch": 0.06006510553397039, + "grad_norm": 0.5104687809944153, + "learning_rate": 0.000119916142557652, + "loss": 0.2014, + "step": 286 + }, + { + "epoch": 0.06027512338548777, + "grad_norm": 0.5063393712043762, + "learning_rate": 0.00012033542976939205, + "loss": 0.212, + "step": 287 + }, + { + "epoch": 0.060485141237005144, + "grad_norm": 0.6044209599494934, + "learning_rate": 0.00012075471698113207, + "loss": 0.3138, + "step": 288 + }, + { + "epoch": 0.06069515908852252, + "grad_norm": 0.5843082666397095, + "learning_rate": 0.00012117400419287213, + "loss": 0.2199, + "step": 289 + }, + { + "epoch": 0.060905176940039905, + "grad_norm": 0.4589983820915222, + "learning_rate": 0.00012159329140461218, + "loss": 0.2222, + "step": 290 + }, + { + "epoch": 0.06111519479155728, + "grad_norm": 0.4094448983669281, + "learning_rate": 0.0001220125786163522, + "loss": 0.1286, + "step": 291 + }, + { + "epoch": 0.06132521264307466, + "grad_norm": 0.42624855041503906, + "learning_rate": 0.00012243186582809224, + "loss": 0.2719, + "step": 292 + }, + { + "epoch": 0.06153523049459204, + "grad_norm": 0.5488569736480713, + "learning_rate": 0.0001228511530398323, + "loss": 0.2588, + "step": 293 + }, + { + "epoch": 0.06174524834610942, + "grad_norm": 0.6029438972473145, + "learning_rate": 0.00012327044025157232, + "loss": 0.3182, + "step": 294 + }, + { + "epoch": 0.0619552661976268, + "grad_norm": 0.49090123176574707, + "learning_rate": 0.00012368972746331237, + "loss": 0.2192, + "step": 295 + }, + { + "epoch": 0.062165284049144176, + "grad_norm": 0.7553131580352783, + "learning_rate": 0.00012410901467505242, + "loss": 0.2932, + "step": 296 + }, + { + "epoch": 0.06237530190066155, + "grad_norm": 0.6839373707771301, + "learning_rate": 0.00012452830188679244, + "loss": 0.1896, + "step": 297 + }, + { + "epoch": 0.06258531975217893, + "grad_norm": 0.5805861949920654, + "learning_rate": 0.0001249475890985325, + "loss": 0.2613, + "step": 298 + }, + { + "epoch": 0.06279533760369631, + "grad_norm": 0.4247298836708069, + "learning_rate": 0.00012536687631027255, + "loss": 0.1701, + "step": 299 + }, + { + "epoch": 0.0630053554552137, + "grad_norm": 0.6167422533035278, + "learning_rate": 0.00012578616352201257, + "loss": 0.2919, + "step": 300 + }, + { + "epoch": 0.06321537330673108, + "grad_norm": 0.5140472054481506, + "learning_rate": 0.00012620545073375262, + "loss": 0.2204, + "step": 301 + }, + { + "epoch": 0.06342539115824845, + "grad_norm": 0.48360675573349, + "learning_rate": 0.00012662473794549268, + "loss": 0.2625, + "step": 302 + }, + { + "epoch": 0.06363540900976583, + "grad_norm": 0.5805841684341431, + "learning_rate": 0.0001270440251572327, + "loss": 0.2659, + "step": 303 + }, + { + "epoch": 0.06384542686128321, + "grad_norm": 0.4108704924583435, + "learning_rate": 0.00012746331236897275, + "loss": 0.1757, + "step": 304 + }, + { + "epoch": 0.06405544471280059, + "grad_norm": 0.4739980697631836, + "learning_rate": 0.0001278825995807128, + "loss": 0.2413, + "step": 305 + }, + { + "epoch": 0.06426546256431796, + "grad_norm": 0.6421864032745361, + "learning_rate": 0.00012830188679245283, + "loss": 0.3373, + "step": 306 + }, + { + "epoch": 0.06447548041583534, + "grad_norm": 0.6035056114196777, + "learning_rate": 0.00012872117400419288, + "loss": 0.1632, + "step": 307 + }, + { + "epoch": 0.06468549826735273, + "grad_norm": 0.5946957468986511, + "learning_rate": 0.00012914046121593293, + "loss": 0.2797, + "step": 308 + }, + { + "epoch": 0.06489551611887011, + "grad_norm": 0.5636250972747803, + "learning_rate": 0.00012955974842767296, + "loss": 0.3484, + "step": 309 + }, + { + "epoch": 0.06510553397038749, + "grad_norm": 0.5175902843475342, + "learning_rate": 0.000129979035639413, + "loss": 0.2306, + "step": 310 + }, + { + "epoch": 0.06531555182190486, + "grad_norm": 0.39933711290359497, + "learning_rate": 0.00013039832285115306, + "loss": 0.2018, + "step": 311 + }, + { + "epoch": 0.06552556967342224, + "grad_norm": 0.6203914284706116, + "learning_rate": 0.00013081761006289308, + "loss": 0.2519, + "step": 312 + }, + { + "epoch": 0.06573558752493962, + "grad_norm": 0.6847423911094666, + "learning_rate": 0.00013123689727463314, + "loss": 0.2125, + "step": 313 + }, + { + "epoch": 0.065945605376457, + "grad_norm": 0.5958030223846436, + "learning_rate": 0.0001316561844863732, + "loss": 0.2019, + "step": 314 + }, + { + "epoch": 0.06615562322797437, + "grad_norm": 0.4878827631473541, + "learning_rate": 0.0001320754716981132, + "loss": 0.2286, + "step": 315 + }, + { + "epoch": 0.06636564107949176, + "grad_norm": 0.5386853814125061, + "learning_rate": 0.00013249475890985326, + "loss": 0.2349, + "step": 316 + }, + { + "epoch": 0.06657565893100914, + "grad_norm": 0.5583687424659729, + "learning_rate": 0.00013291404612159332, + "loss": 0.273, + "step": 317 + }, + { + "epoch": 0.06678567678252652, + "grad_norm": 0.503384530544281, + "learning_rate": 0.00013333333333333334, + "loss": 0.2718, + "step": 318 + }, + { + "epoch": 0.0669956946340439, + "grad_norm": 0.6256868839263916, + "learning_rate": 0.0001337526205450734, + "loss": 0.2176, + "step": 319 + }, + { + "epoch": 0.06720571248556127, + "grad_norm": 0.4585525095462799, + "learning_rate": 0.00013417190775681344, + "loss": 0.1671, + "step": 320 + }, + { + "epoch": 0.06741573033707865, + "grad_norm": 0.52493816614151, + "learning_rate": 0.00013459119496855347, + "loss": 0.2129, + "step": 321 + }, + { + "epoch": 0.06762574818859603, + "grad_norm": 0.7206648588180542, + "learning_rate": 0.00013501048218029352, + "loss": 0.1872, + "step": 322 + }, + { + "epoch": 0.0678357660401134, + "grad_norm": 0.5732535123825073, + "learning_rate": 0.00013542976939203354, + "loss": 0.2131, + "step": 323 + }, + { + "epoch": 0.0680457838916308, + "grad_norm": 0.5404482483863831, + "learning_rate": 0.0001358490566037736, + "loss": 0.1472, + "step": 324 + }, + { + "epoch": 0.06825580174314817, + "grad_norm": 0.7235817313194275, + "learning_rate": 0.00013626834381551362, + "loss": 0.2404, + "step": 325 + }, + { + "epoch": 0.06846581959466555, + "grad_norm": 0.4254133999347687, + "learning_rate": 0.00013668763102725367, + "loss": 0.2133, + "step": 326 + }, + { + "epoch": 0.06867583744618293, + "grad_norm": 0.4804741144180298, + "learning_rate": 0.0001371069182389937, + "loss": 0.1776, + "step": 327 + }, + { + "epoch": 0.0688858552977003, + "grad_norm": 0.4900747537612915, + "learning_rate": 0.00013752620545073375, + "loss": 0.1958, + "step": 328 + }, + { + "epoch": 0.06909587314921768, + "grad_norm": 0.576337456703186, + "learning_rate": 0.0001379454926624738, + "loss": 0.2318, + "step": 329 + }, + { + "epoch": 0.06930589100073506, + "grad_norm": 0.5610971450805664, + "learning_rate": 0.00013836477987421382, + "loss": 0.2631, + "step": 330 + }, + { + "epoch": 0.06951590885225244, + "grad_norm": 0.6010019779205322, + "learning_rate": 0.00013878406708595388, + "loss": 0.2201, + "step": 331 + }, + { + "epoch": 0.06972592670376981, + "grad_norm": 0.4658229947090149, + "learning_rate": 0.00013920335429769393, + "loss": 0.1602, + "step": 332 + }, + { + "epoch": 0.0699359445552872, + "grad_norm": 0.5411532521247864, + "learning_rate": 0.00013962264150943395, + "loss": 0.1942, + "step": 333 + }, + { + "epoch": 0.07014596240680458, + "grad_norm": 0.875629186630249, + "learning_rate": 0.000140041928721174, + "loss": 0.2337, + "step": 334 + }, + { + "epoch": 0.07035598025832196, + "grad_norm": 0.5620985627174377, + "learning_rate": 0.00014046121593291406, + "loss": 0.2641, + "step": 335 + }, + { + "epoch": 0.07056599810983934, + "grad_norm": 0.8389297723770142, + "learning_rate": 0.00014088050314465408, + "loss": 0.3069, + "step": 336 + }, + { + "epoch": 0.07077601596135671, + "grad_norm": 0.4745865762233734, + "learning_rate": 0.00014129979035639413, + "loss": 0.1626, + "step": 337 + }, + { + "epoch": 0.07098603381287409, + "grad_norm": 0.4688374996185303, + "learning_rate": 0.00014171907756813418, + "loss": 0.1327, + "step": 338 + }, + { + "epoch": 0.07119605166439147, + "grad_norm": 0.4219890832901001, + "learning_rate": 0.0001421383647798742, + "loss": 0.142, + "step": 339 + }, + { + "epoch": 0.07140606951590885, + "grad_norm": 0.700579047203064, + "learning_rate": 0.00014255765199161426, + "loss": 0.179, + "step": 340 + }, + { + "epoch": 0.07161608736742624, + "grad_norm": 0.36132583022117615, + "learning_rate": 0.0001429769392033543, + "loss": 0.1283, + "step": 341 + }, + { + "epoch": 0.07182610521894361, + "grad_norm": 0.9342030882835388, + "learning_rate": 0.00014339622641509434, + "loss": 0.3873, + "step": 342 + }, + { + "epoch": 0.07203612307046099, + "grad_norm": 0.6389639973640442, + "learning_rate": 0.0001438155136268344, + "loss": 0.2631, + "step": 343 + }, + { + "epoch": 0.07224614092197837, + "grad_norm": 0.7687662243843079, + "learning_rate": 0.00014423480083857444, + "loss": 0.2002, + "step": 344 + }, + { + "epoch": 0.07245615877349575, + "grad_norm": 0.6517148613929749, + "learning_rate": 0.00014465408805031446, + "loss": 0.2454, + "step": 345 + }, + { + "epoch": 0.07266617662501312, + "grad_norm": 0.5010355710983276, + "learning_rate": 0.00014507337526205452, + "loss": 0.1541, + "step": 346 + }, + { + "epoch": 0.0728761944765305, + "grad_norm": 0.49431943893432617, + "learning_rate": 0.00014549266247379457, + "loss": 0.213, + "step": 347 + }, + { + "epoch": 0.07308621232804788, + "grad_norm": 0.6462149024009705, + "learning_rate": 0.0001459119496855346, + "loss": 0.2642, + "step": 348 + }, + { + "epoch": 0.07329623017956527, + "grad_norm": 0.5412748456001282, + "learning_rate": 0.00014633123689727464, + "loss": 0.187, + "step": 349 + }, + { + "epoch": 0.07350624803108265, + "grad_norm": 0.6458069682121277, + "learning_rate": 0.0001467505241090147, + "loss": 0.3224, + "step": 350 + }, + { + "epoch": 0.07371626588260002, + "grad_norm": 0.4398702383041382, + "learning_rate": 0.00014716981132075472, + "loss": 0.2419, + "step": 351 + }, + { + "epoch": 0.0739262837341174, + "grad_norm": 0.47583240270614624, + "learning_rate": 0.00014758909853249477, + "loss": 0.2259, + "step": 352 + }, + { + "epoch": 0.07413630158563478, + "grad_norm": 0.5058132410049438, + "learning_rate": 0.00014800838574423482, + "loss": 0.3733, + "step": 353 + }, + { + "epoch": 0.07434631943715216, + "grad_norm": 0.4765789210796356, + "learning_rate": 0.00014842767295597485, + "loss": 0.3204, + "step": 354 + }, + { + "epoch": 0.07455633728866953, + "grad_norm": 0.4549868106842041, + "learning_rate": 0.0001488469601677149, + "loss": 0.2788, + "step": 355 + }, + { + "epoch": 0.07476635514018691, + "grad_norm": 0.44640183448791504, + "learning_rate": 0.00014926624737945495, + "loss": 0.225, + "step": 356 + }, + { + "epoch": 0.0749763729917043, + "grad_norm": 0.5040209293365479, + "learning_rate": 0.00014968553459119498, + "loss": 0.288, + "step": 357 + }, + { + "epoch": 0.07518639084322168, + "grad_norm": 0.7681525349617004, + "learning_rate": 0.00015010482180293503, + "loss": 0.2677, + "step": 358 + }, + { + "epoch": 0.07539640869473906, + "grad_norm": 0.3658473491668701, + "learning_rate": 0.00015052410901467505, + "loss": 0.1944, + "step": 359 + }, + { + "epoch": 0.07560642654625643, + "grad_norm": 0.5071917772293091, + "learning_rate": 0.0001509433962264151, + "loss": 0.1937, + "step": 360 + }, + { + "epoch": 0.07581644439777381, + "grad_norm": 0.5669259428977966, + "learning_rate": 0.00015136268343815513, + "loss": 0.2755, + "step": 361 + }, + { + "epoch": 0.07602646224929119, + "grad_norm": 0.5721021294593811, + "learning_rate": 0.00015178197064989518, + "loss": 0.2233, + "step": 362 + }, + { + "epoch": 0.07623648010080857, + "grad_norm": 0.5776953101158142, + "learning_rate": 0.00015220125786163523, + "loss": 0.2918, + "step": 363 + }, + { + "epoch": 0.07644649795232594, + "grad_norm": 0.7863572239875793, + "learning_rate": 0.00015262054507337526, + "loss": 0.3225, + "step": 364 + }, + { + "epoch": 0.07665651580384332, + "grad_norm": 0.7403888702392578, + "learning_rate": 0.0001530398322851153, + "loss": 0.1836, + "step": 365 + }, + { + "epoch": 0.07686653365536071, + "grad_norm": 0.7344810962677002, + "learning_rate": 0.00015345911949685536, + "loss": 0.342, + "step": 366 + }, + { + "epoch": 0.07707655150687809, + "grad_norm": 0.6341666579246521, + "learning_rate": 0.00015387840670859538, + "loss": 0.2222, + "step": 367 + }, + { + "epoch": 0.07728656935839547, + "grad_norm": 0.7821016907691956, + "learning_rate": 0.00015429769392033544, + "loss": 0.3106, + "step": 368 + }, + { + "epoch": 0.07749658720991284, + "grad_norm": 0.5648399591445923, + "learning_rate": 0.0001547169811320755, + "loss": 0.1907, + "step": 369 + }, + { + "epoch": 0.07770660506143022, + "grad_norm": 0.5853981971740723, + "learning_rate": 0.0001551362683438155, + "loss": 0.1873, + "step": 370 + }, + { + "epoch": 0.0779166229129476, + "grad_norm": 0.6429926753044128, + "learning_rate": 0.00015555555555555556, + "loss": 0.177, + "step": 371 + }, + { + "epoch": 0.07812664076446497, + "grad_norm": 0.5365523099899292, + "learning_rate": 0.00015597484276729561, + "loss": 0.2283, + "step": 372 + }, + { + "epoch": 0.07833665861598235, + "grad_norm": 0.4820340871810913, + "learning_rate": 0.00015639412997903564, + "loss": 0.2179, + "step": 373 + }, + { + "epoch": 0.07854667646749974, + "grad_norm": 0.5231903195381165, + "learning_rate": 0.0001568134171907757, + "loss": 0.2165, + "step": 374 + }, + { + "epoch": 0.07875669431901712, + "grad_norm": 0.6309874057769775, + "learning_rate": 0.00015723270440251574, + "loss": 0.2511, + "step": 375 + }, + { + "epoch": 0.0789667121705345, + "grad_norm": 0.6248964667320251, + "learning_rate": 0.00015765199161425577, + "loss": 0.192, + "step": 376 + }, + { + "epoch": 0.07917673002205187, + "grad_norm": 0.4089469611644745, + "learning_rate": 0.00015807127882599582, + "loss": 0.1674, + "step": 377 + }, + { + "epoch": 0.07938674787356925, + "grad_norm": 0.5720129609107971, + "learning_rate": 0.00015849056603773587, + "loss": 0.2571, + "step": 378 + }, + { + "epoch": 0.07959676572508663, + "grad_norm": 0.505424976348877, + "learning_rate": 0.0001589098532494759, + "loss": 0.2189, + "step": 379 + }, + { + "epoch": 0.079806783576604, + "grad_norm": 0.4483712315559387, + "learning_rate": 0.00015932914046121595, + "loss": 0.2959, + "step": 380 + }, + { + "epoch": 0.08001680142812138, + "grad_norm": 0.6313521862030029, + "learning_rate": 0.000159748427672956, + "loss": 0.237, + "step": 381 + }, + { + "epoch": 0.08022681927963878, + "grad_norm": 0.530503511428833, + "learning_rate": 0.00016016771488469602, + "loss": 0.1922, + "step": 382 + }, + { + "epoch": 0.08043683713115615, + "grad_norm": 0.65278160572052, + "learning_rate": 0.00016058700209643607, + "loss": 0.2714, + "step": 383 + }, + { + "epoch": 0.08064685498267353, + "grad_norm": 0.6226363182067871, + "learning_rate": 0.00016100628930817613, + "loss": 0.1913, + "step": 384 + }, + { + "epoch": 0.08085687283419091, + "grad_norm": 0.6313908696174622, + "learning_rate": 0.00016142557651991615, + "loss": 0.2457, + "step": 385 + }, + { + "epoch": 0.08106689068570828, + "grad_norm": 0.5335121750831604, + "learning_rate": 0.0001618448637316562, + "loss": 0.2537, + "step": 386 + }, + { + "epoch": 0.08127690853722566, + "grad_norm": 0.7243566513061523, + "learning_rate": 0.00016226415094339625, + "loss": 0.2125, + "step": 387 + }, + { + "epoch": 0.08148692638874304, + "grad_norm": 0.5874237418174744, + "learning_rate": 0.00016268343815513628, + "loss": 0.2104, + "step": 388 + }, + { + "epoch": 0.08169694424026042, + "grad_norm": 0.5792878866195679, + "learning_rate": 0.00016310272536687633, + "loss": 0.198, + "step": 389 + }, + { + "epoch": 0.08190696209177781, + "grad_norm": 0.5439760088920593, + "learning_rate": 0.00016352201257861635, + "loss": 0.1895, + "step": 390 + }, + { + "epoch": 0.08211697994329518, + "grad_norm": 0.6903837323188782, + "learning_rate": 0.0001639412997903564, + "loss": 0.2587, + "step": 391 + }, + { + "epoch": 0.08232699779481256, + "grad_norm": 0.6126405596733093, + "learning_rate": 0.00016436058700209643, + "loss": 0.2232, + "step": 392 + }, + { + "epoch": 0.08253701564632994, + "grad_norm": 0.9248547554016113, + "learning_rate": 0.00016477987421383648, + "loss": 0.267, + "step": 393 + }, + { + "epoch": 0.08274703349784732, + "grad_norm": 0.6509301066398621, + "learning_rate": 0.0001651991614255765, + "loss": 0.2109, + "step": 394 + }, + { + "epoch": 0.0829570513493647, + "grad_norm": 0.5985137820243835, + "learning_rate": 0.00016561844863731656, + "loss": 0.1783, + "step": 395 + }, + { + "epoch": 0.08316706920088207, + "grad_norm": 0.6711693406105042, + "learning_rate": 0.0001660377358490566, + "loss": 0.2115, + "step": 396 + }, + { + "epoch": 0.08337708705239945, + "grad_norm": 0.4494445025920868, + "learning_rate": 0.00016645702306079664, + "loss": 0.1486, + "step": 397 + }, + { + "epoch": 0.08358710490391684, + "grad_norm": 0.5083547830581665, + "learning_rate": 0.0001668763102725367, + "loss": 0.2317, + "step": 398 + }, + { + "epoch": 0.08379712275543422, + "grad_norm": 0.7552763819694519, + "learning_rate": 0.00016729559748427674, + "loss": 0.236, + "step": 399 + }, + { + "epoch": 0.0840071406069516, + "grad_norm": 0.7656201124191284, + "learning_rate": 0.00016771488469601676, + "loss": 0.2732, + "step": 400 + }, + { + "epoch": 0.08421715845846897, + "grad_norm": 0.3850518465042114, + "learning_rate": 0.00016813417190775681, + "loss": 0.3322, + "step": 401 + }, + { + "epoch": 0.08442717630998635, + "grad_norm": 0.5610989928245544, + "learning_rate": 0.00016855345911949687, + "loss": 0.2844, + "step": 402 + }, + { + "epoch": 0.08463719416150373, + "grad_norm": 0.7500874400138855, + "learning_rate": 0.0001689727463312369, + "loss": 0.3651, + "step": 403 + }, + { + "epoch": 0.0848472120130211, + "grad_norm": 0.45343145728111267, + "learning_rate": 0.00016939203354297694, + "loss": 0.2174, + "step": 404 + }, + { + "epoch": 0.08505722986453848, + "grad_norm": 0.6427581310272217, + "learning_rate": 0.000169811320754717, + "loss": 0.2748, + "step": 405 + }, + { + "epoch": 0.08526724771605586, + "grad_norm": 0.64598149061203, + "learning_rate": 0.00017023060796645702, + "loss": 0.2912, + "step": 406 + }, + { + "epoch": 0.08547726556757325, + "grad_norm": 0.49100759625434875, + "learning_rate": 0.00017064989517819707, + "loss": 0.2582, + "step": 407 + }, + { + "epoch": 0.08568728341909063, + "grad_norm": 0.5637136101722717, + "learning_rate": 0.00017106918238993712, + "loss": 0.254, + "step": 408 + }, + { + "epoch": 0.085897301270608, + "grad_norm": 0.5617924928665161, + "learning_rate": 0.00017148846960167715, + "loss": 0.2043, + "step": 409 + }, + { + "epoch": 0.08610731912212538, + "grad_norm": 0.5467379093170166, + "learning_rate": 0.0001719077568134172, + "loss": 0.2363, + "step": 410 + }, + { + "epoch": 0.08631733697364276, + "grad_norm": 0.6882631778717041, + "learning_rate": 0.00017232704402515725, + "loss": 0.2341, + "step": 411 + }, + { + "epoch": 0.08652735482516014, + "grad_norm": 0.40710386633872986, + "learning_rate": 0.00017274633123689727, + "loss": 0.1952, + "step": 412 + }, + { + "epoch": 0.08673737267667751, + "grad_norm": 0.688685953617096, + "learning_rate": 0.00017316561844863733, + "loss": 0.3588, + "step": 413 + }, + { + "epoch": 0.08694739052819489, + "grad_norm": 0.7739083170890808, + "learning_rate": 0.00017358490566037738, + "loss": 0.1897, + "step": 414 + }, + { + "epoch": 0.08715740837971228, + "grad_norm": 0.45127734541893005, + "learning_rate": 0.0001740041928721174, + "loss": 0.2282, + "step": 415 + }, + { + "epoch": 0.08736742623122966, + "grad_norm": 0.6713837385177612, + "learning_rate": 0.00017442348008385745, + "loss": 0.3395, + "step": 416 + }, + { + "epoch": 0.08757744408274704, + "grad_norm": 0.5886412858963013, + "learning_rate": 0.0001748427672955975, + "loss": 0.1673, + "step": 417 + }, + { + "epoch": 0.08778746193426441, + "grad_norm": 0.6254634261131287, + "learning_rate": 0.00017526205450733753, + "loss": 0.2392, + "step": 418 + }, + { + "epoch": 0.08799747978578179, + "grad_norm": 0.5936654806137085, + "learning_rate": 0.00017568134171907758, + "loss": 0.1817, + "step": 419 + }, + { + "epoch": 0.08820749763729917, + "grad_norm": 0.6107873320579529, + "learning_rate": 0.00017610062893081763, + "loss": 0.2877, + "step": 420 + }, + { + "epoch": 0.08841751548881654, + "grad_norm": 0.583984911441803, + "learning_rate": 0.00017651991614255766, + "loss": 0.2382, + "step": 421 + }, + { + "epoch": 0.08862753334033392, + "grad_norm": 0.6411318778991699, + "learning_rate": 0.0001769392033542977, + "loss": 0.2528, + "step": 422 + }, + { + "epoch": 0.08883755119185131, + "grad_norm": 0.5407703518867493, + "learning_rate": 0.00017735849056603776, + "loss": 0.217, + "step": 423 + }, + { + "epoch": 0.08904756904336869, + "grad_norm": 0.5086292028427124, + "learning_rate": 0.00017777777777777779, + "loss": 0.1703, + "step": 424 + }, + { + "epoch": 0.08925758689488607, + "grad_norm": 0.534488320350647, + "learning_rate": 0.00017819706498951784, + "loss": 0.4212, + "step": 425 + }, + { + "epoch": 0.08946760474640345, + "grad_norm": 0.5869336724281311, + "learning_rate": 0.00017861635220125786, + "loss": 0.3634, + "step": 426 + }, + { + "epoch": 0.08967762259792082, + "grad_norm": 0.5784481763839722, + "learning_rate": 0.00017903563941299791, + "loss": 0.2076, + "step": 427 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 0.467438668012619, + "learning_rate": 0.00017945492662473794, + "loss": 0.1948, + "step": 428 + }, + { + "epoch": 0.09009765830095558, + "grad_norm": 0.8514359593391418, + "learning_rate": 0.000179874213836478, + "loss": 0.2695, + "step": 429 + }, + { + "epoch": 0.09030767615247295, + "grad_norm": 0.630066990852356, + "learning_rate": 0.00018029350104821801, + "loss": 0.2624, + "step": 430 + }, + { + "epoch": 0.09051769400399035, + "grad_norm": 0.6442775130271912, + "learning_rate": 0.00018071278825995807, + "loss": 0.2555, + "step": 431 + }, + { + "epoch": 0.09072771185550772, + "grad_norm": 0.6193580031394958, + "learning_rate": 0.00018113207547169812, + "loss": 0.2388, + "step": 432 + }, + { + "epoch": 0.0909377297070251, + "grad_norm": 1.108219027519226, + "learning_rate": 0.00018155136268343814, + "loss": 0.2135, + "step": 433 + }, + { + "epoch": 0.09114774755854248, + "grad_norm": 0.666748046875, + "learning_rate": 0.0001819706498951782, + "loss": 0.2402, + "step": 434 + }, + { + "epoch": 0.09135776541005985, + "grad_norm": 0.516096293926239, + "learning_rate": 0.00018238993710691825, + "loss": 0.2022, + "step": 435 + }, + { + "epoch": 0.09156778326157723, + "grad_norm": 0.4976787269115448, + "learning_rate": 0.00018280922431865827, + "loss": 0.1492, + "step": 436 + }, + { + "epoch": 0.09177780111309461, + "grad_norm": 0.596254289150238, + "learning_rate": 0.00018322851153039832, + "loss": 0.1926, + "step": 437 + }, + { + "epoch": 0.09198781896461199, + "grad_norm": 0.4079163670539856, + "learning_rate": 0.00018364779874213837, + "loss": 0.219, + "step": 438 + }, + { + "epoch": 0.09219783681612938, + "grad_norm": 0.4968511164188385, + "learning_rate": 0.00018406708595387843, + "loss": 0.2203, + "step": 439 + }, + { + "epoch": 0.09240785466764675, + "grad_norm": 0.5749839544296265, + "learning_rate": 0.00018448637316561845, + "loss": 0.2561, + "step": 440 + }, + { + "epoch": 0.09261787251916413, + "grad_norm": 0.46315014362335205, + "learning_rate": 0.0001849056603773585, + "loss": 0.1608, + "step": 441 + }, + { + "epoch": 0.09282789037068151, + "grad_norm": 0.4630315601825714, + "learning_rate": 0.00018532494758909855, + "loss": 0.1564, + "step": 442 + }, + { + "epoch": 0.09303790822219889, + "grad_norm": 0.5688292384147644, + "learning_rate": 0.00018574423480083858, + "loss": 0.1483, + "step": 443 + }, + { + "epoch": 0.09324792607371626, + "grad_norm": 0.9025551676750183, + "learning_rate": 0.00018616352201257863, + "loss": 0.216, + "step": 444 + }, + { + "epoch": 0.09345794392523364, + "grad_norm": 0.6165971755981445, + "learning_rate": 0.00018658280922431868, + "loss": 0.1852, + "step": 445 + }, + { + "epoch": 0.09366796177675102, + "grad_norm": 0.5040764808654785, + "learning_rate": 0.0001870020964360587, + "loss": 0.1534, + "step": 446 + }, + { + "epoch": 0.0938779796282684, + "grad_norm": 0.6921994686126709, + "learning_rate": 0.00018742138364779876, + "loss": 0.2739, + "step": 447 + }, + { + "epoch": 0.09408799747978579, + "grad_norm": 0.9911003708839417, + "learning_rate": 0.0001878406708595388, + "loss": 0.2148, + "step": 448 + }, + { + "epoch": 0.09429801533130316, + "grad_norm": 0.4098629951477051, + "learning_rate": 0.00018825995807127883, + "loss": 0.1627, + "step": 449 + }, + { + "epoch": 0.09450803318282054, + "grad_norm": 0.5267736315727234, + "learning_rate": 0.00018867924528301889, + "loss": 0.1714, + "step": 450 + }, + { + "epoch": 0.09471805103433792, + "grad_norm": 0.826693058013916, + "learning_rate": 0.00018909853249475894, + "loss": 0.3614, + "step": 451 + }, + { + "epoch": 0.0949280688858553, + "grad_norm": 0.7960173487663269, + "learning_rate": 0.00018951781970649896, + "loss": 0.2831, + "step": 452 + }, + { + "epoch": 0.09513808673737267, + "grad_norm": 0.5408324003219604, + "learning_rate": 0.00018993710691823901, + "loss": 0.2396, + "step": 453 + }, + { + "epoch": 0.09534810458889005, + "grad_norm": 0.5551522374153137, + "learning_rate": 0.00019035639412997907, + "loss": 0.2399, + "step": 454 + }, + { + "epoch": 0.09555812244040743, + "grad_norm": 0.5053918361663818, + "learning_rate": 0.0001907756813417191, + "loss": 0.2054, + "step": 455 + }, + { + "epoch": 0.09576814029192482, + "grad_norm": 0.6408351063728333, + "learning_rate": 0.00019119496855345914, + "loss": 0.2319, + "step": 456 + }, + { + "epoch": 0.0959781581434422, + "grad_norm": 0.6061432361602783, + "learning_rate": 0.0001916142557651992, + "loss": 0.2289, + "step": 457 + }, + { + "epoch": 0.09618817599495957, + "grad_norm": 0.6452487111091614, + "learning_rate": 0.00019203354297693922, + "loss": 0.2787, + "step": 458 + }, + { + "epoch": 0.09639819384647695, + "grad_norm": 0.5427165627479553, + "learning_rate": 0.00019245283018867927, + "loss": 0.3181, + "step": 459 + }, + { + "epoch": 0.09660821169799433, + "grad_norm": 0.5678632259368896, + "learning_rate": 0.0001928721174004193, + "loss": 0.3166, + "step": 460 + }, + { + "epoch": 0.0968182295495117, + "grad_norm": 0.554288923740387, + "learning_rate": 0.00019329140461215935, + "loss": 0.214, + "step": 461 + }, + { + "epoch": 0.09702824740102908, + "grad_norm": 0.7040925621986389, + "learning_rate": 0.00019371069182389937, + "loss": 0.473, + "step": 462 + }, + { + "epoch": 0.09723826525254646, + "grad_norm": 0.6425243020057678, + "learning_rate": 0.00019412997903563942, + "loss": 0.3542, + "step": 463 + }, + { + "epoch": 0.09744828310406385, + "grad_norm": 0.6984371542930603, + "learning_rate": 0.00019454926624737945, + "loss": 0.2165, + "step": 464 + }, + { + "epoch": 0.09765830095558123, + "grad_norm": 0.5204288959503174, + "learning_rate": 0.0001949685534591195, + "loss": 0.232, + "step": 465 + }, + { + "epoch": 0.0978683188070986, + "grad_norm": 0.5688004493713379, + "learning_rate": 0.00019538784067085955, + "loss": 0.3099, + "step": 466 + }, + { + "epoch": 0.09807833665861598, + "grad_norm": 0.4850284159183502, + "learning_rate": 0.00019580712788259957, + "loss": 0.202, + "step": 467 + }, + { + "epoch": 0.09828835451013336, + "grad_norm": 0.5034931302070618, + "learning_rate": 0.00019622641509433963, + "loss": 0.2077, + "step": 468 + }, + { + "epoch": 0.09849837236165074, + "grad_norm": 0.6193839311599731, + "learning_rate": 0.00019664570230607968, + "loss": 0.318, + "step": 469 + }, + { + "epoch": 0.09870839021316812, + "grad_norm": 0.6226887702941895, + "learning_rate": 0.0001970649895178197, + "loss": 0.3538, + "step": 470 + }, + { + "epoch": 0.09891840806468549, + "grad_norm": 0.6102244257926941, + "learning_rate": 0.00019748427672955975, + "loss": 0.2865, + "step": 471 + }, + { + "epoch": 0.09912842591620288, + "grad_norm": 0.6731789112091064, + "learning_rate": 0.0001979035639412998, + "loss": 0.3135, + "step": 472 + }, + { + "epoch": 0.09933844376772026, + "grad_norm": 0.661486029624939, + "learning_rate": 0.00019832285115303983, + "loss": 0.2198, + "step": 473 + }, + { + "epoch": 0.09954846161923764, + "grad_norm": 0.7321748733520508, + "learning_rate": 0.00019874213836477988, + "loss": 0.2557, + "step": 474 + }, + { + "epoch": 0.09975847947075502, + "grad_norm": 0.5708514451980591, + "learning_rate": 0.00019916142557651993, + "loss": 0.2385, + "step": 475 + }, + { + "epoch": 0.09996849732227239, + "grad_norm": 0.8140966892242432, + "learning_rate": 0.00019958071278825996, + "loss": 0.1724, + "step": 476 + }, + { + "epoch": 0.10017851517378977, + "grad_norm": 0.5185543298721313, + "learning_rate": 0.0002, + "loss": 0.192, + "step": 477 + }, + { + "epoch": 0.10038853302530715, + "grad_norm": 0.7630559802055359, + "learning_rate": 0.00019999999396812126, + "loss": 0.3107, + "step": 478 + }, + { + "epoch": 0.10059855087682452, + "grad_norm": 0.5256696939468384, + "learning_rate": 0.00019999997587248573, + "loss": 0.2057, + "step": 479 + }, + { + "epoch": 0.10080856872834192, + "grad_norm": 0.5820131301879883, + "learning_rate": 0.0001999999457130956, + "loss": 0.2444, + "step": 480 + }, + { + "epoch": 0.10101858657985929, + "grad_norm": 0.6161417365074158, + "learning_rate": 0.00019999990348995456, + "loss": 0.1882, + "step": 481 + }, + { + "epoch": 0.10122860443137667, + "grad_norm": 0.5549945831298828, + "learning_rate": 0.00019999984920306764, + "loss": 0.3101, + "step": 482 + }, + { + "epoch": 0.10143862228289405, + "grad_norm": 0.8708590269088745, + "learning_rate": 0.00019999978285244142, + "loss": 0.2377, + "step": 483 + }, + { + "epoch": 0.10164864013441142, + "grad_norm": 0.5110476016998291, + "learning_rate": 0.00019999970443808387, + "loss": 0.1476, + "step": 484 + }, + { + "epoch": 0.1018586579859288, + "grad_norm": 1.1280276775360107, + "learning_rate": 0.0001999996139600045, + "loss": 0.1595, + "step": 485 + }, + { + "epoch": 0.10206867583744618, + "grad_norm": 0.7876203656196594, + "learning_rate": 0.0001999995114182142, + "loss": 0.229, + "step": 486 + }, + { + "epoch": 0.10227869368896356, + "grad_norm": 0.7196666598320007, + "learning_rate": 0.00019999939681272536, + "loss": 0.1838, + "step": 487 + }, + { + "epoch": 0.10248871154048093, + "grad_norm": 0.6737300157546997, + "learning_rate": 0.00019999927014355175, + "loss": 0.1786, + "step": 488 + }, + { + "epoch": 0.10269872939199833, + "grad_norm": 0.7758048176765442, + "learning_rate": 0.0001999991314107087, + "loss": 0.1847, + "step": 489 + }, + { + "epoch": 0.1029087472435157, + "grad_norm": 0.8189213871955872, + "learning_rate": 0.00019999898061421294, + "loss": 0.2842, + "step": 490 + }, + { + "epoch": 0.10311876509503308, + "grad_norm": 0.5789510011672974, + "learning_rate": 0.00019999881775408263, + "loss": 0.2353, + "step": 491 + }, + { + "epoch": 0.10332878294655046, + "grad_norm": 0.808729350566864, + "learning_rate": 0.00019999864283033747, + "loss": 0.2481, + "step": 492 + }, + { + "epoch": 0.10353880079806783, + "grad_norm": 0.587478518486023, + "learning_rate": 0.00019999845584299855, + "loss": 0.1976, + "step": 493 + }, + { + "epoch": 0.10374881864958521, + "grad_norm": 0.7419194579124451, + "learning_rate": 0.00019999825679208839, + "loss": 0.2444, + "step": 494 + }, + { + "epoch": 0.10395883650110259, + "grad_norm": 0.6678702235221863, + "learning_rate": 0.000199998045677631, + "loss": 0.1752, + "step": 495 + }, + { + "epoch": 0.10416885435261997, + "grad_norm": 0.5477135181427002, + "learning_rate": 0.00019999782249965193, + "loss": 0.1176, + "step": 496 + }, + { + "epoch": 0.10437887220413736, + "grad_norm": 0.47613173723220825, + "learning_rate": 0.000199997587258178, + "loss": 0.1734, + "step": 497 + }, + { + "epoch": 0.10458889005565473, + "grad_norm": 0.8437466025352478, + "learning_rate": 0.0001999973399532377, + "loss": 0.2279, + "step": 498 + }, + { + "epoch": 0.10479890790717211, + "grad_norm": 0.7599924206733704, + "learning_rate": 0.00019999708058486074, + "loss": 0.209, + "step": 499 + }, + { + "epoch": 0.10500892575868949, + "grad_norm": 0.5578658580780029, + "learning_rate": 0.00019999680915307847, + "loss": 0.243, + "step": 500 + }, + { + "epoch": 0.10521894361020687, + "grad_norm": 0.5664511322975159, + "learning_rate": 0.00019999652565792368, + "loss": 0.2176, + "step": 501 + }, + { + "epoch": 0.10542896146172424, + "grad_norm": 0.5591540336608887, + "learning_rate": 0.0001999962300994305, + "loss": 0.3467, + "step": 502 + }, + { + "epoch": 0.10563897931324162, + "grad_norm": 0.5022396445274353, + "learning_rate": 0.0001999959224776346, + "loss": 0.2149, + "step": 503 + }, + { + "epoch": 0.105848997164759, + "grad_norm": 0.5846520662307739, + "learning_rate": 0.00019999560279257314, + "loss": 0.3388, + "step": 504 + }, + { + "epoch": 0.10605901501627639, + "grad_norm": 0.4137157201766968, + "learning_rate": 0.00019999527104428463, + "loss": 0.2223, + "step": 505 + }, + { + "epoch": 0.10626903286779377, + "grad_norm": 0.49332931637763977, + "learning_rate": 0.0001999949272328091, + "loss": 0.2679, + "step": 506 + }, + { + "epoch": 0.10647905071931114, + "grad_norm": 0.7095859050750732, + "learning_rate": 0.00019999457135818805, + "loss": 0.2681, + "step": 507 + }, + { + "epoch": 0.10668906857082852, + "grad_norm": 0.5563727021217346, + "learning_rate": 0.0001999942034204644, + "loss": 0.2695, + "step": 508 + }, + { + "epoch": 0.1068990864223459, + "grad_norm": 0.5464118719100952, + "learning_rate": 0.00019999382341968252, + "loss": 0.4308, + "step": 509 + }, + { + "epoch": 0.10710910427386328, + "grad_norm": 0.7822732329368591, + "learning_rate": 0.00019999343135588827, + "loss": 0.2458, + "step": 510 + }, + { + "epoch": 0.10731912212538065, + "grad_norm": 0.6268991231918335, + "learning_rate": 0.00019999302722912895, + "loss": 0.2877, + "step": 511 + }, + { + "epoch": 0.10752913997689803, + "grad_norm": 0.7860679626464844, + "learning_rate": 0.0001999926110394533, + "loss": 0.275, + "step": 512 + }, + { + "epoch": 0.10773915782841542, + "grad_norm": 0.5817549228668213, + "learning_rate": 0.00019999218278691158, + "loss": 0.2005, + "step": 513 + }, + { + "epoch": 0.1079491756799328, + "grad_norm": 0.8145076036453247, + "learning_rate": 0.00019999174247155535, + "loss": 0.2032, + "step": 514 + }, + { + "epoch": 0.10815919353145018, + "grad_norm": 0.7561895847320557, + "learning_rate": 0.0001999912900934378, + "loss": 0.2452, + "step": 515 + }, + { + "epoch": 0.10836921138296755, + "grad_norm": 0.8003164529800415, + "learning_rate": 0.0001999908256526135, + "loss": 0.2318, + "step": 516 + }, + { + "epoch": 0.10857922923448493, + "grad_norm": 0.6318978667259216, + "learning_rate": 0.0001999903491491385, + "loss": 0.2534, + "step": 517 + }, + { + "epoch": 0.10878924708600231, + "grad_norm": 0.5220886468887329, + "learning_rate": 0.00019998986058307022, + "loss": 0.2011, + "step": 518 + }, + { + "epoch": 0.10899926493751969, + "grad_norm": 0.5928252935409546, + "learning_rate": 0.00019998935995446763, + "loss": 0.2175, + "step": 519 + }, + { + "epoch": 0.10920928278903706, + "grad_norm": 0.7763411998748779, + "learning_rate": 0.00019998884726339116, + "loss": 0.2852, + "step": 520 + }, + { + "epoch": 0.10941930064055445, + "grad_norm": 0.7260156273841858, + "learning_rate": 0.00019998832250990264, + "loss": 0.2313, + "step": 521 + }, + { + "epoch": 0.10962931849207183, + "grad_norm": 0.7486017942428589, + "learning_rate": 0.0001999877856940653, + "loss": 0.2789, + "step": 522 + }, + { + "epoch": 0.10983933634358921, + "grad_norm": 0.6006895303726196, + "learning_rate": 0.00019998723681594402, + "loss": 0.1674, + "step": 523 + }, + { + "epoch": 0.11004935419510659, + "grad_norm": 0.7286220192909241, + "learning_rate": 0.00019998667587560495, + "loss": 0.2521, + "step": 524 + }, + { + "epoch": 0.11025937204662396, + "grad_norm": 0.7167160511016846, + "learning_rate": 0.00019998610287311574, + "loss": 0.2308, + "step": 525 + }, + { + "epoch": 0.11046938989814134, + "grad_norm": 0.6270986795425415, + "learning_rate": 0.00019998551780854557, + "loss": 0.2048, + "step": 526 + }, + { + "epoch": 0.11067940774965872, + "grad_norm": 0.7366195917129517, + "learning_rate": 0.000199984920681965, + "loss": 0.2239, + "step": 527 + }, + { + "epoch": 0.1108894256011761, + "grad_norm": 0.5709097981452942, + "learning_rate": 0.00019998431149344606, + "loss": 0.2074, + "step": 528 + }, + { + "epoch": 0.11109944345269347, + "grad_norm": 0.6718131899833679, + "learning_rate": 0.00019998369024306224, + "loss": 0.2902, + "step": 529 + }, + { + "epoch": 0.11130946130421086, + "grad_norm": 0.6741777658462524, + "learning_rate": 0.00019998305693088848, + "loss": 0.2638, + "step": 530 + }, + { + "epoch": 0.11151947915572824, + "grad_norm": 0.5218673944473267, + "learning_rate": 0.0001999824115570012, + "loss": 0.201, + "step": 531 + }, + { + "epoch": 0.11172949700724562, + "grad_norm": 0.6867272257804871, + "learning_rate": 0.00019998175412147824, + "loss": 0.2864, + "step": 532 + }, + { + "epoch": 0.111939514858763, + "grad_norm": 0.6319578289985657, + "learning_rate": 0.00019998108462439894, + "loss": 0.2277, + "step": 533 + }, + { + "epoch": 0.11214953271028037, + "grad_norm": 0.5601973533630371, + "learning_rate": 0.000199980403065844, + "loss": 0.2678, + "step": 534 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 0.5189068913459778, + "learning_rate": 0.00019997970944589572, + "loss": 0.2036, + "step": 535 + }, + { + "epoch": 0.11256956841331513, + "grad_norm": 0.7217200994491577, + "learning_rate": 0.00019997900376463778, + "loss": 0.2299, + "step": 536 + }, + { + "epoch": 0.1127795862648325, + "grad_norm": 0.6617181301116943, + "learning_rate": 0.0001999782860221552, + "loss": 0.2259, + "step": 537 + }, + { + "epoch": 0.1129896041163499, + "grad_norm": 0.6987117528915405, + "learning_rate": 0.0001999775562185347, + "loss": 0.1882, + "step": 538 + }, + { + "epoch": 0.11319962196786727, + "grad_norm": 0.4491863548755646, + "learning_rate": 0.00019997681435386422, + "loss": 0.1937, + "step": 539 + }, + { + "epoch": 0.11340963981938465, + "grad_norm": 0.5842171311378479, + "learning_rate": 0.00019997606042823334, + "loss": 0.2808, + "step": 540 + }, + { + "epoch": 0.11361965767090203, + "grad_norm": 0.7743870615959167, + "learning_rate": 0.00019997529444173293, + "loss": 0.2329, + "step": 541 + }, + { + "epoch": 0.1138296755224194, + "grad_norm": 0.5326593518257141, + "learning_rate": 0.00019997451639445547, + "loss": 0.188, + "step": 542 + }, + { + "epoch": 0.11403969337393678, + "grad_norm": 0.5364864468574524, + "learning_rate": 0.00019997372628649478, + "loss": 0.2294, + "step": 543 + }, + { + "epoch": 0.11424971122545416, + "grad_norm": 0.5609897375106812, + "learning_rate": 0.00019997292411794618, + "loss": 0.2108, + "step": 544 + }, + { + "epoch": 0.11445972907697154, + "grad_norm": 0.5446069836616516, + "learning_rate": 0.00019997210988890646, + "loss": 0.2577, + "step": 545 + }, + { + "epoch": 0.11466974692848893, + "grad_norm": 0.6916573643684387, + "learning_rate": 0.0001999712835994738, + "loss": 0.1976, + "step": 546 + }, + { + "epoch": 0.1148797647800063, + "grad_norm": 0.7029738426208496, + "learning_rate": 0.00019997044524974799, + "loss": 0.2076, + "step": 547 + }, + { + "epoch": 0.11508978263152368, + "grad_norm": 0.8003794550895691, + "learning_rate": 0.00019996959483983004, + "loss": 0.3284, + "step": 548 + }, + { + "epoch": 0.11529980048304106, + "grad_norm": 0.6394858360290527, + "learning_rate": 0.00019996873236982258, + "loss": 0.2397, + "step": 549 + }, + { + "epoch": 0.11550981833455844, + "grad_norm": 0.7164601683616638, + "learning_rate": 0.00019996785783982972, + "loss": 0.2097, + "step": 550 + }, + { + "epoch": 0.11571983618607581, + "grad_norm": 0.5346024036407471, + "learning_rate": 0.0001999669712499569, + "loss": 0.2559, + "step": 551 + }, + { + "epoch": 0.11592985403759319, + "grad_norm": 0.77498859167099, + "learning_rate": 0.00019996607260031106, + "loss": 0.3734, + "step": 552 + }, + { + "epoch": 0.11613987188911057, + "grad_norm": 0.6465743780136108, + "learning_rate": 0.00019996516189100066, + "loss": 0.2642, + "step": 553 + }, + { + "epoch": 0.11634988974062796, + "grad_norm": 0.4624630808830261, + "learning_rate": 0.00019996423912213554, + "loss": 0.1649, + "step": 554 + }, + { + "epoch": 0.11655990759214534, + "grad_norm": 0.6889922618865967, + "learning_rate": 0.00019996330429382703, + "loss": 0.2902, + "step": 555 + }, + { + "epoch": 0.11676992544366271, + "grad_norm": 0.7413411140441895, + "learning_rate": 0.0001999623574061879, + "loss": 0.2428, + "step": 556 + }, + { + "epoch": 0.11697994329518009, + "grad_norm": 0.9009401798248291, + "learning_rate": 0.0001999613984593324, + "loss": 0.3112, + "step": 557 + }, + { + "epoch": 0.11718996114669747, + "grad_norm": 0.6533844470977783, + "learning_rate": 0.00019996042745337617, + "loss": 0.2118, + "step": 558 + }, + { + "epoch": 0.11739997899821485, + "grad_norm": 0.6814008355140686, + "learning_rate": 0.00019995944438843636, + "loss": 0.2755, + "step": 559 + }, + { + "epoch": 0.11760999684973222, + "grad_norm": 0.6098254323005676, + "learning_rate": 0.0001999584492646316, + "loss": 0.2385, + "step": 560 + }, + { + "epoch": 0.1178200147012496, + "grad_norm": 0.7361414432525635, + "learning_rate": 0.00019995744208208194, + "loss": 0.326, + "step": 561 + }, + { + "epoch": 0.11803003255276699, + "grad_norm": 0.6809893250465393, + "learning_rate": 0.00019995642284090885, + "loss": 0.2727, + "step": 562 + }, + { + "epoch": 0.11824005040428437, + "grad_norm": 0.6401339769363403, + "learning_rate": 0.00019995539154123529, + "loss": 0.2143, + "step": 563 + }, + { + "epoch": 0.11845006825580175, + "grad_norm": 0.715313732624054, + "learning_rate": 0.00019995434818318567, + "loss": 0.2269, + "step": 564 + }, + { + "epoch": 0.11866008610731912, + "grad_norm": 0.6071058511734009, + "learning_rate": 0.00019995329276688593, + "loss": 0.4811, + "step": 565 + }, + { + "epoch": 0.1188701039588365, + "grad_norm": 0.882318377494812, + "learning_rate": 0.0001999522252924633, + "loss": 0.2607, + "step": 566 + }, + { + "epoch": 0.11908012181035388, + "grad_norm": 0.4758372902870178, + "learning_rate": 0.0001999511457600466, + "loss": 0.2013, + "step": 567 + }, + { + "epoch": 0.11929013966187126, + "grad_norm": 0.6482694149017334, + "learning_rate": 0.00019995005416976604, + "loss": 0.2567, + "step": 568 + }, + { + "epoch": 0.11950015751338863, + "grad_norm": 0.6689179539680481, + "learning_rate": 0.00019994895052175338, + "loss": 0.2498, + "step": 569 + }, + { + "epoch": 0.11971017536490601, + "grad_norm": 0.5817541480064392, + "learning_rate": 0.00019994783481614166, + "loss": 0.2013, + "step": 570 + }, + { + "epoch": 0.1199201932164234, + "grad_norm": 0.4717533588409424, + "learning_rate": 0.00019994670705306554, + "loss": 0.2647, + "step": 571 + }, + { + "epoch": 0.12013021106794078, + "grad_norm": 0.574079692363739, + "learning_rate": 0.00019994556723266103, + "loss": 0.1704, + "step": 572 + }, + { + "epoch": 0.12034022891945816, + "grad_norm": 0.759425938129425, + "learning_rate": 0.00019994441535506569, + "loss": 0.208, + "step": 573 + }, + { + "epoch": 0.12055024677097553, + "grad_norm": 0.5335227847099304, + "learning_rate": 0.0001999432514204184, + "loss": 0.2018, + "step": 574 + }, + { + "epoch": 0.12076026462249291, + "grad_norm": 0.5595372915267944, + "learning_rate": 0.00019994207542885963, + "loss": 0.2667, + "step": 575 + }, + { + "epoch": 0.12097028247401029, + "grad_norm": 0.6673279404640198, + "learning_rate": 0.00019994088738053124, + "loss": 0.3175, + "step": 576 + }, + { + "epoch": 0.12118030032552767, + "grad_norm": 0.5622591376304626, + "learning_rate": 0.0001999396872755766, + "loss": 0.1543, + "step": 577 + }, + { + "epoch": 0.12139031817704504, + "grad_norm": 0.5784288048744202, + "learning_rate": 0.0001999384751141404, + "loss": 0.2624, + "step": 578 + }, + { + "epoch": 0.12160033602856243, + "grad_norm": 0.6946631669998169, + "learning_rate": 0.00019993725089636891, + "loss": 0.2469, + "step": 579 + }, + { + "epoch": 0.12181035388007981, + "grad_norm": 0.6951069235801697, + "learning_rate": 0.00019993601462240984, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 0.12202037173159719, + "grad_norm": 0.5134934186935425, + "learning_rate": 0.0001999347662924123, + "loss": 0.1432, + "step": 581 + }, + { + "epoch": 0.12223038958311457, + "grad_norm": 0.5719296932220459, + "learning_rate": 0.00019993350590652691, + "loss": 0.2388, + "step": 582 + }, + { + "epoch": 0.12244040743463194, + "grad_norm": 0.7625638246536255, + "learning_rate": 0.0001999322334649057, + "loss": 0.2507, + "step": 583 + }, + { + "epoch": 0.12265042528614932, + "grad_norm": 0.6974209547042847, + "learning_rate": 0.00019993094896770218, + "loss": 0.2431, + "step": 584 + }, + { + "epoch": 0.1228604431376667, + "grad_norm": 0.7072513699531555, + "learning_rate": 0.0001999296524150713, + "loss": 0.1745, + "step": 585 + }, + { + "epoch": 0.12307046098918407, + "grad_norm": 0.7435344457626343, + "learning_rate": 0.00019992834380716946, + "loss": 0.216, + "step": 586 + }, + { + "epoch": 0.12328047884070147, + "grad_norm": 0.5491403937339783, + "learning_rate": 0.00019992702314415461, + "loss": 0.1853, + "step": 587 + }, + { + "epoch": 0.12349049669221884, + "grad_norm": 0.5487938523292542, + "learning_rate": 0.00019992569042618597, + "loss": 0.2361, + "step": 588 + }, + { + "epoch": 0.12370051454373622, + "grad_norm": 0.4346216320991516, + "learning_rate": 0.00019992434565342437, + "loss": 0.1812, + "step": 589 + }, + { + "epoch": 0.1239105323952536, + "grad_norm": 0.5448020696640015, + "learning_rate": 0.00019992298882603202, + "loss": 0.2017, + "step": 590 + }, + { + "epoch": 0.12412055024677097, + "grad_norm": 0.6867210268974304, + "learning_rate": 0.0001999216199441726, + "loss": 0.1788, + "step": 591 + }, + { + "epoch": 0.12433056809828835, + "grad_norm": 0.6821328401565552, + "learning_rate": 0.00019992023900801127, + "loss": 0.2159, + "step": 592 + }, + { + "epoch": 0.12454058594980573, + "grad_norm": 0.6648369431495667, + "learning_rate": 0.0001999188460177146, + "loss": 0.223, + "step": 593 + }, + { + "epoch": 0.1247506038013231, + "grad_norm": 0.6275060772895813, + "learning_rate": 0.00019991744097345068, + "loss": 0.2174, + "step": 594 + }, + { + "epoch": 0.1249606216528405, + "grad_norm": 0.43622860312461853, + "learning_rate": 0.00019991602387538896, + "loss": 0.1709, + "step": 595 + }, + { + "epoch": 0.12517063950435786, + "grad_norm": 0.41494739055633545, + "learning_rate": 0.00019991459472370042, + "loss": 0.1615, + "step": 596 + }, + { + "epoch": 0.12538065735587525, + "grad_norm": 0.4159907400608063, + "learning_rate": 0.00019991315351855748, + "loss": 0.1457, + "step": 597 + }, + { + "epoch": 0.12559067520739262, + "grad_norm": 0.8123224377632141, + "learning_rate": 0.00019991170026013397, + "loss": 0.202, + "step": 598 + }, + { + "epoch": 0.12580069305891, + "grad_norm": 0.9315401911735535, + "learning_rate": 0.00019991023494860522, + "loss": 0.2496, + "step": 599 + }, + { + "epoch": 0.1260107109104274, + "grad_norm": 0.7999815344810486, + "learning_rate": 0.00019990875758414803, + "loss": 0.2782, + "step": 600 + }, + { + "epoch": 0.12622072876194476, + "grad_norm": 0.5633922815322876, + "learning_rate": 0.0001999072681669406, + "loss": 0.2276, + "step": 601 + }, + { + "epoch": 0.12643074661346215, + "grad_norm": 0.6719483733177185, + "learning_rate": 0.00019990576669716258, + "loss": 0.3169, + "step": 602 + }, + { + "epoch": 0.12664076446497952, + "grad_norm": 0.7311053276062012, + "learning_rate": 0.0001999042531749952, + "loss": 0.2723, + "step": 603 + }, + { + "epoch": 0.1268507823164969, + "grad_norm": 0.5853881239891052, + "learning_rate": 0.00019990272760062093, + "loss": 0.2869, + "step": 604 + }, + { + "epoch": 0.12706080016801427, + "grad_norm": 0.7300302982330322, + "learning_rate": 0.0001999011899742239, + "loss": 0.2405, + "step": 605 + }, + { + "epoch": 0.12727081801953166, + "grad_norm": 0.704954206943512, + "learning_rate": 0.00019989964029598953, + "loss": 0.3195, + "step": 606 + }, + { + "epoch": 0.12748083587104905, + "grad_norm": 0.6305354833602905, + "learning_rate": 0.00019989807856610482, + "loss": 0.2442, + "step": 607 + }, + { + "epoch": 0.12769085372256642, + "grad_norm": 0.5027151107788086, + "learning_rate": 0.0001998965047847582, + "loss": 0.3006, + "step": 608 + }, + { + "epoch": 0.1279008715740838, + "grad_norm": 0.6237658262252808, + "learning_rate": 0.00019989491895213948, + "loss": 0.2019, + "step": 609 + }, + { + "epoch": 0.12811088942560117, + "grad_norm": 0.6959155797958374, + "learning_rate": 0.00019989332106844, + "loss": 0.2142, + "step": 610 + }, + { + "epoch": 0.12832090727711856, + "grad_norm": 0.7905144095420837, + "learning_rate": 0.0001998917111338525, + "loss": 0.2599, + "step": 611 + }, + { + "epoch": 0.12853092512863593, + "grad_norm": 0.7247504591941833, + "learning_rate": 0.00019989008914857116, + "loss": 0.2679, + "step": 612 + }, + { + "epoch": 0.12874094298015332, + "grad_norm": 0.5282559990882874, + "learning_rate": 0.0001998884551127917, + "loss": 0.2125, + "step": 613 + }, + { + "epoch": 0.12895096083167068, + "grad_norm": 0.6418580412864685, + "learning_rate": 0.0001998868090267113, + "loss": 0.2185, + "step": 614 + }, + { + "epoch": 0.12916097868318807, + "grad_norm": 0.48245900869369507, + "learning_rate": 0.00019988515089052844, + "loss": 0.2175, + "step": 615 + }, + { + "epoch": 0.12937099653470546, + "grad_norm": 0.4887724220752716, + "learning_rate": 0.00019988348070444322, + "loss": 0.1777, + "step": 616 + }, + { + "epoch": 0.12958101438622283, + "grad_norm": 0.5296192169189453, + "learning_rate": 0.0001998817984686571, + "loss": 0.2344, + "step": 617 + }, + { + "epoch": 0.12979103223774022, + "grad_norm": 0.6658729314804077, + "learning_rate": 0.00019988010418337305, + "loss": 0.2322, + "step": 618 + }, + { + "epoch": 0.13000105008925758, + "grad_norm": 0.5744292736053467, + "learning_rate": 0.0001998783978487954, + "loss": 0.2156, + "step": 619 + }, + { + "epoch": 0.13021106794077497, + "grad_norm": 0.5000370144844055, + "learning_rate": 0.00019987667946513006, + "loss": 0.2319, + "step": 620 + }, + { + "epoch": 0.13042108579229234, + "grad_norm": 0.8539411425590515, + "learning_rate": 0.00019987494903258432, + "loss": 0.3729, + "step": 621 + }, + { + "epoch": 0.13063110364380973, + "grad_norm": 0.6094825267791748, + "learning_rate": 0.00019987320655136693, + "loss": 0.2171, + "step": 622 + }, + { + "epoch": 0.1308411214953271, + "grad_norm": 0.6408823728561401, + "learning_rate": 0.00019987145202168805, + "loss": 0.2658, + "step": 623 + }, + { + "epoch": 0.13105113934684448, + "grad_norm": 0.5738769769668579, + "learning_rate": 0.0001998696854437594, + "loss": 0.2127, + "step": 624 + }, + { + "epoch": 0.13126115719836187, + "grad_norm": 0.6330286860466003, + "learning_rate": 0.00019986790681779412, + "loss": 0.1503, + "step": 625 + }, + { + "epoch": 0.13147117504987924, + "grad_norm": 0.8125373125076294, + "learning_rate": 0.0001998661161440067, + "loss": 0.2741, + "step": 626 + }, + { + "epoch": 0.13168119290139663, + "grad_norm": 0.710121750831604, + "learning_rate": 0.00019986431342261323, + "loss": 0.2672, + "step": 627 + }, + { + "epoch": 0.131891210752914, + "grad_norm": 0.8024762868881226, + "learning_rate": 0.00019986249865383115, + "loss": 0.2818, + "step": 628 + }, + { + "epoch": 0.13210122860443138, + "grad_norm": 1.0455816984176636, + "learning_rate": 0.0001998606718378794, + "loss": 0.3204, + "step": 629 + }, + { + "epoch": 0.13231124645594874, + "grad_norm": 0.7923910617828369, + "learning_rate": 0.00019985883297497835, + "loss": 0.213, + "step": 630 + }, + { + "epoch": 0.13252126430746614, + "grad_norm": 0.7458345890045166, + "learning_rate": 0.00019985698206534985, + "loss": 0.2066, + "step": 631 + }, + { + "epoch": 0.13273128215898353, + "grad_norm": 0.6166645884513855, + "learning_rate": 0.0001998551191092172, + "loss": 0.2239, + "step": 632 + }, + { + "epoch": 0.1329413000105009, + "grad_norm": 0.7050312161445618, + "learning_rate": 0.00019985324410680514, + "loss": 0.2692, + "step": 633 + }, + { + "epoch": 0.13315131786201828, + "grad_norm": 0.6465736627578735, + "learning_rate": 0.00019985135705833984, + "loss": 0.235, + "step": 634 + }, + { + "epoch": 0.13336133571353564, + "grad_norm": 0.6108490228652954, + "learning_rate": 0.00019984945796404894, + "loss": 0.2472, + "step": 635 + }, + { + "epoch": 0.13357135356505304, + "grad_norm": 0.725173830986023, + "learning_rate": 0.00019984754682416157, + "loss": 0.2521, + "step": 636 + }, + { + "epoch": 0.1337813714165704, + "grad_norm": 0.5391446352005005, + "learning_rate": 0.00019984562363890832, + "loss": 0.2151, + "step": 637 + }, + { + "epoch": 0.1339913892680878, + "grad_norm": 0.44177114963531494, + "learning_rate": 0.00019984368840852114, + "loss": 0.179, + "step": 638 + }, + { + "epoch": 0.13420140711960515, + "grad_norm": 0.48038744926452637, + "learning_rate": 0.00019984174113323353, + "loss": 0.2296, + "step": 639 + }, + { + "epoch": 0.13441142497112255, + "grad_norm": 0.5720350742340088, + "learning_rate": 0.00019983978181328037, + "loss": 0.1843, + "step": 640 + }, + { + "epoch": 0.13462144282263994, + "grad_norm": 0.4996393322944641, + "learning_rate": 0.00019983781044889803, + "loss": 0.225, + "step": 641 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 0.5970807671546936, + "learning_rate": 0.00019983582704032434, + "loss": 0.179, + "step": 642 + }, + { + "epoch": 0.1350414785256747, + "grad_norm": 0.858808159828186, + "learning_rate": 0.0001998338315877986, + "loss": 0.246, + "step": 643 + }, + { + "epoch": 0.13525149637719205, + "grad_norm": 0.6708926558494568, + "learning_rate": 0.0001998318240915615, + "loss": 0.197, + "step": 644 + }, + { + "epoch": 0.13546151422870945, + "grad_norm": 0.8443548083305359, + "learning_rate": 0.00019982980455185526, + "loss": 0.1889, + "step": 645 + }, + { + "epoch": 0.1356715320802268, + "grad_norm": 0.6451512575149536, + "learning_rate": 0.00019982777296892346, + "loss": 0.2103, + "step": 646 + }, + { + "epoch": 0.1358815499317442, + "grad_norm": 0.7601468563079834, + "learning_rate": 0.00019982572934301122, + "loss": 0.2338, + "step": 647 + }, + { + "epoch": 0.1360915677832616, + "grad_norm": 0.5944762229919434, + "learning_rate": 0.00019982367367436506, + "loss": 0.1814, + "step": 648 + }, + { + "epoch": 0.13630158563477895, + "grad_norm": 0.7542382478713989, + "learning_rate": 0.00019982160596323297, + "loss": 0.2062, + "step": 649 + }, + { + "epoch": 0.13651160348629635, + "grad_norm": 0.560296893119812, + "learning_rate": 0.00019981952620986442, + "loss": 0.199, + "step": 650 + }, + { + "epoch": 0.1367216213378137, + "grad_norm": 0.5254395604133606, + "learning_rate": 0.0001998174344145103, + "loss": 0.2943, + "step": 651 + }, + { + "epoch": 0.1369316391893311, + "grad_norm": 0.6042603254318237, + "learning_rate": 0.00019981533057742294, + "loss": 0.2355, + "step": 652 + }, + { + "epoch": 0.13714165704084846, + "grad_norm": 0.6384417414665222, + "learning_rate": 0.00019981321469885615, + "loss": 0.202, + "step": 653 + }, + { + "epoch": 0.13735167489236585, + "grad_norm": 0.7300348877906799, + "learning_rate": 0.0001998110867790652, + "loss": 0.2422, + "step": 654 + }, + { + "epoch": 0.13756169274388322, + "grad_norm": 0.5238686800003052, + "learning_rate": 0.00019980894681830678, + "loss": 0.2491, + "step": 655 + }, + { + "epoch": 0.1377717105954006, + "grad_norm": 0.7352842092514038, + "learning_rate": 0.00019980679481683904, + "loss": 0.3193, + "step": 656 + }, + { + "epoch": 0.137981728446918, + "grad_norm": 0.6651904582977295, + "learning_rate": 0.0001998046307749216, + "loss": 0.2618, + "step": 657 + }, + { + "epoch": 0.13819174629843536, + "grad_norm": 0.6976970434188843, + "learning_rate": 0.00019980245469281553, + "loss": 0.2622, + "step": 658 + }, + { + "epoch": 0.13840176414995276, + "grad_norm": 0.6078370809555054, + "learning_rate": 0.00019980026657078336, + "loss": 0.2532, + "step": 659 + }, + { + "epoch": 0.13861178200147012, + "grad_norm": 0.7155233025550842, + "learning_rate": 0.00019979806640908906, + "loss": 0.3283, + "step": 660 + }, + { + "epoch": 0.1388217998529875, + "grad_norm": 0.519636869430542, + "learning_rate": 0.00019979585420799802, + "loss": 0.187, + "step": 661 + }, + { + "epoch": 0.13903181770450487, + "grad_norm": 0.7007790803909302, + "learning_rate": 0.00019979362996777714, + "loss": 0.2554, + "step": 662 + }, + { + "epoch": 0.13924183555602226, + "grad_norm": 0.6281614303588867, + "learning_rate": 0.00019979139368869473, + "loss": 0.2153, + "step": 663 + }, + { + "epoch": 0.13945185340753963, + "grad_norm": 0.5729889869689941, + "learning_rate": 0.00019978914537102055, + "loss": 0.2432, + "step": 664 + }, + { + "epoch": 0.13966187125905702, + "grad_norm": 0.4995453357696533, + "learning_rate": 0.00019978688501502592, + "loss": 0.1931, + "step": 665 + }, + { + "epoch": 0.1398718891105744, + "grad_norm": 0.48151615262031555, + "learning_rate": 0.00019978461262098343, + "loss": 0.1664, + "step": 666 + }, + { + "epoch": 0.14008190696209177, + "grad_norm": 0.6951011419296265, + "learning_rate": 0.00019978232818916727, + "loss": 0.2229, + "step": 667 + }, + { + "epoch": 0.14029192481360916, + "grad_norm": 0.5914542078971863, + "learning_rate": 0.000199780031719853, + "loss": 0.2126, + "step": 668 + }, + { + "epoch": 0.14050194266512653, + "grad_norm": 0.7551674246788025, + "learning_rate": 0.00019977772321331765, + "loss": 0.2806, + "step": 669 + }, + { + "epoch": 0.14071196051664392, + "grad_norm": 0.7960730195045471, + "learning_rate": 0.00019977540266983976, + "loss": 0.2653, + "step": 670 + }, + { + "epoch": 0.14092197836816128, + "grad_norm": 0.5545317530632019, + "learning_rate": 0.00019977307008969922, + "loss": 0.2141, + "step": 671 + }, + { + "epoch": 0.14113199621967867, + "grad_norm": 0.7467978596687317, + "learning_rate": 0.0001997707254731775, + "loss": 0.1961, + "step": 672 + }, + { + "epoch": 0.14134201407119606, + "grad_norm": 0.6775459051132202, + "learning_rate": 0.00019976836882055736, + "loss": 0.2304, + "step": 673 + }, + { + "epoch": 0.14155203192271343, + "grad_norm": 0.793547511100769, + "learning_rate": 0.00019976600013212317, + "loss": 0.2266, + "step": 674 + }, + { + "epoch": 0.14176204977423082, + "grad_norm": 0.6920728087425232, + "learning_rate": 0.00019976361940816063, + "loss": 0.3469, + "step": 675 + }, + { + "epoch": 0.14197206762574818, + "grad_norm": 0.840145468711853, + "learning_rate": 0.000199761226648957, + "loss": 0.235, + "step": 676 + }, + { + "epoch": 0.14218208547726557, + "grad_norm": 0.8047716617584229, + "learning_rate": 0.0001997588218548009, + "loss": 0.3034, + "step": 677 + }, + { + "epoch": 0.14239210332878294, + "grad_norm": 0.649042010307312, + "learning_rate": 0.00019975640502598244, + "loss": 0.2822, + "step": 678 + }, + { + "epoch": 0.14260212118030033, + "grad_norm": 0.6780881881713867, + "learning_rate": 0.0001997539761627932, + "loss": 0.1593, + "step": 679 + }, + { + "epoch": 0.1428121390318177, + "grad_norm": 0.6812571883201599, + "learning_rate": 0.00019975153526552615, + "loss": 0.1898, + "step": 680 + }, + { + "epoch": 0.14302215688333508, + "grad_norm": 0.5687631368637085, + "learning_rate": 0.0001997490823344758, + "loss": 0.2193, + "step": 681 + }, + { + "epoch": 0.14323217473485247, + "grad_norm": 0.8981772065162659, + "learning_rate": 0.00019974661736993804, + "loss": 0.2785, + "step": 682 + }, + { + "epoch": 0.14344219258636984, + "grad_norm": 0.6966889500617981, + "learning_rate": 0.00019974414037221027, + "loss": 0.2678, + "step": 683 + }, + { + "epoch": 0.14365221043788723, + "grad_norm": 0.5631129145622253, + "learning_rate": 0.00019974165134159126, + "loss": 0.2836, + "step": 684 + }, + { + "epoch": 0.1438622282894046, + "grad_norm": 0.7686763405799866, + "learning_rate": 0.00019973915027838134, + "loss": 0.2372, + "step": 685 + }, + { + "epoch": 0.14407224614092198, + "grad_norm": 0.881515383720398, + "learning_rate": 0.00019973663718288217, + "loss": 0.2901, + "step": 686 + }, + { + "epoch": 0.14428226399243935, + "grad_norm": 0.6973896026611328, + "learning_rate": 0.00019973411205539694, + "loss": 0.2577, + "step": 687 + }, + { + "epoch": 0.14449228184395674, + "grad_norm": 0.3398377299308777, + "learning_rate": 0.0001997315748962303, + "loss": 0.1305, + "step": 688 + }, + { + "epoch": 0.1447022996954741, + "grad_norm": 0.6775567531585693, + "learning_rate": 0.0001997290257056883, + "loss": 0.3746, + "step": 689 + }, + { + "epoch": 0.1449123175469915, + "grad_norm": 0.3776891827583313, + "learning_rate": 0.0001997264644840785, + "loss": 0.1731, + "step": 690 + }, + { + "epoch": 0.14512233539850888, + "grad_norm": 0.6515337824821472, + "learning_rate": 0.00019972389123170986, + "loss": 0.2596, + "step": 691 + }, + { + "epoch": 0.14533235325002625, + "grad_norm": 0.7165318131446838, + "learning_rate": 0.00019972130594889286, + "loss": 0.2673, + "step": 692 + }, + { + "epoch": 0.14554237110154364, + "grad_norm": 0.5702444314956665, + "learning_rate": 0.00019971870863593925, + "loss": 0.1928, + "step": 693 + }, + { + "epoch": 0.145752388953061, + "grad_norm": 0.3542981743812561, + "learning_rate": 0.0001997160992931625, + "loss": 0.1277, + "step": 694 + }, + { + "epoch": 0.1459624068045784, + "grad_norm": 0.6520780920982361, + "learning_rate": 0.00019971347792087732, + "loss": 0.2623, + "step": 695 + }, + { + "epoch": 0.14617242465609576, + "grad_norm": 0.4505969285964966, + "learning_rate": 0.00019971084451939997, + "loss": 0.2026, + "step": 696 + }, + { + "epoch": 0.14638244250761315, + "grad_norm": 0.623036801815033, + "learning_rate": 0.00019970819908904814, + "loss": 0.2371, + "step": 697 + }, + { + "epoch": 0.14659246035913054, + "grad_norm": 0.60871422290802, + "learning_rate": 0.00019970554163014097, + "loss": 0.3128, + "step": 698 + }, + { + "epoch": 0.1468024782106479, + "grad_norm": 0.6321155428886414, + "learning_rate": 0.00019970287214299902, + "loss": 0.2183, + "step": 699 + }, + { + "epoch": 0.1470124960621653, + "grad_norm": 0.7513316869735718, + "learning_rate": 0.0001997001906279444, + "loss": 0.2753, + "step": 700 + }, + { + "epoch": 0.14722251391368266, + "grad_norm": 0.4192676842212677, + "learning_rate": 0.0001996974970853005, + "loss": 0.3071, + "step": 701 + }, + { + "epoch": 0.14743253176520005, + "grad_norm": 0.5773706436157227, + "learning_rate": 0.00019969479151539236, + "loss": 0.2883, + "step": 702 + }, + { + "epoch": 0.1476425496167174, + "grad_norm": 0.4587963819503784, + "learning_rate": 0.00019969207391854632, + "loss": 0.2997, + "step": 703 + }, + { + "epoch": 0.1478525674682348, + "grad_norm": 0.6077782511711121, + "learning_rate": 0.00019968934429509023, + "loss": 0.182, + "step": 704 + }, + { + "epoch": 0.14806258531975217, + "grad_norm": 0.6285839676856995, + "learning_rate": 0.0001996866026453534, + "loss": 0.3573, + "step": 705 + }, + { + "epoch": 0.14827260317126956, + "grad_norm": 0.7416669726371765, + "learning_rate": 0.00019968384896966657, + "loss": 0.2424, + "step": 706 + }, + { + "epoch": 0.14848262102278695, + "grad_norm": 0.5475688576698303, + "learning_rate": 0.0001996810832683619, + "loss": 0.1766, + "step": 707 + }, + { + "epoch": 0.1486926388743043, + "grad_norm": 0.5601086020469666, + "learning_rate": 0.00019967830554177312, + "loss": 0.2725, + "step": 708 + }, + { + "epoch": 0.1489026567258217, + "grad_norm": 0.7686034440994263, + "learning_rate": 0.00019967551579023524, + "loss": 0.3008, + "step": 709 + }, + { + "epoch": 0.14911267457733907, + "grad_norm": 0.8172418475151062, + "learning_rate": 0.00019967271401408486, + "loss": 0.3042, + "step": 710 + }, + { + "epoch": 0.14932269242885646, + "grad_norm": 0.8726032972335815, + "learning_rate": 0.00019966990021366, + "loss": 0.224, + "step": 711 + }, + { + "epoch": 0.14953271028037382, + "grad_norm": 0.6053635478019714, + "learning_rate": 0.00019966707438930003, + "loss": 0.2325, + "step": 712 + }, + { + "epoch": 0.1497427281318912, + "grad_norm": 0.7157438397407532, + "learning_rate": 0.00019966423654134592, + "loss": 0.2656, + "step": 713 + }, + { + "epoch": 0.1499527459834086, + "grad_norm": 0.6943267583847046, + "learning_rate": 0.00019966138667014, + "loss": 0.2625, + "step": 714 + }, + { + "epoch": 0.15016276383492597, + "grad_norm": 0.7070578336715698, + "learning_rate": 0.00019965852477602604, + "loss": 0.2795, + "step": 715 + }, + { + "epoch": 0.15037278168644336, + "grad_norm": 0.654684841632843, + "learning_rate": 0.00019965565085934935, + "loss": 0.2168, + "step": 716 + }, + { + "epoch": 0.15058279953796072, + "grad_norm": 0.5972804427146912, + "learning_rate": 0.00019965276492045662, + "loss": 0.2337, + "step": 717 + }, + { + "epoch": 0.1507928173894781, + "grad_norm": 0.4990095794200897, + "learning_rate": 0.000199649866959696, + "loss": 0.3187, + "step": 718 + }, + { + "epoch": 0.15100283524099548, + "grad_norm": 0.6247003078460693, + "learning_rate": 0.00019964695697741703, + "loss": 0.2139, + "step": 719 + }, + { + "epoch": 0.15121285309251287, + "grad_norm": 0.6358337998390198, + "learning_rate": 0.00019964403497397084, + "loss": 0.244, + "step": 720 + }, + { + "epoch": 0.15142287094403023, + "grad_norm": 0.5211917161941528, + "learning_rate": 0.0001996411009497099, + "loss": 0.1784, + "step": 721 + }, + { + "epoch": 0.15163288879554762, + "grad_norm": 0.464606374502182, + "learning_rate": 0.00019963815490498817, + "loss": 0.2137, + "step": 722 + }, + { + "epoch": 0.151842906647065, + "grad_norm": 0.7099301815032959, + "learning_rate": 0.00019963519684016107, + "loss": 0.2927, + "step": 723 + }, + { + "epoch": 0.15205292449858238, + "grad_norm": 0.7805564999580383, + "learning_rate": 0.00019963222675558543, + "loss": 0.2374, + "step": 724 + }, + { + "epoch": 0.15226294235009977, + "grad_norm": 0.6172361373901367, + "learning_rate": 0.00019962924465161957, + "loss": 0.201, + "step": 725 + }, + { + "epoch": 0.15247296020161713, + "grad_norm": 0.6261605620384216, + "learning_rate": 0.0001996262505286232, + "loss": 0.1709, + "step": 726 + }, + { + "epoch": 0.15268297805313452, + "grad_norm": 0.6561511158943176, + "learning_rate": 0.00019962324438695762, + "loss": 0.2283, + "step": 727 + }, + { + "epoch": 0.15289299590465188, + "grad_norm": 0.5386349558830261, + "learning_rate": 0.0001996202262269854, + "loss": 0.231, + "step": 728 + }, + { + "epoch": 0.15310301375616928, + "grad_norm": 0.644136369228363, + "learning_rate": 0.00019961719604907066, + "loss": 0.1875, + "step": 729 + }, + { + "epoch": 0.15331303160768664, + "grad_norm": 0.6452980041503906, + "learning_rate": 0.00019961415385357897, + "loss": 0.2294, + "step": 730 + }, + { + "epoch": 0.15352304945920403, + "grad_norm": 0.5558809041976929, + "learning_rate": 0.0001996110996408773, + "loss": 0.1988, + "step": 731 + }, + { + "epoch": 0.15373306731072142, + "grad_norm": 0.6049979329109192, + "learning_rate": 0.00019960803341133413, + "loss": 0.2368, + "step": 732 + }, + { + "epoch": 0.15394308516223879, + "grad_norm": 0.6450143456459045, + "learning_rate": 0.00019960495516531935, + "loss": 0.2217, + "step": 733 + }, + { + "epoch": 0.15415310301375618, + "grad_norm": 0.6582781672477722, + "learning_rate": 0.00019960186490320436, + "loss": 0.1942, + "step": 734 + }, + { + "epoch": 0.15436312086527354, + "grad_norm": 0.5160269141197205, + "learning_rate": 0.0001995987626253619, + "loss": 0.1723, + "step": 735 + }, + { + "epoch": 0.15457313871679093, + "grad_norm": 0.6058139801025391, + "learning_rate": 0.00019959564833216625, + "loss": 0.2089, + "step": 736 + }, + { + "epoch": 0.1547831565683083, + "grad_norm": 0.540282666683197, + "learning_rate": 0.0001995925220239931, + "loss": 0.2089, + "step": 737 + }, + { + "epoch": 0.15499317441982569, + "grad_norm": 0.7635892033576965, + "learning_rate": 0.0001995893837012196, + "loss": 0.2825, + "step": 738 + }, + { + "epoch": 0.15520319227134308, + "grad_norm": 0.5233755111694336, + "learning_rate": 0.00019958623336422434, + "loss": 0.1514, + "step": 739 + }, + { + "epoch": 0.15541321012286044, + "grad_norm": 0.44758716225624084, + "learning_rate": 0.00019958307101338742, + "loss": 0.132, + "step": 740 + }, + { + "epoch": 0.15562322797437783, + "grad_norm": 0.7145951390266418, + "learning_rate": 0.00019957989664909026, + "loss": 0.2395, + "step": 741 + }, + { + "epoch": 0.1558332458258952, + "grad_norm": 0.6241814494132996, + "learning_rate": 0.0001995767102717159, + "loss": 0.2255, + "step": 742 + }, + { + "epoch": 0.15604326367741259, + "grad_norm": 0.502863883972168, + "learning_rate": 0.00019957351188164865, + "loss": 0.1941, + "step": 743 + }, + { + "epoch": 0.15625328152892995, + "grad_norm": 0.5572714805603027, + "learning_rate": 0.00019957030147927442, + "loss": 0.1664, + "step": 744 + }, + { + "epoch": 0.15646329938044734, + "grad_norm": 1.0500191450119019, + "learning_rate": 0.00019956707906498044, + "loss": 0.3229, + "step": 745 + }, + { + "epoch": 0.1566733172319647, + "grad_norm": 0.595522403717041, + "learning_rate": 0.0001995638446391555, + "loss": 0.1932, + "step": 746 + }, + { + "epoch": 0.1568833350834821, + "grad_norm": 0.38818204402923584, + "learning_rate": 0.00019956059820218982, + "loss": 0.1324, + "step": 747 + }, + { + "epoch": 0.1570933529349995, + "grad_norm": 0.6705027222633362, + "learning_rate": 0.000199557339754475, + "loss": 0.194, + "step": 748 + }, + { + "epoch": 0.15730337078651685, + "grad_norm": 0.4935189485549927, + "learning_rate": 0.0001995540692964041, + "loss": 0.2492, + "step": 749 + }, + { + "epoch": 0.15751338863803424, + "grad_norm": 0.3950806260108948, + "learning_rate": 0.00019955078682837174, + "loss": 0.1331, + "step": 750 + }, + { + "epoch": 0.1577234064895516, + "grad_norm": 0.6625058650970459, + "learning_rate": 0.00019954749235077384, + "loss": 0.297, + "step": 751 + }, + { + "epoch": 0.157933424341069, + "grad_norm": 0.5862818956375122, + "learning_rate": 0.00019954418586400787, + "loss": 0.2628, + "step": 752 + }, + { + "epoch": 0.15814344219258636, + "grad_norm": 0.6951611042022705, + "learning_rate": 0.0001995408673684727, + "loss": 0.2573, + "step": 753 + }, + { + "epoch": 0.15835346004410375, + "grad_norm": 0.8030470013618469, + "learning_rate": 0.0001995375368645687, + "loss": 0.2671, + "step": 754 + }, + { + "epoch": 0.15856347789562114, + "grad_norm": 0.4509555995464325, + "learning_rate": 0.00019953419435269764, + "loss": 0.1808, + "step": 755 + }, + { + "epoch": 0.1587734957471385, + "grad_norm": 0.7687417268753052, + "learning_rate": 0.0001995308398332627, + "loss": 0.2906, + "step": 756 + }, + { + "epoch": 0.1589835135986559, + "grad_norm": 0.7642715573310852, + "learning_rate": 0.00019952747330666867, + "loss": 0.3541, + "step": 757 + }, + { + "epoch": 0.15919353145017326, + "grad_norm": 0.6699778437614441, + "learning_rate": 0.00019952409477332156, + "loss": 0.2494, + "step": 758 + }, + { + "epoch": 0.15940354930169065, + "grad_norm": 0.7119278907775879, + "learning_rate": 0.00019952070423362903, + "loss": 0.1994, + "step": 759 + }, + { + "epoch": 0.159613567153208, + "grad_norm": 0.6130563616752625, + "learning_rate": 0.00019951730168800004, + "loss": 0.3433, + "step": 760 + }, + { + "epoch": 0.1598235850047254, + "grad_norm": 0.692933201789856, + "learning_rate": 0.00019951388713684514, + "loss": 0.1762, + "step": 761 + }, + { + "epoch": 0.16003360285624277, + "grad_norm": 0.5561717748641968, + "learning_rate": 0.00019951046058057622, + "loss": 0.2266, + "step": 762 + }, + { + "epoch": 0.16024362070776016, + "grad_norm": 0.8559679388999939, + "learning_rate": 0.00019950702201960665, + "loss": 0.3145, + "step": 763 + }, + { + "epoch": 0.16045363855927755, + "grad_norm": 0.7173314094543457, + "learning_rate": 0.00019950357145435122, + "loss": 0.2079, + "step": 764 + }, + { + "epoch": 0.16066365641079491, + "grad_norm": 0.4696892201900482, + "learning_rate": 0.00019950010888522625, + "loss": 0.2374, + "step": 765 + }, + { + "epoch": 0.1608736742623123, + "grad_norm": 0.5349077582359314, + "learning_rate": 0.00019949663431264943, + "loss": 0.2221, + "step": 766 + }, + { + "epoch": 0.16108369211382967, + "grad_norm": 0.49449819326400757, + "learning_rate": 0.0001994931477370399, + "loss": 0.1432, + "step": 767 + }, + { + "epoch": 0.16129370996534706, + "grad_norm": 0.652260422706604, + "learning_rate": 0.00019948964915881835, + "loss": 0.2122, + "step": 768 + }, + { + "epoch": 0.16150372781686442, + "grad_norm": 0.6549475789070129, + "learning_rate": 0.00019948613857840672, + "loss": 0.3484, + "step": 769 + }, + { + "epoch": 0.16171374566838181, + "grad_norm": 0.6772179007530212, + "learning_rate": 0.00019948261599622865, + "loss": 0.2784, + "step": 770 + }, + { + "epoch": 0.16192376351989918, + "grad_norm": 0.788960337638855, + "learning_rate": 0.00019947908141270898, + "loss": 0.1939, + "step": 771 + }, + { + "epoch": 0.16213378137141657, + "grad_norm": 0.6915500164031982, + "learning_rate": 0.00019947553482827418, + "loss": 0.1541, + "step": 772 + }, + { + "epoch": 0.16234379922293396, + "grad_norm": 0.604015052318573, + "learning_rate": 0.0001994719762433521, + "loss": 0.2148, + "step": 773 + }, + { + "epoch": 0.16255381707445132, + "grad_norm": 0.8275285959243774, + "learning_rate": 0.00019946840565837203, + "loss": 0.2808, + "step": 774 + }, + { + "epoch": 0.16276383492596871, + "grad_norm": 0.6737775802612305, + "learning_rate": 0.00019946482307376472, + "loss": 0.1813, + "step": 775 + }, + { + "epoch": 0.16297385277748608, + "grad_norm": 0.8311626315116882, + "learning_rate": 0.0001994612284899623, + "loss": 0.2819, + "step": 776 + }, + { + "epoch": 0.16318387062900347, + "grad_norm": 0.7368951439857483, + "learning_rate": 0.00019945762190739852, + "loss": 0.2619, + "step": 777 + }, + { + "epoch": 0.16339388848052083, + "grad_norm": 0.6095349788665771, + "learning_rate": 0.0001994540033265084, + "loss": 0.2449, + "step": 778 + }, + { + "epoch": 0.16360390633203822, + "grad_norm": 0.6738486886024475, + "learning_rate": 0.0001994503727477285, + "loss": 0.1493, + "step": 779 + }, + { + "epoch": 0.16381392418355561, + "grad_norm": 0.5636208653450012, + "learning_rate": 0.0001994467301714968, + "loss": 0.1949, + "step": 780 + }, + { + "epoch": 0.16402394203507298, + "grad_norm": 0.9404299259185791, + "learning_rate": 0.00019944307559825272, + "loss": 0.2503, + "step": 781 + }, + { + "epoch": 0.16423395988659037, + "grad_norm": 0.6188719868659973, + "learning_rate": 0.0001994394090284372, + "loss": 0.1658, + "step": 782 + }, + { + "epoch": 0.16444397773810773, + "grad_norm": 0.9498090147972107, + "learning_rate": 0.00019943573046249244, + "loss": 0.3425, + "step": 783 + }, + { + "epoch": 0.16465399558962512, + "grad_norm": 0.6508981585502625, + "learning_rate": 0.00019943203990086233, + "loss": 0.1384, + "step": 784 + }, + { + "epoch": 0.1648640134411425, + "grad_norm": 1.0658531188964844, + "learning_rate": 0.00019942833734399202, + "loss": 0.2609, + "step": 785 + }, + { + "epoch": 0.16507403129265988, + "grad_norm": 0.7281699776649475, + "learning_rate": 0.00019942462279232825, + "loss": 0.1985, + "step": 786 + }, + { + "epoch": 0.16528404914417724, + "grad_norm": 0.7734364867210388, + "learning_rate": 0.00019942089624631906, + "loss": 0.2617, + "step": 787 + }, + { + "epoch": 0.16549406699569463, + "grad_norm": 0.977069616317749, + "learning_rate": 0.00019941715770641408, + "loss": 0.2928, + "step": 788 + }, + { + "epoch": 0.16570408484721202, + "grad_norm": 0.7139049768447876, + "learning_rate": 0.00019941340717306424, + "loss": 0.3369, + "step": 789 + }, + { + "epoch": 0.1659141026987294, + "grad_norm": 0.5771147012710571, + "learning_rate": 0.00019940964464672205, + "loss": 0.2304, + "step": 790 + }, + { + "epoch": 0.16612412055024678, + "grad_norm": 0.5506160855293274, + "learning_rate": 0.00019940587012784138, + "loss": 0.2084, + "step": 791 + }, + { + "epoch": 0.16633413840176414, + "grad_norm": 0.48316794633865356, + "learning_rate": 0.0001994020836168776, + "loss": 0.1835, + "step": 792 + }, + { + "epoch": 0.16654415625328153, + "grad_norm": 0.5649861693382263, + "learning_rate": 0.00019939828511428753, + "loss": 0.2426, + "step": 793 + }, + { + "epoch": 0.1667541741047989, + "grad_norm": 0.5224729180335999, + "learning_rate": 0.00019939447462052936, + "loss": 0.1862, + "step": 794 + }, + { + "epoch": 0.1669641919563163, + "grad_norm": 0.5801841616630554, + "learning_rate": 0.00019939065213606282, + "loss": 0.2081, + "step": 795 + }, + { + "epoch": 0.16717420980783368, + "grad_norm": 0.4274038076400757, + "learning_rate": 0.00019938681766134902, + "loss": 0.14, + "step": 796 + }, + { + "epoch": 0.16738422765935104, + "grad_norm": 0.5294644236564636, + "learning_rate": 0.00019938297119685054, + "loss": 0.1851, + "step": 797 + }, + { + "epoch": 0.16759424551086843, + "grad_norm": 0.5110440850257874, + "learning_rate": 0.00019937911274303145, + "loss": 0.2346, + "step": 798 + }, + { + "epoch": 0.1678042633623858, + "grad_norm": 0.5785256028175354, + "learning_rate": 0.00019937524230035717, + "loss": 0.1554, + "step": 799 + }, + { + "epoch": 0.1680142812139032, + "grad_norm": 0.586320161819458, + "learning_rate": 0.00019937135986929465, + "loss": 0.2672, + "step": 800 + }, + { + "epoch": 0.16822429906542055, + "grad_norm": 0.502890408039093, + "learning_rate": 0.00019936746545031223, + "loss": 0.3023, + "step": 801 + }, + { + "epoch": 0.16843431691693794, + "grad_norm": 0.5421012043952942, + "learning_rate": 0.00019936355904387977, + "loss": 0.2331, + "step": 802 + }, + { + "epoch": 0.1686443347684553, + "grad_norm": 0.5681023001670837, + "learning_rate": 0.0001993596406504685, + "loss": 0.2064, + "step": 803 + }, + { + "epoch": 0.1688543526199727, + "grad_norm": 0.4179142713546753, + "learning_rate": 0.00019935571027055113, + "loss": 0.2302, + "step": 804 + }, + { + "epoch": 0.1690643704714901, + "grad_norm": 0.7016621232032776, + "learning_rate": 0.00019935176790460179, + "loss": 0.2442, + "step": 805 + }, + { + "epoch": 0.16927438832300745, + "grad_norm": 0.5401879549026489, + "learning_rate": 0.00019934781355309612, + "loss": 0.2798, + "step": 806 + }, + { + "epoch": 0.16948440617452484, + "grad_norm": 0.5687265396118164, + "learning_rate": 0.00019934384721651113, + "loss": 0.2097, + "step": 807 + }, + { + "epoch": 0.1696944240260422, + "grad_norm": 0.659520149230957, + "learning_rate": 0.00019933986889532533, + "loss": 0.1938, + "step": 808 + }, + { + "epoch": 0.1699044418775596, + "grad_norm": 0.8230718970298767, + "learning_rate": 0.00019933587859001866, + "loss": 0.4148, + "step": 809 + }, + { + "epoch": 0.17011445972907696, + "grad_norm": 0.7954551577568054, + "learning_rate": 0.00019933187630107244, + "loss": 0.4564, + "step": 810 + }, + { + "epoch": 0.17032447758059435, + "grad_norm": 0.6618001461029053, + "learning_rate": 0.0001993278620289696, + "loss": 0.2819, + "step": 811 + }, + { + "epoch": 0.17053449543211172, + "grad_norm": 0.9731025099754333, + "learning_rate": 0.00019932383577419432, + "loss": 0.3954, + "step": 812 + }, + { + "epoch": 0.1707445132836291, + "grad_norm": 0.7344256639480591, + "learning_rate": 0.00019931979753723232, + "loss": 0.2502, + "step": 813 + }, + { + "epoch": 0.1709545311351465, + "grad_norm": 0.6986575722694397, + "learning_rate": 0.00019931574731857086, + "loss": 0.2499, + "step": 814 + }, + { + "epoch": 0.17116454898666386, + "grad_norm": 0.5757253170013428, + "learning_rate": 0.00019931168511869846, + "loss": 0.2445, + "step": 815 + }, + { + "epoch": 0.17137456683818125, + "grad_norm": 0.5453664064407349, + "learning_rate": 0.0001993076109381052, + "loss": 0.2494, + "step": 816 + }, + { + "epoch": 0.17158458468969862, + "grad_norm": 0.7031118869781494, + "learning_rate": 0.00019930352477728257, + "loss": 0.2777, + "step": 817 + }, + { + "epoch": 0.171794602541216, + "grad_norm": 0.6201139092445374, + "learning_rate": 0.0001992994266367235, + "loss": 0.2145, + "step": 818 + }, + { + "epoch": 0.17200462039273337, + "grad_norm": 0.6421683430671692, + "learning_rate": 0.00019929531651692245, + "loss": 0.1951, + "step": 819 + }, + { + "epoch": 0.17221463824425076, + "grad_norm": 0.6390677094459534, + "learning_rate": 0.00019929119441837518, + "loss": 0.2011, + "step": 820 + }, + { + "epoch": 0.17242465609576815, + "grad_norm": 0.5171882510185242, + "learning_rate": 0.00019928706034157901, + "loss": 0.1459, + "step": 821 + }, + { + "epoch": 0.17263467394728552, + "grad_norm": 0.6737155914306641, + "learning_rate": 0.00019928291428703262, + "loss": 0.1507, + "step": 822 + }, + { + "epoch": 0.1728446917988029, + "grad_norm": 0.526128351688385, + "learning_rate": 0.00019927875625523625, + "loss": 0.1565, + "step": 823 + }, + { + "epoch": 0.17305470965032027, + "grad_norm": 0.7430817484855652, + "learning_rate": 0.00019927458624669145, + "loss": 0.2575, + "step": 824 + }, + { + "epoch": 0.17326472750183766, + "grad_norm": 0.4702281355857849, + "learning_rate": 0.0001992704042619013, + "loss": 0.1796, + "step": 825 + }, + { + "epoch": 0.17347474535335503, + "grad_norm": 0.5295049548149109, + "learning_rate": 0.00019926621030137034, + "loss": 0.1974, + "step": 826 + }, + { + "epoch": 0.17368476320487242, + "grad_norm": 0.667036771774292, + "learning_rate": 0.00019926200436560447, + "loss": 0.2125, + "step": 827 + }, + { + "epoch": 0.17389478105638978, + "grad_norm": 0.7351561188697815, + "learning_rate": 0.0001992577864551111, + "loss": 0.2271, + "step": 828 + }, + { + "epoch": 0.17410479890790717, + "grad_norm": 0.8084509372711182, + "learning_rate": 0.0001992535565703991, + "loss": 0.2301, + "step": 829 + }, + { + "epoch": 0.17431481675942456, + "grad_norm": 0.7022576928138733, + "learning_rate": 0.0001992493147119787, + "loss": 0.2662, + "step": 830 + }, + { + "epoch": 0.17452483461094193, + "grad_norm": 0.7098193168640137, + "learning_rate": 0.00019924506088036165, + "loss": 0.1979, + "step": 831 + }, + { + "epoch": 0.17473485246245932, + "grad_norm": 0.590630292892456, + "learning_rate": 0.00019924079507606114, + "loss": 0.1872, + "step": 832 + }, + { + "epoch": 0.17494487031397668, + "grad_norm": 0.7556937336921692, + "learning_rate": 0.00019923651729959177, + "loss": 0.1651, + "step": 833 + }, + { + "epoch": 0.17515488816549407, + "grad_norm": 0.6680110096931458, + "learning_rate": 0.00019923222755146956, + "loss": 0.1837, + "step": 834 + }, + { + "epoch": 0.17536490601701143, + "grad_norm": 0.7310810685157776, + "learning_rate": 0.0001992279258322121, + "loss": 0.3201, + "step": 835 + }, + { + "epoch": 0.17557492386852883, + "grad_norm": 0.5796787142753601, + "learning_rate": 0.0001992236121423383, + "loss": 0.178, + "step": 836 + }, + { + "epoch": 0.17578494172004622, + "grad_norm": 0.45521265268325806, + "learning_rate": 0.00019921928648236853, + "loss": 0.1723, + "step": 837 + }, + { + "epoch": 0.17599495957156358, + "grad_norm": 0.43274396657943726, + "learning_rate": 0.00019921494885282467, + "loss": 0.1597, + "step": 838 + }, + { + "epoch": 0.17620497742308097, + "grad_norm": 0.40754616260528564, + "learning_rate": 0.00019921059925422996, + "loss": 0.1299, + "step": 839 + }, + { + "epoch": 0.17641499527459834, + "grad_norm": 0.6628978252410889, + "learning_rate": 0.00019920623768710912, + "loss": 0.1931, + "step": 840 + }, + { + "epoch": 0.17662501312611573, + "grad_norm": 0.644637405872345, + "learning_rate": 0.0001992018641519884, + "loss": 0.199, + "step": 841 + }, + { + "epoch": 0.1768350309776331, + "grad_norm": 0.5001009106636047, + "learning_rate": 0.0001991974786493953, + "loss": 0.2109, + "step": 842 + }, + { + "epoch": 0.17704504882915048, + "grad_norm": 0.49435755610466003, + "learning_rate": 0.00019919308117985894, + "loss": 0.1832, + "step": 843 + }, + { + "epoch": 0.17725506668066784, + "grad_norm": 0.7176212668418884, + "learning_rate": 0.0001991886717439098, + "loss": 0.2491, + "step": 844 + }, + { + "epoch": 0.17746508453218524, + "grad_norm": 0.5122328996658325, + "learning_rate": 0.00019918425034207984, + "loss": 0.2618, + "step": 845 + }, + { + "epoch": 0.17767510238370263, + "grad_norm": 0.6069608926773071, + "learning_rate": 0.00019917981697490245, + "loss": 0.2119, + "step": 846 + }, + { + "epoch": 0.17788512023522, + "grad_norm": 0.8389537334442139, + "learning_rate": 0.00019917537164291244, + "loss": 0.2619, + "step": 847 + }, + { + "epoch": 0.17809513808673738, + "grad_norm": 0.5856572389602661, + "learning_rate": 0.00019917091434664612, + "loss": 0.1928, + "step": 848 + }, + { + "epoch": 0.17830515593825474, + "grad_norm": 0.5682632327079773, + "learning_rate": 0.00019916644508664115, + "loss": 0.2963, + "step": 849 + }, + { + "epoch": 0.17851517378977214, + "grad_norm": 0.45547807216644287, + "learning_rate": 0.00019916196386343674, + "loss": 0.1277, + "step": 850 + }, + { + "epoch": 0.1787251916412895, + "grad_norm": 0.648499071598053, + "learning_rate": 0.00019915747067757349, + "loss": 0.3407, + "step": 851 + }, + { + "epoch": 0.1789352094928069, + "grad_norm": 0.48874902725219727, + "learning_rate": 0.0001991529655295934, + "loss": 0.185, + "step": 852 + }, + { + "epoch": 0.17914522734432425, + "grad_norm": 0.7059923410415649, + "learning_rate": 0.00019914844842004002, + "loss": 0.2352, + "step": 853 + }, + { + "epoch": 0.17935524519584164, + "grad_norm": 0.6532195210456848, + "learning_rate": 0.00019914391934945823, + "loss": 0.292, + "step": 854 + }, + { + "epoch": 0.17956526304735904, + "grad_norm": 0.6922776103019714, + "learning_rate": 0.0001991393783183945, + "loss": 0.4635, + "step": 855 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 0.6560776829719543, + "learning_rate": 0.00019913482532739652, + "loss": 0.2684, + "step": 856 + }, + { + "epoch": 0.1799852987503938, + "grad_norm": 0.5644369125366211, + "learning_rate": 0.00019913026037701362, + "loss": 0.2018, + "step": 857 + }, + { + "epoch": 0.18019531660191115, + "grad_norm": 0.6108200550079346, + "learning_rate": 0.00019912568346779652, + "loss": 0.1746, + "step": 858 + }, + { + "epoch": 0.18040533445342855, + "grad_norm": 0.6762723326683044, + "learning_rate": 0.00019912109460029734, + "loss": 0.4662, + "step": 859 + }, + { + "epoch": 0.1806153523049459, + "grad_norm": 0.5877822041511536, + "learning_rate": 0.00019911649377506966, + "loss": 0.2546, + "step": 860 + }, + { + "epoch": 0.1808253701564633, + "grad_norm": 0.5038641095161438, + "learning_rate": 0.00019911188099266855, + "loss": 0.3073, + "step": 861 + }, + { + "epoch": 0.1810353880079807, + "grad_norm": 0.6587141156196594, + "learning_rate": 0.00019910725625365045, + "loss": 0.2991, + "step": 862 + }, + { + "epoch": 0.18124540585949805, + "grad_norm": 0.9041693210601807, + "learning_rate": 0.0001991026195585733, + "loss": 0.3111, + "step": 863 + }, + { + "epoch": 0.18145542371101545, + "grad_norm": 0.6296244263648987, + "learning_rate": 0.00019909797090799644, + "loss": 0.2272, + "step": 864 + }, + { + "epoch": 0.1816654415625328, + "grad_norm": 0.6931461691856384, + "learning_rate": 0.00019909331030248072, + "loss": 0.3503, + "step": 865 + }, + { + "epoch": 0.1818754594140502, + "grad_norm": 0.7656722664833069, + "learning_rate": 0.00019908863774258827, + "loss": 0.3773, + "step": 866 + }, + { + "epoch": 0.18208547726556756, + "grad_norm": 0.6011465787887573, + "learning_rate": 0.00019908395322888294, + "loss": 0.2101, + "step": 867 + }, + { + "epoch": 0.18229549511708495, + "grad_norm": 0.6926429867744446, + "learning_rate": 0.0001990792567619297, + "loss": 0.2027, + "step": 868 + }, + { + "epoch": 0.18250551296860232, + "grad_norm": 0.5799981355667114, + "learning_rate": 0.00019907454834229525, + "loss": 0.2129, + "step": 869 + }, + { + "epoch": 0.1827155308201197, + "grad_norm": 0.5605289936065674, + "learning_rate": 0.0001990698279705475, + "loss": 0.2104, + "step": 870 + }, + { + "epoch": 0.1829255486716371, + "grad_norm": 0.9048646092414856, + "learning_rate": 0.00019906509564725596, + "loss": 0.4131, + "step": 871 + }, + { + "epoch": 0.18313556652315446, + "grad_norm": 0.6802535057067871, + "learning_rate": 0.0001990603513729915, + "loss": 0.2715, + "step": 872 + }, + { + "epoch": 0.18334558437467185, + "grad_norm": 0.4949076771736145, + "learning_rate": 0.0001990555951483265, + "loss": 0.1725, + "step": 873 + }, + { + "epoch": 0.18355560222618922, + "grad_norm": 0.6589632034301758, + "learning_rate": 0.0001990508269738347, + "loss": 0.1424, + "step": 874 + }, + { + "epoch": 0.1837656200777066, + "grad_norm": 0.5366025567054749, + "learning_rate": 0.00019904604685009133, + "loss": 0.1578, + "step": 875 + }, + { + "epoch": 0.18397563792922397, + "grad_norm": 0.584173858165741, + "learning_rate": 0.00019904125477767303, + "loss": 0.2381, + "step": 876 + }, + { + "epoch": 0.18418565578074136, + "grad_norm": 0.6884530186653137, + "learning_rate": 0.00019903645075715798, + "loss": 0.2043, + "step": 877 + }, + { + "epoch": 0.18439567363225876, + "grad_norm": 0.6070178747177124, + "learning_rate": 0.00019903163478912563, + "loss": 0.1792, + "step": 878 + }, + { + "epoch": 0.18460569148377612, + "grad_norm": 0.6375721096992493, + "learning_rate": 0.00019902680687415705, + "loss": 0.218, + "step": 879 + }, + { + "epoch": 0.1848157093352935, + "grad_norm": 0.564017653465271, + "learning_rate": 0.0001990219670128346, + "loss": 0.1885, + "step": 880 + }, + { + "epoch": 0.18502572718681087, + "grad_norm": 0.6779912710189819, + "learning_rate": 0.0001990171152057422, + "loss": 0.1949, + "step": 881 + }, + { + "epoch": 0.18523574503832826, + "grad_norm": 0.6086128950119019, + "learning_rate": 0.0001990122514534651, + "loss": 0.1818, + "step": 882 + }, + { + "epoch": 0.18544576288984563, + "grad_norm": 0.4768702983856201, + "learning_rate": 0.0001990073757565901, + "loss": 0.1459, + "step": 883 + }, + { + "epoch": 0.18565578074136302, + "grad_norm": 0.5171164870262146, + "learning_rate": 0.0001990024881157054, + "loss": 0.1624, + "step": 884 + }, + { + "epoch": 0.18586579859288038, + "grad_norm": 0.6542419195175171, + "learning_rate": 0.00019899758853140064, + "loss": 0.2035, + "step": 885 + }, + { + "epoch": 0.18607581644439777, + "grad_norm": 0.7479321956634521, + "learning_rate": 0.0001989926770042668, + "loss": 0.3654, + "step": 886 + }, + { + "epoch": 0.18628583429591516, + "grad_norm": 0.7446826696395874, + "learning_rate": 0.0001989877535348965, + "loss": 0.236, + "step": 887 + }, + { + "epoch": 0.18649585214743253, + "grad_norm": 0.5898016691207886, + "learning_rate": 0.00019898281812388366, + "loss": 0.2013, + "step": 888 + }, + { + "epoch": 0.18670586999894992, + "grad_norm": 0.6942265629768372, + "learning_rate": 0.00019897787077182368, + "loss": 0.1912, + "step": 889 + }, + { + "epoch": 0.18691588785046728, + "grad_norm": 0.7095215320587158, + "learning_rate": 0.0001989729114793134, + "loss": 0.2031, + "step": 890 + }, + { + "epoch": 0.18712590570198467, + "grad_norm": 0.49590814113616943, + "learning_rate": 0.00019896794024695108, + "loss": 0.1848, + "step": 891 + }, + { + "epoch": 0.18733592355350204, + "grad_norm": 0.3615363836288452, + "learning_rate": 0.00019896295707533642, + "loss": 0.1357, + "step": 892 + }, + { + "epoch": 0.18754594140501943, + "grad_norm": 0.540952205657959, + "learning_rate": 0.00019895796196507063, + "loss": 0.1622, + "step": 893 + }, + { + "epoch": 0.1877559592565368, + "grad_norm": 0.6152564883232117, + "learning_rate": 0.00019895295491675628, + "loss": 0.2229, + "step": 894 + }, + { + "epoch": 0.18796597710805418, + "grad_norm": 0.6287555694580078, + "learning_rate": 0.0001989479359309974, + "loss": 0.1855, + "step": 895 + }, + { + "epoch": 0.18817599495957157, + "grad_norm": 0.6615211963653564, + "learning_rate": 0.00019894290500839946, + "loss": 0.2001, + "step": 896 + }, + { + "epoch": 0.18838601281108894, + "grad_norm": 0.6587905883789062, + "learning_rate": 0.00019893786214956945, + "loss": 0.2368, + "step": 897 + }, + { + "epoch": 0.18859603066260633, + "grad_norm": 0.3502175807952881, + "learning_rate": 0.00019893280735511565, + "loss": 0.1203, + "step": 898 + }, + { + "epoch": 0.1888060485141237, + "grad_norm": 0.6989165544509888, + "learning_rate": 0.00019892774062564786, + "loss": 0.2108, + "step": 899 + }, + { + "epoch": 0.18901606636564108, + "grad_norm": 0.5993213057518005, + "learning_rate": 0.00019892266196177736, + "loss": 0.2667, + "step": 900 + }, + { + "epoch": 0.18922608421715845, + "grad_norm": 0.6625016331672668, + "learning_rate": 0.0001989175713641168, + "loss": 0.3081, + "step": 901 + }, + { + "epoch": 0.18943610206867584, + "grad_norm": 0.6831103563308716, + "learning_rate": 0.0001989124688332803, + "loss": 0.2826, + "step": 902 + }, + { + "epoch": 0.18964611992019323, + "grad_norm": 0.6341603994369507, + "learning_rate": 0.00019890735436988347, + "loss": 0.2738, + "step": 903 + }, + { + "epoch": 0.1898561377717106, + "grad_norm": 0.6546643376350403, + "learning_rate": 0.0001989022279745432, + "loss": 0.3065, + "step": 904 + }, + { + "epoch": 0.19006615562322798, + "grad_norm": 0.7356497645378113, + "learning_rate": 0.000198897089647878, + "loss": 0.2955, + "step": 905 + }, + { + "epoch": 0.19027617347474535, + "grad_norm": 0.71455317735672, + "learning_rate": 0.00019889193939050777, + "loss": 0.2069, + "step": 906 + }, + { + "epoch": 0.19048619132626274, + "grad_norm": 0.7142229676246643, + "learning_rate": 0.00019888677720305374, + "loss": 0.3386, + "step": 907 + }, + { + "epoch": 0.1906962091777801, + "grad_norm": 0.6420140862464905, + "learning_rate": 0.00019888160308613874, + "loss": 0.2952, + "step": 908 + }, + { + "epoch": 0.1909062270292975, + "grad_norm": 0.757895290851593, + "learning_rate": 0.00019887641704038688, + "loss": 0.299, + "step": 909 + }, + { + "epoch": 0.19111624488081486, + "grad_norm": 0.5329816937446594, + "learning_rate": 0.00019887121906642387, + "loss": 0.2005, + "step": 910 + }, + { + "epoch": 0.19132626273233225, + "grad_norm": 0.5069072842597961, + "learning_rate": 0.00019886600916487677, + "loss": 0.1971, + "step": 911 + }, + { + "epoch": 0.19153628058384964, + "grad_norm": 0.7712031602859497, + "learning_rate": 0.00019886078733637408, + "loss": 0.2952, + "step": 912 + }, + { + "epoch": 0.191746298435367, + "grad_norm": 0.6340819001197815, + "learning_rate": 0.00019885555358154574, + "loss": 0.2403, + "step": 913 + }, + { + "epoch": 0.1919563162868844, + "grad_norm": 0.707127034664154, + "learning_rate": 0.0001988503079010231, + "loss": 0.262, + "step": 914 + }, + { + "epoch": 0.19216633413840176, + "grad_norm": 0.5502609014511108, + "learning_rate": 0.00019884505029543908, + "loss": 0.1767, + "step": 915 + }, + { + "epoch": 0.19237635198991915, + "grad_norm": 0.6637031435966492, + "learning_rate": 0.00019883978076542787, + "loss": 0.317, + "step": 916 + }, + { + "epoch": 0.1925863698414365, + "grad_norm": 0.5921664237976074, + "learning_rate": 0.00019883449931162517, + "loss": 0.2848, + "step": 917 + }, + { + "epoch": 0.1927963876929539, + "grad_norm": 0.8460182547569275, + "learning_rate": 0.0001988292059346682, + "loss": 0.2741, + "step": 918 + }, + { + "epoch": 0.1930064055444713, + "grad_norm": 0.7577118277549744, + "learning_rate": 0.00019882390063519543, + "loss": 0.2589, + "step": 919 + }, + { + "epoch": 0.19321642339598866, + "grad_norm": 0.5957863330841064, + "learning_rate": 0.00019881858341384696, + "loss": 0.1834, + "step": 920 + }, + { + "epoch": 0.19342644124750605, + "grad_norm": 0.6584639549255371, + "learning_rate": 0.00019881325427126422, + "loss": 0.232, + "step": 921 + }, + { + "epoch": 0.1936364590990234, + "grad_norm": 0.6941714882850647, + "learning_rate": 0.0001988079132080901, + "loss": 0.2514, + "step": 922 + }, + { + "epoch": 0.1938464769505408, + "grad_norm": 0.829231321811676, + "learning_rate": 0.00019880256022496897, + "loss": 0.2023, + "step": 923 + }, + { + "epoch": 0.19405649480205817, + "grad_norm": 0.6720934510231018, + "learning_rate": 0.00019879719532254654, + "loss": 0.2535, + "step": 924 + }, + { + "epoch": 0.19426651265357556, + "grad_norm": 0.7159935832023621, + "learning_rate": 0.00019879181850147005, + "loss": 0.3129, + "step": 925 + }, + { + "epoch": 0.19447653050509292, + "grad_norm": 0.6411039233207703, + "learning_rate": 0.00019878642976238817, + "loss": 0.1729, + "step": 926 + }, + { + "epoch": 0.1946865483566103, + "grad_norm": 0.7253606915473938, + "learning_rate": 0.00019878102910595095, + "loss": 0.2599, + "step": 927 + }, + { + "epoch": 0.1948965662081277, + "grad_norm": 0.6732550859451294, + "learning_rate": 0.0001987756165328099, + "loss": 0.1881, + "step": 928 + }, + { + "epoch": 0.19510658405964507, + "grad_norm": 0.6675817966461182, + "learning_rate": 0.00019877019204361804, + "loss": 0.2417, + "step": 929 + }, + { + "epoch": 0.19531660191116246, + "grad_norm": 0.5525332093238831, + "learning_rate": 0.0001987647556390297, + "loss": 0.2445, + "step": 930 + }, + { + "epoch": 0.19552661976267982, + "grad_norm": 0.7800937294960022, + "learning_rate": 0.00019875930731970076, + "loss": 0.2401, + "step": 931 + }, + { + "epoch": 0.1957366376141972, + "grad_norm": 0.5669112205505371, + "learning_rate": 0.00019875384708628848, + "loss": 0.1925, + "step": 932 + }, + { + "epoch": 0.19594665546571458, + "grad_norm": 0.6367275714874268, + "learning_rate": 0.00019874837493945156, + "loss": 0.205, + "step": 933 + }, + { + "epoch": 0.19615667331723197, + "grad_norm": 0.6173298954963684, + "learning_rate": 0.00019874289087985013, + "loss": 0.2426, + "step": 934 + }, + { + "epoch": 0.19636669116874933, + "grad_norm": 0.7045214176177979, + "learning_rate": 0.00019873739490814583, + "loss": 0.1647, + "step": 935 + }, + { + "epoch": 0.19657670902026672, + "grad_norm": 0.5824179649353027, + "learning_rate": 0.00019873188702500163, + "loss": 0.1527, + "step": 936 + }, + { + "epoch": 0.1967867268717841, + "grad_norm": 0.585749626159668, + "learning_rate": 0.000198726367231082, + "loss": 0.2119, + "step": 937 + }, + { + "epoch": 0.19699674472330148, + "grad_norm": 0.679140031337738, + "learning_rate": 0.00019872083552705284, + "loss": 0.2037, + "step": 938 + }, + { + "epoch": 0.19720676257481887, + "grad_norm": 0.3865984380245209, + "learning_rate": 0.0001987152919135815, + "loss": 0.1508, + "step": 939 + }, + { + "epoch": 0.19741678042633623, + "grad_norm": 0.5994648933410645, + "learning_rate": 0.0001987097363913367, + "loss": 0.1536, + "step": 940 + }, + { + "epoch": 0.19762679827785362, + "grad_norm": 0.8374373912811279, + "learning_rate": 0.0001987041689609887, + "loss": 0.3113, + "step": 941 + }, + { + "epoch": 0.19783681612937098, + "grad_norm": 0.4448517858982086, + "learning_rate": 0.0001986985896232091, + "loss": 0.1523, + "step": 942 + }, + { + "epoch": 0.19804683398088838, + "grad_norm": 0.5031003952026367, + "learning_rate": 0.00019869299837867098, + "loss": 0.2351, + "step": 943 + }, + { + "epoch": 0.19825685183240577, + "grad_norm": 0.8319448232650757, + "learning_rate": 0.0001986873952280489, + "loss": 0.2768, + "step": 944 + }, + { + "epoch": 0.19846686968392313, + "grad_norm": 0.4768364429473877, + "learning_rate": 0.00019868178017201874, + "loss": 0.2041, + "step": 945 + }, + { + "epoch": 0.19867688753544052, + "grad_norm": 0.36797624826431274, + "learning_rate": 0.00019867615321125795, + "loss": 0.1703, + "step": 946 + }, + { + "epoch": 0.19888690538695789, + "grad_norm": 0.629489541053772, + "learning_rate": 0.0001986705143464453, + "loss": 0.1989, + "step": 947 + }, + { + "epoch": 0.19909692323847528, + "grad_norm": 0.757764458656311, + "learning_rate": 0.00019866486357826107, + "loss": 0.1972, + "step": 948 + }, + { + "epoch": 0.19930694108999264, + "grad_norm": 0.884556770324707, + "learning_rate": 0.00019865920090738698, + "loss": 0.2592, + "step": 949 + }, + { + "epoch": 0.19951695894151003, + "grad_norm": 0.5489534139633179, + "learning_rate": 0.00019865352633450614, + "loss": 0.2075, + "step": 950 + }, + { + "epoch": 0.1997269767930274, + "grad_norm": 0.6485860347747803, + "learning_rate": 0.00019864783986030314, + "loss": 0.3648, + "step": 951 + }, + { + "epoch": 0.19993699464454479, + "grad_norm": 0.8612170219421387, + "learning_rate": 0.00019864214148546393, + "loss": 0.2175, + "step": 952 + }, + { + "epoch": 0.20014701249606218, + "grad_norm": 0.6336376070976257, + "learning_rate": 0.00019863643121067597, + "loss": 0.2935, + "step": 953 + }, + { + "epoch": 0.20035703034757954, + "grad_norm": 0.7330135703086853, + "learning_rate": 0.00019863070903662817, + "loss": 0.4322, + "step": 954 + }, + { + "epoch": 0.20056704819909693, + "grad_norm": 0.6464625000953674, + "learning_rate": 0.0001986249749640108, + "loss": 0.242, + "step": 955 + }, + { + "epoch": 0.2007770660506143, + "grad_norm": 0.6884174942970276, + "learning_rate": 0.00019861922899351561, + "loss": 0.3043, + "step": 956 + }, + { + "epoch": 0.20098708390213169, + "grad_norm": 0.4948609471321106, + "learning_rate": 0.0001986134711258358, + "loss": 0.1735, + "step": 957 + }, + { + "epoch": 0.20119710175364905, + "grad_norm": 0.9207262396812439, + "learning_rate": 0.00019860770136166596, + "loss": 0.2473, + "step": 958 + }, + { + "epoch": 0.20140711960516644, + "grad_norm": 0.6444927453994751, + "learning_rate": 0.00019860191970170216, + "loss": 0.2995, + "step": 959 + }, + { + "epoch": 0.20161713745668383, + "grad_norm": 0.8041002750396729, + "learning_rate": 0.00019859612614664185, + "loss": 0.3079, + "step": 960 + }, + { + "epoch": 0.2018271553082012, + "grad_norm": 0.520293653011322, + "learning_rate": 0.000198590320697184, + "loss": 0.2038, + "step": 961 + }, + { + "epoch": 0.20203717315971859, + "grad_norm": 0.6968462467193604, + "learning_rate": 0.00019858450335402897, + "loss": 0.2791, + "step": 962 + }, + { + "epoch": 0.20224719101123595, + "grad_norm": 0.5260444283485413, + "learning_rate": 0.00019857867411787847, + "loss": 0.2164, + "step": 963 + }, + { + "epoch": 0.20245720886275334, + "grad_norm": 0.7742235660552979, + "learning_rate": 0.0001985728329894358, + "loss": 0.3005, + "step": 964 + }, + { + "epoch": 0.2026672267142707, + "grad_norm": 0.4388875961303711, + "learning_rate": 0.0001985669799694056, + "loss": 0.2046, + "step": 965 + }, + { + "epoch": 0.2028772445657881, + "grad_norm": 0.8159006237983704, + "learning_rate": 0.00019856111505849394, + "loss": 0.2219, + "step": 966 + }, + { + "epoch": 0.20308726241730546, + "grad_norm": 0.5616422295570374, + "learning_rate": 0.0001985552382574084, + "loss": 0.3792, + "step": 967 + }, + { + "epoch": 0.20329728026882285, + "grad_norm": 0.5863935351371765, + "learning_rate": 0.00019854934956685792, + "loss": 0.2077, + "step": 968 + }, + { + "epoch": 0.20350729812034024, + "grad_norm": 0.5828328728675842, + "learning_rate": 0.00019854344898755287, + "loss": 0.273, + "step": 969 + }, + { + "epoch": 0.2037173159718576, + "grad_norm": 0.5963171124458313, + "learning_rate": 0.00019853753652020507, + "loss": 0.2407, + "step": 970 + }, + { + "epoch": 0.203927333823375, + "grad_norm": 0.5114577412605286, + "learning_rate": 0.00019853161216552788, + "loss": 0.1663, + "step": 971 + }, + { + "epoch": 0.20413735167489236, + "grad_norm": 0.5106688737869263, + "learning_rate": 0.0001985256759242359, + "loss": 0.1823, + "step": 972 + }, + { + "epoch": 0.20434736952640975, + "grad_norm": 0.5732094645500183, + "learning_rate": 0.00019851972779704534, + "loss": 0.2206, + "step": 973 + }, + { + "epoch": 0.2045573873779271, + "grad_norm": 0.5627723932266235, + "learning_rate": 0.00019851376778467366, + "loss": 0.1715, + "step": 974 + }, + { + "epoch": 0.2047674052294445, + "grad_norm": 0.7939655184745789, + "learning_rate": 0.00019850779588783998, + "loss": 0.1669, + "step": 975 + }, + { + "epoch": 0.20497742308096187, + "grad_norm": 0.5675683617591858, + "learning_rate": 0.00019850181210726467, + "loss": 0.1696, + "step": 976 + }, + { + "epoch": 0.20518744093247926, + "grad_norm": 0.9706809520721436, + "learning_rate": 0.00019849581644366965, + "loss": 0.376, + "step": 977 + }, + { + "epoch": 0.20539745878399665, + "grad_norm": 0.6787039041519165, + "learning_rate": 0.00019848980889777815, + "loss": 0.1528, + "step": 978 + }, + { + "epoch": 0.205607476635514, + "grad_norm": 0.8098447918891907, + "learning_rate": 0.00019848378947031492, + "loss": 0.1659, + "step": 979 + }, + { + "epoch": 0.2058174944870314, + "grad_norm": 0.46255457401275635, + "learning_rate": 0.0001984777581620062, + "loss": 0.1445, + "step": 980 + }, + { + "epoch": 0.20602751233854877, + "grad_norm": 0.7909157872200012, + "learning_rate": 0.00019847171497357953, + "loss": 0.2294, + "step": 981 + }, + { + "epoch": 0.20623753019006616, + "grad_norm": 0.6739736795425415, + "learning_rate": 0.00019846565990576392, + "loss": 0.2624, + "step": 982 + }, + { + "epoch": 0.20644754804158352, + "grad_norm": 0.7638704180717468, + "learning_rate": 0.00019845959295928994, + "loss": 0.2863, + "step": 983 + }, + { + "epoch": 0.20665756589310091, + "grad_norm": 0.48239898681640625, + "learning_rate": 0.00019845351413488939, + "loss": 0.1564, + "step": 984 + }, + { + "epoch": 0.2068675837446183, + "grad_norm": 0.6511039137840271, + "learning_rate": 0.00019844742343329568, + "loss": 0.1856, + "step": 985 + }, + { + "epoch": 0.20707760159613567, + "grad_norm": 0.48949161171913147, + "learning_rate": 0.0001984413208552435, + "loss": 0.1862, + "step": 986 + }, + { + "epoch": 0.20728761944765306, + "grad_norm": 0.6529719829559326, + "learning_rate": 0.0001984352064014691, + "loss": 0.2008, + "step": 987 + }, + { + "epoch": 0.20749763729917042, + "grad_norm": 0.5295738577842712, + "learning_rate": 0.00019842908007271012, + "loss": 0.2141, + "step": 988 + }, + { + "epoch": 0.20770765515068781, + "grad_norm": 0.5440765023231506, + "learning_rate": 0.00019842294186970562, + "loss": 0.264, + "step": 989 + }, + { + "epoch": 0.20791767300220518, + "grad_norm": 0.5315092206001282, + "learning_rate": 0.00019841679179319606, + "loss": 0.2116, + "step": 990 + }, + { + "epoch": 0.20812769085372257, + "grad_norm": 0.4537929892539978, + "learning_rate": 0.0001984106298439234, + "loss": 0.1269, + "step": 991 + }, + { + "epoch": 0.20833770870523993, + "grad_norm": 0.5806244015693665, + "learning_rate": 0.000198404456022631, + "loss": 0.1808, + "step": 992 + }, + { + "epoch": 0.20854772655675732, + "grad_norm": 0.5772647261619568, + "learning_rate": 0.00019839827033006372, + "loss": 0.1637, + "step": 993 + }, + { + "epoch": 0.20875774440827471, + "grad_norm": 0.4130006432533264, + "learning_rate": 0.00019839207276696764, + "loss": 0.1398, + "step": 994 + }, + { + "epoch": 0.20896776225979208, + "grad_norm": 0.47043028473854065, + "learning_rate": 0.00019838586333409056, + "loss": 0.1209, + "step": 995 + }, + { + "epoch": 0.20917778011130947, + "grad_norm": 0.713445782661438, + "learning_rate": 0.00019837964203218148, + "loss": 0.2175, + "step": 996 + }, + { + "epoch": 0.20938779796282683, + "grad_norm": 0.7947505116462708, + "learning_rate": 0.00019837340886199096, + "loss": 0.3172, + "step": 997 + }, + { + "epoch": 0.20959781581434422, + "grad_norm": 0.7544185519218445, + "learning_rate": 0.00019836716382427096, + "loss": 0.2506, + "step": 998 + }, + { + "epoch": 0.2098078336658616, + "grad_norm": 0.8411846160888672, + "learning_rate": 0.00019836090691977484, + "loss": 0.2619, + "step": 999 + }, + { + "epoch": 0.21001785151737898, + "grad_norm": 0.7358798384666443, + "learning_rate": 0.00019835463814925745, + "loss": 0.264, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 9522, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.969459342303437e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}