| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0011848341232228, |
| "eval_steps": 50, |
| "global_step": 845, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001184834123222749, |
| "grad_norm": 4.842912681173325, |
| "learning_rate": 5e-09, |
| "loss": 0.6931, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.001184834123222749, |
| "eval_loss": 0.6931471228599548, |
| "eval_runtime": 62.005, |
| "eval_samples_per_second": 13.273, |
| "eval_steps_per_second": 0.839, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.002369668246445498, |
| "grad_norm": 13.013466540861657, |
| "learning_rate": 1e-08, |
| "loss": 0.6931, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0035545023696682463, |
| "grad_norm": 25.207549721573656, |
| "learning_rate": 1.5e-08, |
| "loss": 0.6931, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.004739336492890996, |
| "grad_norm": 11.273510432292214, |
| "learning_rate": 2e-08, |
| "loss": 0.6931, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.005924170616113744, |
| "grad_norm": 24.11230272682227, |
| "learning_rate": 2.5e-08, |
| "loss": 0.6931, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0071090047393364926, |
| "grad_norm": 5.18401838103531, |
| "learning_rate": 3e-08, |
| "loss": 0.6931, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.008293838862559242, |
| "grad_norm": 7.3509565781013135, |
| "learning_rate": 3.5e-08, |
| "loss": 0.6931, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.009478672985781991, |
| "grad_norm": 6.56880629718354, |
| "learning_rate": 4e-08, |
| "loss": 0.6931, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.01066350710900474, |
| "grad_norm": 5.592557790447292, |
| "learning_rate": 4.5e-08, |
| "loss": 0.6931, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.011848341232227487, |
| "grad_norm": 20.527050875976098, |
| "learning_rate": 5e-08, |
| "loss": 0.6932, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.013033175355450236, |
| "grad_norm": 6.269601088107528, |
| "learning_rate": 5.4999999999999996e-08, |
| "loss": 0.6931, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.014218009478672985, |
| "grad_norm": 9.294134534941529, |
| "learning_rate": 6e-08, |
| "loss": 0.6931, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.015402843601895734, |
| "grad_norm": 5.855434213215903, |
| "learning_rate": 6.5e-08, |
| "loss": 0.6931, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.016587677725118485, |
| "grad_norm": 10.474986444115153, |
| "learning_rate": 7e-08, |
| "loss": 0.6931, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.017772511848341232, |
| "grad_norm": 6.6840985514065325, |
| "learning_rate": 7.5e-08, |
| "loss": 0.6931, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.018957345971563982, |
| "grad_norm": 7.4437101801933645, |
| "learning_rate": 8e-08, |
| "loss": 0.6931, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.02014218009478673, |
| "grad_norm": 6.13165834073612, |
| "learning_rate": 8.500000000000001e-08, |
| "loss": 0.6931, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.02132701421800948, |
| "grad_norm": 8.753680910080762, |
| "learning_rate": 9e-08, |
| "loss": 0.6931, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.022511848341232227, |
| "grad_norm": 4.494857956282118, |
| "learning_rate": 9.499999999999999e-08, |
| "loss": 0.6931, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.023696682464454975, |
| "grad_norm": 5.518409034456923, |
| "learning_rate": 1e-07, |
| "loss": 0.6931, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.024881516587677725, |
| "grad_norm": 22.99472715304125, |
| "learning_rate": 1.0499999999999999e-07, |
| "loss": 0.6932, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.026066350710900472, |
| "grad_norm": 5.44008866558376, |
| "learning_rate": 1.0999999999999999e-07, |
| "loss": 0.6931, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.027251184834123223, |
| "grad_norm": 8.315427696253538, |
| "learning_rate": 1.15e-07, |
| "loss": 0.693, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.02843601895734597, |
| "grad_norm": 9.819572604936798, |
| "learning_rate": 1.2e-07, |
| "loss": 0.693, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.02962085308056872, |
| "grad_norm": 19.108539667598766, |
| "learning_rate": 1.25e-07, |
| "loss": 0.6929, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.030805687203791468, |
| "grad_norm": 14.585034902914206, |
| "learning_rate": 1.3e-07, |
| "loss": 0.6932, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.031990521327014215, |
| "grad_norm": 9.101944616686147, |
| "learning_rate": 1.35e-07, |
| "loss": 0.693, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.03317535545023697, |
| "grad_norm": 5.776790369186141, |
| "learning_rate": 1.4e-07, |
| "loss": 0.693, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.034360189573459717, |
| "grad_norm": 12.420816953833807, |
| "learning_rate": 1.45e-07, |
| "loss": 0.6931, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.035545023696682464, |
| "grad_norm": 12.016877135801524, |
| "learning_rate": 1.5e-07, |
| "loss": 0.6929, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03672985781990521, |
| "grad_norm": 17.865737609497657, |
| "learning_rate": 1.55e-07, |
| "loss": 0.6931, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.037914691943127965, |
| "grad_norm": 10.946159830311144, |
| "learning_rate": 1.6e-07, |
| "loss": 0.6929, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.03909952606635071, |
| "grad_norm": 7.26222881952521, |
| "learning_rate": 1.65e-07, |
| "loss": 0.693, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.04028436018957346, |
| "grad_norm": 16.017675753924713, |
| "learning_rate": 1.7000000000000001e-07, |
| "loss": 0.6927, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.041469194312796206, |
| "grad_norm": 14.865936997931808, |
| "learning_rate": 1.75e-07, |
| "loss": 0.6931, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04265402843601896, |
| "grad_norm": 6.989152070363955, |
| "learning_rate": 1.8e-07, |
| "loss": 0.6929, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.04383886255924171, |
| "grad_norm": 5.830142644690912, |
| "learning_rate": 1.85e-07, |
| "loss": 0.693, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.045023696682464455, |
| "grad_norm": 18.298834268327568, |
| "learning_rate": 1.8999999999999998e-07, |
| "loss": 0.6931, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0462085308056872, |
| "grad_norm": 8.595226932799667, |
| "learning_rate": 1.9499999999999999e-07, |
| "loss": 0.6929, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.04739336492890995, |
| "grad_norm": 5.6255428173780055, |
| "learning_rate": 2e-07, |
| "loss": 0.6929, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0485781990521327, |
| "grad_norm": 10.273539930083835, |
| "learning_rate": 2.0499999999999997e-07, |
| "loss": 0.6927, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.04976303317535545, |
| "grad_norm": 5.959521496263993, |
| "learning_rate": 2.0999999999999997e-07, |
| "loss": 0.6928, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.0509478672985782, |
| "grad_norm": 5.398338620565399, |
| "learning_rate": 2.1499999999999998e-07, |
| "loss": 0.6929, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.052132701421800945, |
| "grad_norm": 18.51701133821435, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.6928, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.0533175355450237, |
| "grad_norm": 10.120166425396867, |
| "learning_rate": 2.25e-07, |
| "loss": 0.6928, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.054502369668246446, |
| "grad_norm": 9.406423565733586, |
| "learning_rate": 2.3e-07, |
| "loss": 0.6927, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.05568720379146919, |
| "grad_norm": 13.864654741893007, |
| "learning_rate": 2.3499999999999997e-07, |
| "loss": 0.6927, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.05687203791469194, |
| "grad_norm": 4.845925029634522, |
| "learning_rate": 2.4e-07, |
| "loss": 0.6928, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.058056872037914695, |
| "grad_norm": 5.37379618929059, |
| "learning_rate": 2.45e-07, |
| "loss": 0.6928, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.05924170616113744, |
| "grad_norm": 6.6369233721322605, |
| "learning_rate": 2.5e-07, |
| "loss": 0.6927, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.05924170616113744, |
| "eval_loss": 0.6926634311676025, |
| "eval_runtime": 58.7485, |
| "eval_samples_per_second": 14.009, |
| "eval_steps_per_second": 0.885, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06042654028436019, |
| "grad_norm": 10.966673821105905, |
| "learning_rate": 2.55e-07, |
| "loss": 0.6929, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.061611374407582936, |
| "grad_norm": 11.582337862061097, |
| "learning_rate": 2.6e-07, |
| "loss": 0.6926, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.06279620853080568, |
| "grad_norm": 8.941972889362651, |
| "learning_rate": 2.65e-07, |
| "loss": 0.6927, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.06398104265402843, |
| "grad_norm": 14.997343470976562, |
| "learning_rate": 2.7e-07, |
| "loss": 0.6927, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.06516587677725119, |
| "grad_norm": 5.677663010655966, |
| "learning_rate": 2.75e-07, |
| "loss": 0.6926, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06635071090047394, |
| "grad_norm": 5.020981860702543, |
| "learning_rate": 2.8e-07, |
| "loss": 0.6925, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.06753554502369669, |
| "grad_norm": 7.474033040046244, |
| "learning_rate": 2.8499999999999997e-07, |
| "loss": 0.6925, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.06872037914691943, |
| "grad_norm": 17.180875525483216, |
| "learning_rate": 2.9e-07, |
| "loss": 0.6929, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.06990521327014218, |
| "grad_norm": 13.25617168356445, |
| "learning_rate": 2.95e-07, |
| "loss": 0.6925, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.07109004739336493, |
| "grad_norm": 30.116258413787573, |
| "learning_rate": 3e-07, |
| "loss": 0.693, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.07227488151658767, |
| "grad_norm": 10.64426253518926, |
| "learning_rate": 3.05e-07, |
| "loss": 0.6922, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.07345971563981042, |
| "grad_norm": 11.453245118504901, |
| "learning_rate": 3.1e-07, |
| "loss": 0.6925, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.07464454976303317, |
| "grad_norm": 5.907026386237989, |
| "learning_rate": 3.15e-07, |
| "loss": 0.6924, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.07582938388625593, |
| "grad_norm": 19.406661340061074, |
| "learning_rate": 3.2e-07, |
| "loss": 0.6925, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.07701421800947868, |
| "grad_norm": 18.18198520017815, |
| "learning_rate": 3.25e-07, |
| "loss": 0.6927, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07819905213270142, |
| "grad_norm": 4.7357931743672035, |
| "learning_rate": 3.3e-07, |
| "loss": 0.6924, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.07938388625592417, |
| "grad_norm": 10.532802042903194, |
| "learning_rate": 3.35e-07, |
| "loss": 0.6924, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.08056872037914692, |
| "grad_norm": 11.915177348864768, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.6924, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.08175355450236967, |
| "grad_norm": 18.00153138952007, |
| "learning_rate": 3.45e-07, |
| "loss": 0.6924, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.08293838862559241, |
| "grad_norm": 10.234716621499297, |
| "learning_rate": 3.5e-07, |
| "loss": 0.6922, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.08412322274881516, |
| "grad_norm": 11.147351790721068, |
| "learning_rate": 3.55e-07, |
| "loss": 0.6923, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.08530805687203792, |
| "grad_norm": 5.9427941542555365, |
| "learning_rate": 3.6e-07, |
| "loss": 0.692, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.08649289099526067, |
| "grad_norm": 9.650743281341448, |
| "learning_rate": 3.65e-07, |
| "loss": 0.6921, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.08767772511848342, |
| "grad_norm": 14.437098875051285, |
| "learning_rate": 3.7e-07, |
| "loss": 0.6923, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.08886255924170616, |
| "grad_norm": 5.378048791391647, |
| "learning_rate": 3.75e-07, |
| "loss": 0.6921, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.09004739336492891, |
| "grad_norm": 23.76267009040223, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.6921, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.09123222748815166, |
| "grad_norm": 26.84411773978452, |
| "learning_rate": 3.8499999999999997e-07, |
| "loss": 0.6926, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.0924170616113744, |
| "grad_norm": 6.313251132925155, |
| "learning_rate": 3.8999999999999997e-07, |
| "loss": 0.6918, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.09360189573459715, |
| "grad_norm": 5.002486541981642, |
| "learning_rate": 3.95e-07, |
| "loss": 0.6922, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.0947867298578199, |
| "grad_norm": 5.125567908053878, |
| "learning_rate": 4e-07, |
| "loss": 0.6921, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.09597156398104266, |
| "grad_norm": 5.76391233867192, |
| "learning_rate": 4.05e-07, |
| "loss": 0.6926, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.0971563981042654, |
| "grad_norm": 9.172761657475997, |
| "learning_rate": 4.0999999999999994e-07, |
| "loss": 0.6923, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.09834123222748815, |
| "grad_norm": 15.521052808202064, |
| "learning_rate": 4.1499999999999994e-07, |
| "loss": 0.6922, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0995260663507109, |
| "grad_norm": 6.256832828679843, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.6917, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.10071090047393365, |
| "grad_norm": 6.119025109284402, |
| "learning_rate": 4.2499999999999995e-07, |
| "loss": 0.6923, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.1018957345971564, |
| "grad_norm": 15.199284387604703, |
| "learning_rate": 4.2999999999999996e-07, |
| "loss": 0.6919, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.10308056872037914, |
| "grad_norm": 15.339972724148373, |
| "learning_rate": 4.3499999999999996e-07, |
| "loss": 0.6921, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.10426540284360189, |
| "grad_norm": 11.496730664780461, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.692, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.10545023696682465, |
| "grad_norm": 4.773306731969947, |
| "learning_rate": 4.45e-07, |
| "loss": 0.6919, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.1066350710900474, |
| "grad_norm": 8.260323351824246, |
| "learning_rate": 4.5e-07, |
| "loss": 0.6913, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.10781990521327015, |
| "grad_norm": 5.659723672215949, |
| "learning_rate": 4.55e-07, |
| "loss": 0.6915, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.10900473933649289, |
| "grad_norm": 5.107079958535661, |
| "learning_rate": 4.6e-07, |
| "loss": 0.6915, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.11018957345971564, |
| "grad_norm": 22.056688920378733, |
| "learning_rate": 4.65e-07, |
| "loss": 0.6917, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.11137440758293839, |
| "grad_norm": 4.86312181195146, |
| "learning_rate": 4.6999999999999995e-07, |
| "loss": 0.6916, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.11255924170616113, |
| "grad_norm": 4.682425298642347, |
| "learning_rate": 4.7499999999999995e-07, |
| "loss": 0.6921, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.11374407582938388, |
| "grad_norm": 7.5909421969744315, |
| "learning_rate": 4.8e-07, |
| "loss": 0.6917, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.11492890995260663, |
| "grad_norm": 9.52244613318277, |
| "learning_rate": 4.85e-07, |
| "loss": 0.6918, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.11611374407582939, |
| "grad_norm": 20.463831503545507, |
| "learning_rate": 4.9e-07, |
| "loss": 0.6922, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.11729857819905214, |
| "grad_norm": 5.85073635301077, |
| "learning_rate": 4.95e-07, |
| "loss": 0.6919, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.11848341232227488, |
| "grad_norm": 9.713864024698502, |
| "learning_rate": 5e-07, |
| "loss": 0.691, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.11848341232227488, |
| "eval_loss": 0.6912016868591309, |
| "eval_runtime": 55.0989, |
| "eval_samples_per_second": 14.937, |
| "eval_steps_per_second": 0.944, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.11966824644549763, |
| "grad_norm": 8.735757832881882, |
| "learning_rate": 5.049999999999999e-07, |
| "loss": 0.6913, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.12085308056872038, |
| "grad_norm": 8.536276252611597, |
| "learning_rate": 5.1e-07, |
| "loss": 0.6916, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.12203791469194313, |
| "grad_norm": 12.307955522799803, |
| "learning_rate": 5.149999999999999e-07, |
| "loss": 0.6919, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.12322274881516587, |
| "grad_norm": 11.5436391524331, |
| "learning_rate": 5.2e-07, |
| "loss": 0.6908, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.12440758293838862, |
| "grad_norm": 10.060636220616002, |
| "learning_rate": 5.25e-07, |
| "loss": 0.6907, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.12559241706161137, |
| "grad_norm": 5.041585370529337, |
| "learning_rate": 5.3e-07, |
| "loss": 0.691, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.12677725118483413, |
| "grad_norm": 11.64846080453421, |
| "learning_rate": 5.35e-07, |
| "loss": 0.6914, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.12796208530805686, |
| "grad_norm": 19.76068719894559, |
| "learning_rate": 5.4e-07, |
| "loss": 0.6919, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.12914691943127962, |
| "grad_norm": 14.136074416348508, |
| "learning_rate": 5.45e-07, |
| "loss": 0.6909, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.13033175355450238, |
| "grad_norm": 10.28440428895041, |
| "learning_rate": 5.5e-07, |
| "loss": 0.6911, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.13151658767772512, |
| "grad_norm": 7.391244110222167, |
| "learning_rate": 5.55e-07, |
| "loss": 0.6909, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.13270142180094788, |
| "grad_norm": 4.823429256379869, |
| "learning_rate": 5.6e-07, |
| "loss": 0.6911, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.1338862559241706, |
| "grad_norm": 23.383696969681193, |
| "learning_rate": 5.649999999999999e-07, |
| "loss": 0.6906, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.13507109004739337, |
| "grad_norm": 19.49459565572995, |
| "learning_rate": 5.699999999999999e-07, |
| "loss": 0.6896, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.1362559241706161, |
| "grad_norm": 9.86021667122269, |
| "learning_rate": 5.749999999999999e-07, |
| "loss": 0.6908, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.13744075829383887, |
| "grad_norm": 9.176213186401592, |
| "learning_rate": 5.8e-07, |
| "loss": 0.69, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.1386255924170616, |
| "grad_norm": 10.965194330031808, |
| "learning_rate": 5.849999999999999e-07, |
| "loss": 0.6907, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.13981042654028436, |
| "grad_norm": 5.324309937382266, |
| "learning_rate": 5.9e-07, |
| "loss": 0.6901, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.14099526066350712, |
| "grad_norm": 6.295743777248256, |
| "learning_rate": 5.949999999999999e-07, |
| "loss": 0.6904, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.14218009478672985, |
| "grad_norm": 15.67891975538989, |
| "learning_rate": 6e-07, |
| "loss": 0.692, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.14336492890995262, |
| "grad_norm": 9.127359508013708, |
| "learning_rate": 6.049999999999999e-07, |
| "loss": 0.6906, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.14454976303317535, |
| "grad_norm": 7.5044121458383595, |
| "learning_rate": 6.1e-07, |
| "loss": 0.6891, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.1457345971563981, |
| "grad_norm": 12.683289947181704, |
| "learning_rate": 6.149999999999999e-07, |
| "loss": 0.6906, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.14691943127962084, |
| "grad_norm": 13.110766398513006, |
| "learning_rate": 6.2e-07, |
| "loss": 0.6904, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.1481042654028436, |
| "grad_norm": 11.995325494799939, |
| "learning_rate": 6.249999999999999e-07, |
| "loss": 0.6903, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.14928909952606634, |
| "grad_norm": 6.422118572134765, |
| "learning_rate": 6.3e-07, |
| "loss": 0.6904, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.1504739336492891, |
| "grad_norm": 10.691214133018695, |
| "learning_rate": 6.35e-07, |
| "loss": 0.6905, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.15165876777251186, |
| "grad_norm": 5.404413946055909, |
| "learning_rate": 6.4e-07, |
| "loss": 0.691, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.1528436018957346, |
| "grad_norm": 4.447046938476608, |
| "learning_rate": 6.45e-07, |
| "loss": 0.6911, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.15402843601895735, |
| "grad_norm": 5.0248072341394865, |
| "learning_rate": 6.5e-07, |
| "loss": 0.6914, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1552132701421801, |
| "grad_norm": 5.056679953391262, |
| "learning_rate": 6.55e-07, |
| "loss": 0.6905, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.15639810426540285, |
| "grad_norm": 16.537683924677737, |
| "learning_rate": 6.6e-07, |
| "loss": 0.6895, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.15758293838862558, |
| "grad_norm": 21.091493660710064, |
| "learning_rate": 6.65e-07, |
| "loss": 0.6897, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.15876777251184834, |
| "grad_norm": 8.432912706940217, |
| "learning_rate": 6.7e-07, |
| "loss": 0.689, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.15995260663507108, |
| "grad_norm": 5.270172573046601, |
| "learning_rate": 6.75e-07, |
| "loss": 0.6894, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.16113744075829384, |
| "grad_norm": 16.85436413901504, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.6906, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.1623222748815166, |
| "grad_norm": 6.916851112907155, |
| "learning_rate": 6.85e-07, |
| "loss": 0.6889, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.16350710900473933, |
| "grad_norm": 7.1221016172033655, |
| "learning_rate": 6.9e-07, |
| "loss": 0.6897, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.1646919431279621, |
| "grad_norm": 10.163791082368434, |
| "learning_rate": 6.949999999999999e-07, |
| "loss": 0.6896, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.16587677725118483, |
| "grad_norm": 14.381198582029823, |
| "learning_rate": 7e-07, |
| "loss": 0.6892, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1670616113744076, |
| "grad_norm": 27.46453698599659, |
| "learning_rate": 7.049999999999999e-07, |
| "loss": 0.69, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.16824644549763032, |
| "grad_norm": 6.604176868933808, |
| "learning_rate": 7.1e-07, |
| "loss": 0.6875, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.16943127962085308, |
| "grad_norm": 9.080958816193693, |
| "learning_rate": 7.149999999999999e-07, |
| "loss": 0.6882, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.17061611374407584, |
| "grad_norm": 8.606545021512053, |
| "learning_rate": 7.2e-07, |
| "loss": 0.688, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.17180094786729858, |
| "grad_norm": 8.818101912012192, |
| "learning_rate": 7.249999999999999e-07, |
| "loss": 0.6888, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.17298578199052134, |
| "grad_norm": 7.023229633046101, |
| "learning_rate": 7.3e-07, |
| "loss": 0.6883, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.17417061611374407, |
| "grad_norm": 13.202458096332483, |
| "learning_rate": 7.35e-07, |
| "loss": 0.6911, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.17535545023696683, |
| "grad_norm": 15.969335522541915, |
| "learning_rate": 7.4e-07, |
| "loss": 0.6898, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.17654028436018956, |
| "grad_norm": 21.252078515205625, |
| "learning_rate": 7.45e-07, |
| "loss": 0.6911, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.17772511848341233, |
| "grad_norm": 10.052620681652504, |
| "learning_rate": 7.5e-07, |
| "loss": 0.6878, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.17772511848341233, |
| "eval_loss": 0.6874876618385315, |
| "eval_runtime": 54.043, |
| "eval_samples_per_second": 15.229, |
| "eval_steps_per_second": 0.962, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.17890995260663506, |
| "grad_norm": 7.158864715478862, |
| "learning_rate": 7.55e-07, |
| "loss": 0.6871, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.18009478672985782, |
| "grad_norm": 9.1358043468594, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.6875, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.18127962085308058, |
| "grad_norm": 5.457525606930803, |
| "learning_rate": 7.65e-07, |
| "loss": 0.6882, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.18246445497630331, |
| "grad_norm": 8.695681816363182, |
| "learning_rate": 7.699999999999999e-07, |
| "loss": 0.6875, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.18364928909952608, |
| "grad_norm": 6.992833901092717, |
| "learning_rate": 7.75e-07, |
| "loss": 0.69, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.1848341232227488, |
| "grad_norm": 11.327106053516577, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.6861, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.18601895734597157, |
| "grad_norm": 12.206795478765468, |
| "learning_rate": 7.85e-07, |
| "loss": 0.6872, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.1872037914691943, |
| "grad_norm": 11.670452718361528, |
| "learning_rate": 7.9e-07, |
| "loss": 0.6873, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.18838862559241706, |
| "grad_norm": 10.394255318275, |
| "learning_rate": 7.95e-07, |
| "loss": 0.6883, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.1895734597156398, |
| "grad_norm": 29.830224779254273, |
| "learning_rate": 8e-07, |
| "loss": 0.6873, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.19075829383886256, |
| "grad_norm": 8.19405638351032, |
| "learning_rate": 8.05e-07, |
| "loss": 0.6849, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.19194312796208532, |
| "grad_norm": 6.4691692398191165, |
| "learning_rate": 8.1e-07, |
| "loss": 0.6854, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.19312796208530805, |
| "grad_norm": 7.639628649694936, |
| "learning_rate": 8.149999999999999e-07, |
| "loss": 0.6887, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.1943127962085308, |
| "grad_norm": 17.413499986090972, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.6846, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.19549763033175355, |
| "grad_norm": 5.309092918246534, |
| "learning_rate": 8.249999999999999e-07, |
| "loss": 0.6891, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.1966824644549763, |
| "grad_norm": 7.446590034492426, |
| "learning_rate": 8.299999999999999e-07, |
| "loss": 0.6855, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.19786729857819904, |
| "grad_norm": 9.96015478678009, |
| "learning_rate": 8.349999999999999e-07, |
| "loss": 0.6843, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.1990521327014218, |
| "grad_norm": 12.633638991242824, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.6844, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.20023696682464456, |
| "grad_norm": 7.533772786798129, |
| "learning_rate": 8.45e-07, |
| "loss": 0.685, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.2014218009478673, |
| "grad_norm": 6.594946642463255, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.6851, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.20260663507109006, |
| "grad_norm": 10.234001728156082, |
| "learning_rate": 8.55e-07, |
| "loss": 0.6845, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.2037914691943128, |
| "grad_norm": 14.063644054463136, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.683, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.20497630331753555, |
| "grad_norm": 7.492696336567383, |
| "learning_rate": 8.65e-07, |
| "loss": 0.6849, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.20616113744075829, |
| "grad_norm": 9.371933323337045, |
| "learning_rate": 8.699999999999999e-07, |
| "loss": 0.6859, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.20734597156398105, |
| "grad_norm": 14.68266196345255, |
| "learning_rate": 8.75e-07, |
| "loss": 0.685, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.20853080568720378, |
| "grad_norm": 8.595805104024867, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.682, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.20971563981042654, |
| "grad_norm": 8.176461365837259, |
| "learning_rate": 8.85e-07, |
| "loss": 0.6834, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.2109004739336493, |
| "grad_norm": 24.75404438632194, |
| "learning_rate": 8.9e-07, |
| "loss": 0.6823, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.21208530805687204, |
| "grad_norm": 14.936592212079153, |
| "learning_rate": 8.95e-07, |
| "loss": 0.6804, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.2132701421800948, |
| "grad_norm": 8.958711285200756, |
| "learning_rate": 9e-07, |
| "loss": 0.6812, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.21445497630331753, |
| "grad_norm": 19.370653934060627, |
| "learning_rate": 9.05e-07, |
| "loss": 0.6846, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.2156398104265403, |
| "grad_norm": 11.292808912187253, |
| "learning_rate": 9.1e-07, |
| "loss": 0.6793, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.21682464454976302, |
| "grad_norm": 20.1962448672265, |
| "learning_rate": 9.15e-07, |
| "loss": 0.6837, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.21800947867298578, |
| "grad_norm": 7.866346608383013, |
| "learning_rate": 9.2e-07, |
| "loss": 0.6791, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.21919431279620852, |
| "grad_norm": 8.265265530197812, |
| "learning_rate": 9.25e-07, |
| "loss": 0.6805, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.22037914691943128, |
| "grad_norm": 8.27541943064013, |
| "learning_rate": 9.3e-07, |
| "loss": 0.6818, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.22156398104265404, |
| "grad_norm": 9.175840706959432, |
| "learning_rate": 9.35e-07, |
| "loss": 0.6782, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.22274881516587677, |
| "grad_norm": 8.750697832461059, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.6815, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.22393364928909953, |
| "grad_norm": 19.807879890918883, |
| "learning_rate": 9.45e-07, |
| "loss": 0.6794, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.22511848341232227, |
| "grad_norm": 9.84886237169711, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.6761, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.22630331753554503, |
| "grad_norm": 8.847377027851778, |
| "learning_rate": 9.55e-07, |
| "loss": 0.674, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.22748815165876776, |
| "grad_norm": 10.078019239583814, |
| "learning_rate": 9.6e-07, |
| "loss": 0.6785, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.22867298578199052, |
| "grad_norm": 10.221043516188288, |
| "learning_rate": 9.649999999999999e-07, |
| "loss": 0.6731, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.22985781990521326, |
| "grad_norm": 8.647282299307719, |
| "learning_rate": 9.7e-07, |
| "loss": 0.6738, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.23104265402843602, |
| "grad_norm": 19.36746818547541, |
| "learning_rate": 9.75e-07, |
| "loss": 0.6771, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.23222748815165878, |
| "grad_norm": 11.068473794452629, |
| "learning_rate": 9.8e-07, |
| "loss": 0.6738, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.2334123222748815, |
| "grad_norm": 19.420363183493418, |
| "learning_rate": 9.849999999999999e-07, |
| "loss": 0.6715, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.23459715639810427, |
| "grad_norm": 24.42503330707244, |
| "learning_rate": 9.9e-07, |
| "loss": 0.6701, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.235781990521327, |
| "grad_norm": 16.175882257620756, |
| "learning_rate": 9.95e-07, |
| "loss": 0.6737, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.23696682464454977, |
| "grad_norm": 20.652990783070912, |
| "learning_rate": 1e-06, |
| "loss": 0.6723, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.23696682464454977, |
| "eval_loss": 0.6700453162193298, |
| "eval_runtime": 58.1638, |
| "eval_samples_per_second": 14.15, |
| "eval_steps_per_second": 0.894, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2381516587677725, |
| "grad_norm": 15.879396676885444, |
| "learning_rate": 9.999988856189192e-07, |
| "loss": 0.6766, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.23933649289099526, |
| "grad_norm": 12.333108739189962, |
| "learning_rate": 9.999955424806443e-07, |
| "loss": 0.6731, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.24052132701421802, |
| "grad_norm": 7.242095379151133, |
| "learning_rate": 9.99989970600077e-07, |
| "loss": 0.6751, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.24170616113744076, |
| "grad_norm": 8.415435271063192, |
| "learning_rate": 9.999821700020548e-07, |
| "loss": 0.6734, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.24289099526066352, |
| "grad_norm": 9.671774375618897, |
| "learning_rate": 9.999721407213486e-07, |
| "loss": 0.6702, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.24407582938388625, |
| "grad_norm": 10.14187864614814, |
| "learning_rate": 9.999598828026642e-07, |
| "loss": 0.6681, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.245260663507109, |
| "grad_norm": 15.681885174284957, |
| "learning_rate": 9.999453963006417e-07, |
| "loss": 0.6659, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.24644549763033174, |
| "grad_norm": 10.7172396576495, |
| "learning_rate": 9.99928681279855e-07, |
| "loss": 0.6622, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.2476303317535545, |
| "grad_norm": 11.41841503900657, |
| "learning_rate": 9.999097378148114e-07, |
| "loss": 0.6723, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.24881516587677724, |
| "grad_norm": 17.238661996134446, |
| "learning_rate": 9.998885659899523e-07, |
| "loss": 0.6675, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 19.98765993245952, |
| "learning_rate": 9.998651658996514e-07, |
| "loss": 0.6611, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.25118483412322273, |
| "grad_norm": 10.75284639990253, |
| "learning_rate": 9.998395376482152e-07, |
| "loss": 0.6593, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.2523696682464455, |
| "grad_norm": 18.467338204557816, |
| "learning_rate": 9.998116813498823e-07, |
| "loss": 0.6517, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.25355450236966826, |
| "grad_norm": 9.031428370949925, |
| "learning_rate": 9.99781597128823e-07, |
| "loss": 0.669, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.254739336492891, |
| "grad_norm": 20.37162505190203, |
| "learning_rate": 9.997492851191378e-07, |
| "loss": 0.645, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.2559241706161137, |
| "grad_norm": 14.505828957564063, |
| "learning_rate": 9.997147454648588e-07, |
| "loss": 0.6427, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.2571090047393365, |
| "grad_norm": 29.49184766373109, |
| "learning_rate": 9.996779783199475e-07, |
| "loss": 0.6579, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.25829383886255924, |
| "grad_norm": 13.33521595334444, |
| "learning_rate": 9.996389838482942e-07, |
| "loss": 0.6493, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.259478672985782, |
| "grad_norm": 13.69883838689083, |
| "learning_rate": 9.995977622237173e-07, |
| "loss": 0.6475, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.26066350710900477, |
| "grad_norm": 22.69929227147676, |
| "learning_rate": 9.995543136299635e-07, |
| "loss": 0.6572, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2618483412322275, |
| "grad_norm": 15.389124321700107, |
| "learning_rate": 9.995086382607063e-07, |
| "loss": 0.646, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.26303317535545023, |
| "grad_norm": 12.968325719512725, |
| "learning_rate": 9.994607363195442e-07, |
| "loss": 0.6469, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.26421800947867297, |
| "grad_norm": 16.79050316553183, |
| "learning_rate": 9.994106080200015e-07, |
| "loss": 0.6383, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.26540284360189575, |
| "grad_norm": 17.7107351566139, |
| "learning_rate": 9.993582535855263e-07, |
| "loss": 0.6317, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.2665876777251185, |
| "grad_norm": 13.001311901357276, |
| "learning_rate": 9.9930367324949e-07, |
| "loss": 0.6389, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.2677725118483412, |
| "grad_norm": 12.560158599901298, |
| "learning_rate": 9.992468672551852e-07, |
| "loss": 0.6465, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.26895734597156395, |
| "grad_norm": 17.194433268804165, |
| "learning_rate": 9.991878358558268e-07, |
| "loss": 0.628, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.27014218009478674, |
| "grad_norm": 14.21824731123265, |
| "learning_rate": 9.991265793145479e-07, |
| "loss": 0.6286, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.2713270142180095, |
| "grad_norm": 11.717975256273306, |
| "learning_rate": 9.990630979044014e-07, |
| "loss": 0.6417, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.2725118483412322, |
| "grad_norm": 22.73985386534345, |
| "learning_rate": 9.989973919083573e-07, |
| "loss": 0.6294, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.273696682464455, |
| "grad_norm": 17.287735794213862, |
| "learning_rate": 9.989294616193017e-07, |
| "loss": 0.6158, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.27488151658767773, |
| "grad_norm": 23.899199553299088, |
| "learning_rate": 9.988593073400354e-07, |
| "loss": 0.6181, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.27606635071090047, |
| "grad_norm": 19.875731229760778, |
| "learning_rate": 9.987869293832727e-07, |
| "loss": 0.609, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.2772511848341232, |
| "grad_norm": 16.046553980851552, |
| "learning_rate": 9.987123280716402e-07, |
| "loss": 0.6294, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.278436018957346, |
| "grad_norm": 12.483201348763467, |
| "learning_rate": 9.98635503737675e-07, |
| "loss": 0.6294, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.2796208530805687, |
| "grad_norm": 17.59142339660279, |
| "learning_rate": 9.985564567238236e-07, |
| "loss": 0.6094, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.28080568720379145, |
| "grad_norm": 16.71315419070765, |
| "learning_rate": 9.9847518738244e-07, |
| "loss": 0.6036, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.28199052132701424, |
| "grad_norm": 24.971593610223145, |
| "learning_rate": 9.98391696075784e-07, |
| "loss": 0.6105, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.283175355450237, |
| "grad_norm": 14.983923687572714, |
| "learning_rate": 9.983059831760205e-07, |
| "loss": 0.611, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.2843601895734597, |
| "grad_norm": 20.998049091911874, |
| "learning_rate": 9.982180490652164e-07, |
| "loss": 0.5744, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.28554502369668244, |
| "grad_norm": 26.833381930400048, |
| "learning_rate": 9.981278941353406e-07, |
| "loss": 0.6077, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.28672985781990523, |
| "grad_norm": 18.431818850347863, |
| "learning_rate": 9.980355187882604e-07, |
| "loss": 0.6052, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.28791469194312796, |
| "grad_norm": 13.590781471879904, |
| "learning_rate": 9.979409234357416e-07, |
| "loss": 0.6044, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.2890995260663507, |
| "grad_norm": 16.197540843506868, |
| "learning_rate": 9.97844108499445e-07, |
| "loss": 0.5904, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.2902843601895735, |
| "grad_norm": 14.326350645052589, |
| "learning_rate": 9.977450744109258e-07, |
| "loss": 0.6138, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.2914691943127962, |
| "grad_norm": 16.000841952080826, |
| "learning_rate": 9.976438216116304e-07, |
| "loss": 0.5841, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.29265402843601895, |
| "grad_norm": 18.889358277976672, |
| "learning_rate": 9.975403505528961e-07, |
| "loss": 0.575, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.2938388625592417, |
| "grad_norm": 22.108322730446503, |
| "learning_rate": 9.974346616959475e-07, |
| "loss": 0.5583, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.2950236966824645, |
| "grad_norm": 18.209727273646372, |
| "learning_rate": 9.973267555118952e-07, |
| "loss": 0.5823, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.2962085308056872, |
| "grad_norm": 13.709315722628926, |
| "learning_rate": 9.972166324817336e-07, |
| "loss": 0.5913, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.2962085308056872, |
| "eval_loss": 0.5701844692230225, |
| "eval_runtime": 55.0664, |
| "eval_samples_per_second": 14.946, |
| "eval_steps_per_second": 0.944, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.29739336492890994, |
| "grad_norm": 18.07345575992755, |
| "learning_rate": 9.97104293096339e-07, |
| "loss": 0.5743, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.2985781990521327, |
| "grad_norm": 20.622897768235585, |
| "learning_rate": 9.969897378564667e-07, |
| "loss": 0.5779, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.29976303317535546, |
| "grad_norm": 13.016776898697422, |
| "learning_rate": 9.968729672727493e-07, |
| "loss": 0.5739, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.3009478672985782, |
| "grad_norm": 17.91120105322002, |
| "learning_rate": 9.967539818656952e-07, |
| "loss": 0.5588, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.30213270142180093, |
| "grad_norm": 19.232325042748638, |
| "learning_rate": 9.966327821656841e-07, |
| "loss": 0.5633, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.3033175355450237, |
| "grad_norm": 23.47321995589726, |
| "learning_rate": 9.965093687129667e-07, |
| "loss": 0.5704, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.30450236966824645, |
| "grad_norm": 12.62217881537126, |
| "learning_rate": 9.963837420576618e-07, |
| "loss": 0.5795, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.3056872037914692, |
| "grad_norm": 16.57740235377143, |
| "learning_rate": 9.96255902759753e-07, |
| "loss": 0.5857, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.3068720379146919, |
| "grad_norm": 14.94128961102294, |
| "learning_rate": 9.961258513890873e-07, |
| "loss": 0.5666, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.3080568720379147, |
| "grad_norm": 11.83770792082476, |
| "learning_rate": 9.959935885253715e-07, |
| "loss": 0.5701, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.30924170616113744, |
| "grad_norm": 16.858957675590407, |
| "learning_rate": 9.958591147581707e-07, |
| "loss": 0.5513, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.3104265402843602, |
| "grad_norm": 8.613890836880318, |
| "learning_rate": 9.957224306869053e-07, |
| "loss": 0.5861, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.31161137440758296, |
| "grad_norm": 24.30412660533916, |
| "learning_rate": 9.955835369208473e-07, |
| "loss": 0.5253, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.3127962085308057, |
| "grad_norm": 13.509894066187833, |
| "learning_rate": 9.954424340791195e-07, |
| "loss": 0.5404, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.31398104265402843, |
| "grad_norm": 19.97859230272331, |
| "learning_rate": 9.952991227906909e-07, |
| "loss": 0.4929, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.31516587677725116, |
| "grad_norm": 14.659876223619262, |
| "learning_rate": 9.951536036943753e-07, |
| "loss": 0.5207, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.31635071090047395, |
| "grad_norm": 16.82648724963916, |
| "learning_rate": 9.950058774388277e-07, |
| "loss": 0.5042, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.3175355450236967, |
| "grad_norm": 16.48505871135186, |
| "learning_rate": 9.948559446825411e-07, |
| "loss": 0.5068, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.3187203791469194, |
| "grad_norm": 8.986957964532992, |
| "learning_rate": 9.94703806093845e-07, |
| "loss": 0.5482, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.31990521327014215, |
| "grad_norm": 15.557609628645618, |
| "learning_rate": 9.945494623509002e-07, |
| "loss": 0.5149, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.32109004739336494, |
| "grad_norm": 8.220937327819602, |
| "learning_rate": 9.943929141416977e-07, |
| "loss": 0.5604, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.3222748815165877, |
| "grad_norm": 8.409373218590941, |
| "learning_rate": 9.942341621640557e-07, |
| "loss": 0.5693, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.3234597156398104, |
| "grad_norm": 16.43820378461214, |
| "learning_rate": 9.940732071256144e-07, |
| "loss": 0.4995, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.3246445497630332, |
| "grad_norm": 13.252808150885926, |
| "learning_rate": 9.93910049743835e-07, |
| "loss": 0.5654, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.32582938388625593, |
| "grad_norm": 13.986032395862484, |
| "learning_rate": 9.937446907459953e-07, |
| "loss": 0.4991, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.32701421800947866, |
| "grad_norm": 17.89107100963545, |
| "learning_rate": 9.93577130869187e-07, |
| "loss": 0.5458, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.3281990521327014, |
| "grad_norm": 11.837076385727045, |
| "learning_rate": 9.934073708603129e-07, |
| "loss": 0.5505, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.3293838862559242, |
| "grad_norm": 24.996997176345527, |
| "learning_rate": 9.932354114760817e-07, |
| "loss": 0.4814, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.3305687203791469, |
| "grad_norm": 18.253469210284003, |
| "learning_rate": 9.930612534830068e-07, |
| "loss": 0.4836, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.33175355450236965, |
| "grad_norm": 18.194976720930658, |
| "learning_rate": 9.928848976574018e-07, |
| "loss": 0.5073, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.33293838862559244, |
| "grad_norm": 12.062798161873806, |
| "learning_rate": 9.92706344785377e-07, |
| "loss": 0.5107, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.3341232227488152, |
| "grad_norm": 8.94049601499799, |
| "learning_rate": 9.925255956628361e-07, |
| "loss": 0.518, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.3353080568720379, |
| "grad_norm": 10.188890972987107, |
| "learning_rate": 9.92342651095473e-07, |
| "loss": 0.5235, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.33649289099526064, |
| "grad_norm": 12.800443292579475, |
| "learning_rate": 9.921575118987671e-07, |
| "loss": 0.5658, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.33767772511848343, |
| "grad_norm": 31.455015869085003, |
| "learning_rate": 9.919701788979812e-07, |
| "loss": 0.5875, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.33886255924170616, |
| "grad_norm": 15.744017529825106, |
| "learning_rate": 9.917806529281566e-07, |
| "loss": 0.49, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.3400473933649289, |
| "grad_norm": 19.703316838489233, |
| "learning_rate": 9.915889348341096e-07, |
| "loss": 0.5247, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.3412322274881517, |
| "grad_norm": 11.492986218008904, |
| "learning_rate": 9.91395025470429e-07, |
| "loss": 0.5366, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.3424170616113744, |
| "grad_norm": 11.964355876969535, |
| "learning_rate": 9.911989257014699e-07, |
| "loss": 0.4738, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.34360189573459715, |
| "grad_norm": 10.275962280696506, |
| "learning_rate": 9.91000636401352e-07, |
| "loss": 0.5351, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3447867298578199, |
| "grad_norm": 10.213809707901728, |
| "learning_rate": 9.908001584539547e-07, |
| "loss": 0.5354, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.3459715639810427, |
| "grad_norm": 17.95448112723858, |
| "learning_rate": 9.905974927529133e-07, |
| "loss": 0.5051, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.3471563981042654, |
| "grad_norm": 26.40149829400645, |
| "learning_rate": 9.90392640201615e-07, |
| "loss": 0.4529, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.34834123222748814, |
| "grad_norm": 13.875172918400288, |
| "learning_rate": 9.901856017131954e-07, |
| "loss": 0.4558, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.3495260663507109, |
| "grad_norm": 11.52864219662223, |
| "learning_rate": 9.899763782105331e-07, |
| "loss": 0.4818, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.35071090047393366, |
| "grad_norm": 21.328486936498273, |
| "learning_rate": 9.897649706262473e-07, |
| "loss": 0.4784, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.3518957345971564, |
| "grad_norm": 17.693095134232415, |
| "learning_rate": 9.89551379902692e-07, |
| "loss": 0.4655, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.35308056872037913, |
| "grad_norm": 25.968174875217105, |
| "learning_rate": 9.893356069919537e-07, |
| "loss": 0.5219, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.3542654028436019, |
| "grad_norm": 19.931738567555474, |
| "learning_rate": 9.89117652855845e-07, |
| "loss": 0.5079, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.35545023696682465, |
| "grad_norm": 12.373031819577077, |
| "learning_rate": 9.888975184659016e-07, |
| "loss": 0.4765, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.35545023696682465, |
| "eval_loss": 0.4756671190261841, |
| "eval_runtime": 53.9082, |
| "eval_samples_per_second": 15.267, |
| "eval_steps_per_second": 0.965, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3566350710900474, |
| "grad_norm": 11.901393690452515, |
| "learning_rate": 9.886752048033784e-07, |
| "loss": 0.5016, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.3578199052132701, |
| "grad_norm": 13.650779933985442, |
| "learning_rate": 9.884507128592434e-07, |
| "loss": 0.4753, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.3590047393364929, |
| "grad_norm": 20.815524417557032, |
| "learning_rate": 9.882240436341753e-07, |
| "loss": 0.5277, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.36018957345971564, |
| "grad_norm": 12.74609352334382, |
| "learning_rate": 9.879951981385577e-07, |
| "loss": 0.4511, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.3613744075829384, |
| "grad_norm": 11.556206853163904, |
| "learning_rate": 9.877641773924747e-07, |
| "loss": 0.4753, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.36255924170616116, |
| "grad_norm": 15.56091408911645, |
| "learning_rate": 9.87530982425707e-07, |
| "loss": 0.4923, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.3637440758293839, |
| "grad_norm": 26.248499034472612, |
| "learning_rate": 9.872956142777269e-07, |
| "loss": 0.4912, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.36492890995260663, |
| "grad_norm": 21.677314925110117, |
| "learning_rate": 9.870580739976935e-07, |
| "loss": 0.4263, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.36611374407582936, |
| "grad_norm": 16.062837045314517, |
| "learning_rate": 9.868183626444486e-07, |
| "loss": 0.423, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.36729857819905215, |
| "grad_norm": 22.884567704263773, |
| "learning_rate": 9.865764812865111e-07, |
| "loss": 0.486, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3684834123222749, |
| "grad_norm": 7.861313978764723, |
| "learning_rate": 9.863324310020733e-07, |
| "loss": 0.4869, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.3696682464454976, |
| "grad_norm": 17.102991536445742, |
| "learning_rate": 9.860862128789952e-07, |
| "loss": 0.4391, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.3708530805687204, |
| "grad_norm": 11.567087012823887, |
| "learning_rate": 9.858378280148002e-07, |
| "loss": 0.4084, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.37203791469194314, |
| "grad_norm": 11.587792701499762, |
| "learning_rate": 9.855872775166694e-07, |
| "loss": 0.4433, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.3732227488151659, |
| "grad_norm": 23.622410585506035, |
| "learning_rate": 9.853345625014383e-07, |
| "loss": 0.4305, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.3744075829383886, |
| "grad_norm": 13.658748759519776, |
| "learning_rate": 9.850796840955899e-07, |
| "loss": 0.3982, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.3755924170616114, |
| "grad_norm": 10.548121395453487, |
| "learning_rate": 9.848226434352512e-07, |
| "loss": 0.441, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.3767772511848341, |
| "grad_norm": 8.899859469015052, |
| "learning_rate": 9.845634416661867e-07, |
| "loss": 0.4485, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.37796208530805686, |
| "grad_norm": 13.742583201602377, |
| "learning_rate": 9.843020799437949e-07, |
| "loss": 0.3761, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.3791469194312796, |
| "grad_norm": 13.610551297016709, |
| "learning_rate": 9.840385594331022e-07, |
| "loss": 0.4778, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.3803317535545024, |
| "grad_norm": 9.005064796206403, |
| "learning_rate": 9.837728813087573e-07, |
| "loss": 0.4638, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.3815165876777251, |
| "grad_norm": 14.791810107030555, |
| "learning_rate": 9.835050467550272e-07, |
| "loss": 0.4583, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.38270142180094785, |
| "grad_norm": 10.65855589614688, |
| "learning_rate": 9.832350569657909e-07, |
| "loss": 0.4448, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.38388625592417064, |
| "grad_norm": 9.546803884411506, |
| "learning_rate": 9.82962913144534e-07, |
| "loss": 0.4436, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.38507109004739337, |
| "grad_norm": 10.287431160752277, |
| "learning_rate": 9.82688616504345e-07, |
| "loss": 0.4474, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.3862559241706161, |
| "grad_norm": 17.228603164183177, |
| "learning_rate": 9.824121682679072e-07, |
| "loss": 0.3995, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.38744075829383884, |
| "grad_norm": 13.097102197908033, |
| "learning_rate": 9.821335696674956e-07, |
| "loss": 0.4524, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.3886255924170616, |
| "grad_norm": 8.529650091081256, |
| "learning_rate": 9.818528219449704e-07, |
| "loss": 0.4443, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.38981042654028436, |
| "grad_norm": 8.07159063167578, |
| "learning_rate": 9.81569926351771e-07, |
| "loss": 0.4785, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.3909952606635071, |
| "grad_norm": 12.444263995999188, |
| "learning_rate": 9.812848841489118e-07, |
| "loss": 0.3992, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.3921800947867299, |
| "grad_norm": 13.809272050407982, |
| "learning_rate": 9.80997696606975e-07, |
| "loss": 0.4228, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.3933649289099526, |
| "grad_norm": 12.96433701549732, |
| "learning_rate": 9.807083650061062e-07, |
| "loss": 0.4429, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.39454976303317535, |
| "grad_norm": 25.039415619632724, |
| "learning_rate": 9.80416890636008e-07, |
| "loss": 0.4477, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.3957345971563981, |
| "grad_norm": 11.801141489512451, |
| "learning_rate": 9.801232747959347e-07, |
| "loss": 0.4057, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.39691943127962087, |
| "grad_norm": 10.40250553997614, |
| "learning_rate": 9.798275187946859e-07, |
| "loss": 0.4461, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3981042654028436, |
| "grad_norm": 15.30568042611723, |
| "learning_rate": 9.79529623950601e-07, |
| "loss": 0.3718, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.39928909952606634, |
| "grad_norm": 14.314286641368666, |
| "learning_rate": 9.792295915915538e-07, |
| "loss": 0.4786, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.4004739336492891, |
| "grad_norm": 25.85018234396281, |
| "learning_rate": 9.789274230549457e-07, |
| "loss": 0.3853, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.40165876777251186, |
| "grad_norm": 11.907701908243332, |
| "learning_rate": 9.786231196877003e-07, |
| "loss": 0.4767, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.4028436018957346, |
| "grad_norm": 15.77066249877433, |
| "learning_rate": 9.783166828462572e-07, |
| "loss": 0.416, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4040284360189573, |
| "grad_norm": 12.04950477801811, |
| "learning_rate": 9.780081138965663e-07, |
| "loss": 0.3606, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.4052132701421801, |
| "grad_norm": 15.477847258339152, |
| "learning_rate": 9.77697414214081e-07, |
| "loss": 0.4484, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.40639810426540285, |
| "grad_norm": 14.577869548106914, |
| "learning_rate": 9.773845851837526e-07, |
| "loss": 0.3849, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.4075829383886256, |
| "grad_norm": 12.396572387524897, |
| "learning_rate": 9.770696282000244e-07, |
| "loss": 0.4499, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.4087677725118483, |
| "grad_norm": 26.07143660678688, |
| "learning_rate": 9.767525446668245e-07, |
| "loss": 0.4213, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.4099526066350711, |
| "grad_norm": 11.257790868634928, |
| "learning_rate": 9.764333359975609e-07, |
| "loss": 0.3746, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.41113744075829384, |
| "grad_norm": 12.642981496078672, |
| "learning_rate": 9.761120036151135e-07, |
| "loss": 0.3848, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.41232227488151657, |
| "grad_norm": 10.063976006804062, |
| "learning_rate": 9.757885489518296e-07, |
| "loss": 0.4124, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.41350710900473936, |
| "grad_norm": 17.188706479555265, |
| "learning_rate": 9.754629734495162e-07, |
| "loss": 0.4039, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.4146919431279621, |
| "grad_norm": 9.497211967243476, |
| "learning_rate": 9.751352785594336e-07, |
| "loss": 0.382, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4146919431279621, |
| "eval_loss": 0.39488136768341064, |
| "eval_runtime": 52.1468, |
| "eval_samples_per_second": 15.782, |
| "eval_steps_per_second": 0.997, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4158767772511848, |
| "grad_norm": 12.756224679112309, |
| "learning_rate": 9.748054657422901e-07, |
| "loss": 0.4015, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.41706161137440756, |
| "grad_norm": 9.07206117193569, |
| "learning_rate": 9.744735364682344e-07, |
| "loss": 0.3581, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.41824644549763035, |
| "grad_norm": 23.106705695654462, |
| "learning_rate": 9.741394922168494e-07, |
| "loss": 0.3788, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.4194312796208531, |
| "grad_norm": 10.974238261984121, |
| "learning_rate": 9.73803334477145e-07, |
| "loss": 0.4242, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.4206161137440758, |
| "grad_norm": 12.994216146112727, |
| "learning_rate": 9.73465064747553e-07, |
| "loss": 0.403, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.4218009478672986, |
| "grad_norm": 18.925007359784697, |
| "learning_rate": 9.731246845359184e-07, |
| "loss": 0.3239, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.42298578199052134, |
| "grad_norm": 8.90716567987188, |
| "learning_rate": 9.727821953594949e-07, |
| "loss": 0.4393, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.42417061611374407, |
| "grad_norm": 9.875417282979411, |
| "learning_rate": 9.724375987449358e-07, |
| "loss": 0.431, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.4253554502369668, |
| "grad_norm": 9.978683204562415, |
| "learning_rate": 9.720908962282891e-07, |
| "loss": 0.3797, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.4265402843601896, |
| "grad_norm": 12.328565183998132, |
| "learning_rate": 9.7174208935499e-07, |
| "loss": 0.3872, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.4277251184834123, |
| "grad_norm": 16.8038180357482, |
| "learning_rate": 9.713911796798532e-07, |
| "loss": 0.326, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.42890995260663506, |
| "grad_norm": 13.56237675029536, |
| "learning_rate": 9.710381687670674e-07, |
| "loss": 0.3193, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.43009478672985785, |
| "grad_norm": 9.779248850664269, |
| "learning_rate": 9.70683058190187e-07, |
| "loss": 0.4309, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.4312796208530806, |
| "grad_norm": 25.506229124536276, |
| "learning_rate": 9.703258495321265e-07, |
| "loss": 0.3538, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.4324644549763033, |
| "grad_norm": 20.45504872820827, |
| "learning_rate": 9.699665443851516e-07, |
| "loss": 0.3492, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.43364928909952605, |
| "grad_norm": 13.89447348939714, |
| "learning_rate": 9.696051443508743e-07, |
| "loss": 0.3398, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.43483412322274884, |
| "grad_norm": 8.113018348581734, |
| "learning_rate": 9.692416510402438e-07, |
| "loss": 0.3549, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.43601895734597157, |
| "grad_norm": 9.72301680291045, |
| "learning_rate": 9.688760660735402e-07, |
| "loss": 0.3721, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.4372037914691943, |
| "grad_norm": 33.8473636494632, |
| "learning_rate": 9.685083910803675e-07, |
| "loss": 0.3994, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.43838862559241704, |
| "grad_norm": 13.79742948772135, |
| "learning_rate": 9.681386276996462e-07, |
| "loss": 0.2895, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.4395734597156398, |
| "grad_norm": 20.772884497594852, |
| "learning_rate": 9.677667775796051e-07, |
| "loss": 0.3482, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.44075829383886256, |
| "grad_norm": 8.179600721905086, |
| "learning_rate": 9.673928423777756e-07, |
| "loss": 0.3725, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.4419431279620853, |
| "grad_norm": 13.178159783502442, |
| "learning_rate": 9.670168237609826e-07, |
| "loss": 0.2856, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.4431279620853081, |
| "grad_norm": 8.912426226468748, |
| "learning_rate": 9.666387234053385e-07, |
| "loss": 0.3303, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.4443127962085308, |
| "grad_norm": 21.400582293310897, |
| "learning_rate": 9.662585429962343e-07, |
| "loss": 0.2905, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.44549763033175355, |
| "grad_norm": 32.80855736267371, |
| "learning_rate": 9.658762842283341e-07, |
| "loss": 0.3911, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.4466824644549763, |
| "grad_norm": 22.395443998215274, |
| "learning_rate": 9.654919488055655e-07, |
| "loss": 0.4053, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.44786729857819907, |
| "grad_norm": 23.236385222445783, |
| "learning_rate": 9.651055384411128e-07, |
| "loss": 0.3227, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.4490521327014218, |
| "grad_norm": 18.495951608692828, |
| "learning_rate": 9.647170548574096e-07, |
| "loss": 0.3531, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.45023696682464454, |
| "grad_norm": 10.010304118535766, |
| "learning_rate": 9.643264997861312e-07, |
| "loss": 0.3041, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.4514218009478673, |
| "grad_norm": 16.208569360943383, |
| "learning_rate": 9.639338749681858e-07, |
| "loss": 0.3441, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.45260663507109006, |
| "grad_norm": 20.365615006060256, |
| "learning_rate": 9.635391821537087e-07, |
| "loss": 0.2808, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.4537914691943128, |
| "grad_norm": 10.662406472341873, |
| "learning_rate": 9.631424231020522e-07, |
| "loss": 0.3038, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.4549763033175355, |
| "grad_norm": 18.323625372296732, |
| "learning_rate": 9.627435995817797e-07, |
| "loss": 0.3919, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.4561611374407583, |
| "grad_norm": 44.08391965239445, |
| "learning_rate": 9.623427133706567e-07, |
| "loss": 0.393, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.45734597156398105, |
| "grad_norm": 27.040918313360542, |
| "learning_rate": 9.619397662556433e-07, |
| "loss": 0.3563, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.4585308056872038, |
| "grad_norm": 25.347089501942985, |
| "learning_rate": 9.61534760032886e-07, |
| "loss": 0.3483, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.4597156398104265, |
| "grad_norm": 17.035195231859507, |
| "learning_rate": 9.611276965077097e-07, |
| "loss": 0.3364, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.4609004739336493, |
| "grad_norm": 21.246750011503874, |
| "learning_rate": 9.607185774946104e-07, |
| "loss": 0.3101, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.46208530805687204, |
| "grad_norm": 17.210843425254826, |
| "learning_rate": 9.603074048172457e-07, |
| "loss": 0.3319, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.46327014218009477, |
| "grad_norm": 24.98240664073611, |
| "learning_rate": 9.59894180308428e-07, |
| "loss": 0.3649, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.46445497630331756, |
| "grad_norm": 24.468616634471285, |
| "learning_rate": 9.594789058101153e-07, |
| "loss": 0.3073, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.4656398104265403, |
| "grad_norm": 15.180353369606431, |
| "learning_rate": 9.59061583173404e-07, |
| "loss": 0.3272, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.466824644549763, |
| "grad_norm": 11.417227295893692, |
| "learning_rate": 9.5864221425852e-07, |
| "loss": 0.2914, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.46800947867298576, |
| "grad_norm": 28.69801888783048, |
| "learning_rate": 9.582208009348102e-07, |
| "loss": 0.3353, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.46919431279620855, |
| "grad_norm": 14.918763851473592, |
| "learning_rate": 9.577973450807351e-07, |
| "loss": 0.3012, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.4703791469194313, |
| "grad_norm": 9.293352558696206, |
| "learning_rate": 9.57371848583859e-07, |
| "loss": 0.3748, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.471563981042654, |
| "grad_norm": 15.565028469986906, |
| "learning_rate": 9.569443133408433e-07, |
| "loss": 0.2178, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.4727488151658768, |
| "grad_norm": 10.490221123541932, |
| "learning_rate": 9.565147412574365e-07, |
| "loss": 0.2922, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.47393364928909953, |
| "grad_norm": 11.285895674191094, |
| "learning_rate": 9.560831342484666e-07, |
| "loss": 0.3337, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.47393364928909953, |
| "eval_loss": 0.330232173204422, |
| "eval_runtime": 53.3543, |
| "eval_samples_per_second": 15.425, |
| "eval_steps_per_second": 0.975, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.47511848341232227, |
| "grad_norm": 28.038068999167958, |
| "learning_rate": 9.556494942378326e-07, |
| "loss": 0.3151, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.476303317535545, |
| "grad_norm": 11.577255213698676, |
| "learning_rate": 9.55213823158495e-07, |
| "loss": 0.3169, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.4774881516587678, |
| "grad_norm": 28.394190964261917, |
| "learning_rate": 9.547761229524686e-07, |
| "loss": 0.3333, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.4786729857819905, |
| "grad_norm": 15.348322403958202, |
| "learning_rate": 9.543363955708124e-07, |
| "loss": 0.2914, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.47985781990521326, |
| "grad_norm": 18.702903061519496, |
| "learning_rate": 9.538946429736222e-07, |
| "loss": 0.2772, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.48104265402843605, |
| "grad_norm": 16.755380600846234, |
| "learning_rate": 9.534508671300207e-07, |
| "loss": 0.2896, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.4822274881516588, |
| "grad_norm": 20.912613748172376, |
| "learning_rate": 9.530050700181498e-07, |
| "loss": 0.2833, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.4834123222748815, |
| "grad_norm": 20.828932167424576, |
| "learning_rate": 9.525572536251605e-07, |
| "loss": 0.3388, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.48459715639810425, |
| "grad_norm": 10.1105886949867, |
| "learning_rate": 9.521074199472058e-07, |
| "loss": 0.304, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.48578199052132703, |
| "grad_norm": 13.465306721197914, |
| "learning_rate": 9.516555709894298e-07, |
| "loss": 0.3369, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.48696682464454977, |
| "grad_norm": 13.601780436154742, |
| "learning_rate": 9.512017087659607e-07, |
| "loss": 0.2554, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.4881516587677725, |
| "grad_norm": 33.58961567504914, |
| "learning_rate": 9.507458352999001e-07, |
| "loss": 0.2462, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.48933649289099523, |
| "grad_norm": 24.341590716742427, |
| "learning_rate": 9.50287952623315e-07, |
| "loss": 0.3197, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.490521327014218, |
| "grad_norm": 30.667743812966282, |
| "learning_rate": 9.498280627772286e-07, |
| "loss": 0.3444, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.49170616113744076, |
| "grad_norm": 11.036970679678214, |
| "learning_rate": 9.493661678116111e-07, |
| "loss": 0.2676, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.4928909952606635, |
| "grad_norm": 13.416233053816551, |
| "learning_rate": 9.489022697853708e-07, |
| "loss": 0.2569, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.4940758293838863, |
| "grad_norm": 18.498918047484647, |
| "learning_rate": 9.484363707663441e-07, |
| "loss": 0.2786, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.495260663507109, |
| "grad_norm": 13.091240724568658, |
| "learning_rate": 9.479684728312873e-07, |
| "loss": 0.3145, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.49644549763033174, |
| "grad_norm": 15.68575581642405, |
| "learning_rate": 9.474985780658669e-07, |
| "loss": 0.3368, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.4976303317535545, |
| "grad_norm": 10.607199617999157, |
| "learning_rate": 9.470266885646503e-07, |
| "loss": 0.3069, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.49881516587677727, |
| "grad_norm": 11.646651108047797, |
| "learning_rate": 9.465528064310962e-07, |
| "loss": 0.2546, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 33.53142097878634, |
| "learning_rate": 9.46076933777546e-07, |
| "loss": 0.2844, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.5011848341232228, |
| "grad_norm": 49.38351100638877, |
| "learning_rate": 9.455990727252134e-07, |
| "loss": 0.3986, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.5023696682464455, |
| "grad_norm": 76.11068369059677, |
| "learning_rate": 9.451192254041758e-07, |
| "loss": 0.4806, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.5035545023696683, |
| "grad_norm": 20.921249429852224, |
| "learning_rate": 9.446373939533642e-07, |
| "loss": 0.2513, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.504739336492891, |
| "grad_norm": 10.556489076929935, |
| "learning_rate": 9.44153580520554e-07, |
| "loss": 0.2736, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.5059241706161137, |
| "grad_norm": 11.161679055753279, |
| "learning_rate": 9.436677872623556e-07, |
| "loss": 0.3311, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.5071090047393365, |
| "grad_norm": 29.411384894701257, |
| "learning_rate": 9.431800163442041e-07, |
| "loss": 0.2743, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.5082938388625592, |
| "grad_norm": 24.09078330704124, |
| "learning_rate": 9.426902699403501e-07, |
| "loss": 0.2984, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.509478672985782, |
| "grad_norm": 41.60195448463097, |
| "learning_rate": 9.421985502338503e-07, |
| "loss": 0.3429, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5106635071090048, |
| "grad_norm": 20.506598663359913, |
| "learning_rate": 9.417048594165571e-07, |
| "loss": 0.2812, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.5118483412322274, |
| "grad_norm": 9.64959188004726, |
| "learning_rate": 9.412091996891095e-07, |
| "loss": 0.3062, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.5130331753554502, |
| "grad_norm": 9.836481752429766, |
| "learning_rate": 9.407115732609227e-07, |
| "loss": 0.2302, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.514218009478673, |
| "grad_norm": 17.30655443858829, |
| "learning_rate": 9.402119823501785e-07, |
| "loss": 0.2861, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.5154028436018957, |
| "grad_norm": 9.843572391305129, |
| "learning_rate": 9.397104291838157e-07, |
| "loss": 0.2392, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.5165876777251185, |
| "grad_norm": 29.15515296698138, |
| "learning_rate": 9.392069159975198e-07, |
| "loss": 0.2655, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.5177725118483413, |
| "grad_norm": 12.813628762652185, |
| "learning_rate": 9.387014450357127e-07, |
| "loss": 0.2816, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.518957345971564, |
| "grad_norm": 17.614534795911805, |
| "learning_rate": 9.381940185515439e-07, |
| "loss": 0.3352, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.5201421800947867, |
| "grad_norm": 11.6468061201244, |
| "learning_rate": 9.376846388068791e-07, |
| "loss": 0.2779, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.5213270142180095, |
| "grad_norm": 8.711937405694224, |
| "learning_rate": 9.37173308072291e-07, |
| "loss": 0.2976, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5225118483412322, |
| "grad_norm": 12.218827336023685, |
| "learning_rate": 9.366600286270488e-07, |
| "loss": 0.2385, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.523696682464455, |
| "grad_norm": 8.929215350730324, |
| "learning_rate": 9.361448027591079e-07, |
| "loss": 0.2481, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.5248815165876777, |
| "grad_norm": 19.502457511436713, |
| "learning_rate": 9.356276327651005e-07, |
| "loss": 0.3382, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.5260663507109005, |
| "grad_norm": 9.4443581912619, |
| "learning_rate": 9.35108520950324e-07, |
| "loss": 0.2613, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.5272511848341233, |
| "grad_norm": 14.850874350089374, |
| "learning_rate": 9.345874696287323e-07, |
| "loss": 0.2541, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.5284360189573459, |
| "grad_norm": 9.118502911961933, |
| "learning_rate": 9.340644811229242e-07, |
| "loss": 0.276, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.5296208530805687, |
| "grad_norm": 23.523085623060844, |
| "learning_rate": 9.335395577641336e-07, |
| "loss": 0.2921, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.5308056872037915, |
| "grad_norm": 18.972413118807122, |
| "learning_rate": 9.330127018922193e-07, |
| "loss": 0.3997, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.5319905213270142, |
| "grad_norm": 17.752099885602984, |
| "learning_rate": 9.324839158556541e-07, |
| "loss": 0.2269, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.533175355450237, |
| "grad_norm": 16.130571939179283, |
| "learning_rate": 9.319532020115146e-07, |
| "loss": 0.2667, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.533175355450237, |
| "eval_loss": 0.2882239818572998, |
| "eval_runtime": 57.9598, |
| "eval_samples_per_second": 14.199, |
| "eval_steps_per_second": 0.897, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5343601895734598, |
| "grad_norm": 10.798615084456, |
| "learning_rate": 9.314205627254705e-07, |
| "loss": 0.3282, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.5355450236966824, |
| "grad_norm": 9.774035887071843, |
| "learning_rate": 9.308860003717748e-07, |
| "loss": 0.281, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.5367298578199052, |
| "grad_norm": 16.038788995378557, |
| "learning_rate": 9.303495173332518e-07, |
| "loss": 0.2562, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.5379146919431279, |
| "grad_norm": 11.384133516677016, |
| "learning_rate": 9.298111160012879e-07, |
| "loss": 0.3327, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.5390995260663507, |
| "grad_norm": 8.79727604270708, |
| "learning_rate": 9.2927079877582e-07, |
| "loss": 0.2762, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.5402843601895735, |
| "grad_norm": 18.163749435039747, |
| "learning_rate": 9.287285680653254e-07, |
| "loss": 0.3339, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.5414691943127962, |
| "grad_norm": 14.60130218396388, |
| "learning_rate": 9.281844262868107e-07, |
| "loss": 0.2554, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.542654028436019, |
| "grad_norm": 13.71276669752716, |
| "learning_rate": 9.27638375865801e-07, |
| "loss": 0.2719, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.5438388625592417, |
| "grad_norm": 9.152330320021967, |
| "learning_rate": 9.270904192363293e-07, |
| "loss": 0.271, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.5450236966824644, |
| "grad_norm": 8.622874280639174, |
| "learning_rate": 9.265405588409256e-07, |
| "loss": 0.2657, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.5462085308056872, |
| "grad_norm": 26.89194137073647, |
| "learning_rate": 9.259887971306063e-07, |
| "loss": 0.31, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.54739336492891, |
| "grad_norm": 20.12125062032546, |
| "learning_rate": 9.254351365648623e-07, |
| "loss": 0.2562, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.5485781990521327, |
| "grad_norm": 12.317200130554701, |
| "learning_rate": 9.248795796116491e-07, |
| "loss": 0.2975, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.5497630331753555, |
| "grad_norm": 11.025734474055628, |
| "learning_rate": 9.243221287473755e-07, |
| "loss": 0.2336, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.5509478672985783, |
| "grad_norm": 27.10300511896542, |
| "learning_rate": 9.23762786456892e-07, |
| "loss": 0.3397, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.5521327014218009, |
| "grad_norm": 8.033107110139403, |
| "learning_rate": 9.232015552334806e-07, |
| "loss": 0.2816, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.5533175355450237, |
| "grad_norm": 8.737091390744437, |
| "learning_rate": 9.226384375788434e-07, |
| "loss": 0.2823, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.5545023696682464, |
| "grad_norm": 15.494483524008073, |
| "learning_rate": 9.220734360030906e-07, |
| "loss": 0.2121, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.5556872037914692, |
| "grad_norm": 18.735896230365537, |
| "learning_rate": 9.215065530247308e-07, |
| "loss": 0.2533, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.556872037914692, |
| "grad_norm": 23.81669178709177, |
| "learning_rate": 9.209377911706584e-07, |
| "loss": 0.2746, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5580568720379147, |
| "grad_norm": 21.159454245219628, |
| "learning_rate": 9.203671529761434e-07, |
| "loss": 0.2383, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.5592417061611374, |
| "grad_norm": 9.301848540598128, |
| "learning_rate": 9.197946409848194e-07, |
| "loss": 0.2104, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.5604265402843602, |
| "grad_norm": 23.482486410314653, |
| "learning_rate": 9.192202577486724e-07, |
| "loss": 0.2782, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.5616113744075829, |
| "grad_norm": 17.27462356394257, |
| "learning_rate": 9.186440058280298e-07, |
| "loss": 0.2889, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.5627962085308057, |
| "grad_norm": 15.545463980531496, |
| "learning_rate": 9.180658877915484e-07, |
| "loss": 0.2527, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.5639810426540285, |
| "grad_norm": 11.567535127758402, |
| "learning_rate": 9.174859062162037e-07, |
| "loss": 0.1832, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.5651658767772512, |
| "grad_norm": 9.388250647196948, |
| "learning_rate": 9.169040636872773e-07, |
| "loss": 0.2731, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.566350710900474, |
| "grad_norm": 23.509643846600934, |
| "learning_rate": 9.163203627983466e-07, |
| "loss": 0.3582, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.5675355450236966, |
| "grad_norm": 12.082207751549461, |
| "learning_rate": 9.157348061512726e-07, |
| "loss": 0.2983, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.5687203791469194, |
| "grad_norm": 16.662630945663924, |
| "learning_rate": 9.151473963561882e-07, |
| "loss": 0.2527, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5699052132701422, |
| "grad_norm": 40.485858820321404, |
| "learning_rate": 9.145581360314867e-07, |
| "loss": 0.2963, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.5710900473933649, |
| "grad_norm": 9.050228610199474, |
| "learning_rate": 9.139670278038107e-07, |
| "loss": 0.2482, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.5722748815165877, |
| "grad_norm": 9.662434463280741, |
| "learning_rate": 9.133740743080392e-07, |
| "loss": 0.2478, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.5734597156398105, |
| "grad_norm": 21.678208323314635, |
| "learning_rate": 9.127792781872768e-07, |
| "loss": 0.239, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.5746445497630331, |
| "grad_norm": 11.468515771661801, |
| "learning_rate": 9.12182642092842e-07, |
| "loss": 0.2798, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5758293838862559, |
| "grad_norm": 14.951189679567591, |
| "learning_rate": 9.115841686842543e-07, |
| "loss": 0.2098, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.5770142180094787, |
| "grad_norm": 19.92647185342026, |
| "learning_rate": 9.109838606292239e-07, |
| "loss": 0.2159, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.5781990521327014, |
| "grad_norm": 13.36729958783011, |
| "learning_rate": 9.103817206036382e-07, |
| "loss": 0.2629, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.5793838862559242, |
| "grad_norm": 9.313353223167388, |
| "learning_rate": 9.09777751291551e-07, |
| "loss": 0.2911, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.580568720379147, |
| "grad_norm": 9.262594651674652, |
| "learning_rate": 9.091719553851706e-07, |
| "loss": 0.2717, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5817535545023697, |
| "grad_norm": 29.4435625345033, |
| "learning_rate": 9.085643355848466e-07, |
| "loss": 0.2651, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.5829383886255924, |
| "grad_norm": 18.54225369010181, |
| "learning_rate": 9.079548945990592e-07, |
| "loss": 0.2638, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.5841232227488151, |
| "grad_norm": 19.484158097176785, |
| "learning_rate": 9.073436351444064e-07, |
| "loss": 0.2397, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.5853080568720379, |
| "grad_norm": 13.867282359289979, |
| "learning_rate": 9.067305599455919e-07, |
| "loss": 0.2688, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.5864928909952607, |
| "grad_norm": 22.877893811260783, |
| "learning_rate": 9.061156717354137e-07, |
| "loss": 0.298, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5876777251184834, |
| "grad_norm": 15.47565263048955, |
| "learning_rate": 9.054989732547506e-07, |
| "loss": 0.1763, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.5888625592417062, |
| "grad_norm": 22.335682115478164, |
| "learning_rate": 9.048804672525512e-07, |
| "loss": 0.3208, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.590047393364929, |
| "grad_norm": 28.746533629080943, |
| "learning_rate": 9.042601564858212e-07, |
| "loss": 0.2347, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.5912322274881516, |
| "grad_norm": 13.201402592912574, |
| "learning_rate": 9.036380437196108e-07, |
| "loss": 0.2278, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.5924170616113744, |
| "grad_norm": 9.51664794041542, |
| "learning_rate": 9.030141317270025e-07, |
| "loss": 0.2287, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5924170616113744, |
| "eval_loss": 0.2544219195842743, |
| "eval_runtime": 60.3512, |
| "eval_samples_per_second": 13.637, |
| "eval_steps_per_second": 0.862, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5936018957345972, |
| "grad_norm": 9.947570386307305, |
| "learning_rate": 9.023884232890997e-07, |
| "loss": 0.2918, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.5947867298578199, |
| "grad_norm": 14.937306114287768, |
| "learning_rate": 9.017609211950126e-07, |
| "loss": 0.2268, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.5959715639810427, |
| "grad_norm": 20.03266579033559, |
| "learning_rate": 9.011316282418473e-07, |
| "loss": 0.2355, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.5971563981042654, |
| "grad_norm": 10.688118849978588, |
| "learning_rate": 9.005005472346923e-07, |
| "loss": 0.2649, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.5983412322274881, |
| "grad_norm": 21.412875220411518, |
| "learning_rate": 8.998676809866066e-07, |
| "loss": 0.2503, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.5995260663507109, |
| "grad_norm": 11.743872196709809, |
| "learning_rate": 8.992330323186068e-07, |
| "loss": 0.2682, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.6007109004739336, |
| "grad_norm": 11.771441009882276, |
| "learning_rate": 8.985966040596549e-07, |
| "loss": 0.2756, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.6018957345971564, |
| "grad_norm": 17.976263194907734, |
| "learning_rate": 8.979583990466452e-07, |
| "loss": 0.202, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.6030805687203792, |
| "grad_norm": 25.90047189441292, |
| "learning_rate": 8.973184201243922e-07, |
| "loss": 0.2376, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.6042654028436019, |
| "grad_norm": 20.62266389659553, |
| "learning_rate": 8.966766701456176e-07, |
| "loss": 0.3115, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6054502369668247, |
| "grad_norm": 23.091302678759135, |
| "learning_rate": 8.960331519709372e-07, |
| "loss": 0.2091, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.6066350710900474, |
| "grad_norm": 16.47382278548211, |
| "learning_rate": 8.953878684688492e-07, |
| "loss": 0.2174, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.6078199052132701, |
| "grad_norm": 14.906355441019993, |
| "learning_rate": 8.947408225157205e-07, |
| "loss": 0.2446, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.6090047393364929, |
| "grad_norm": 8.216540291358418, |
| "learning_rate": 8.940920169957739e-07, |
| "loss": 0.2575, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.6101895734597157, |
| "grad_norm": 13.034643278399566, |
| "learning_rate": 8.934414548010762e-07, |
| "loss": 0.197, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.6113744075829384, |
| "grad_norm": 13.048988852473453, |
| "learning_rate": 8.92789138831524e-07, |
| "loss": 0.1853, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.6125592417061612, |
| "grad_norm": 17.38819922422907, |
| "learning_rate": 8.921350719948315e-07, |
| "loss": 0.2427, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.6137440758293838, |
| "grad_norm": 24.37628041900463, |
| "learning_rate": 8.914792572065177e-07, |
| "loss": 0.2592, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.6149289099526066, |
| "grad_norm": 20.993815161831648, |
| "learning_rate": 8.908216973898928e-07, |
| "loss": 0.2072, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.6161137440758294, |
| "grad_norm": 28.61411000218645, |
| "learning_rate": 8.901623954760459e-07, |
| "loss": 0.2746, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6172985781990521, |
| "grad_norm": 11.09636600039551, |
| "learning_rate": 8.89501354403831e-07, |
| "loss": 0.2656, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.6184834123222749, |
| "grad_norm": 16.998967485383094, |
| "learning_rate": 8.888385771198552e-07, |
| "loss": 0.2235, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.6196682464454977, |
| "grad_norm": 28.861443273678475, |
| "learning_rate": 8.88174066578464e-07, |
| "loss": 0.2978, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.6208530805687204, |
| "grad_norm": 14.714955209304444, |
| "learning_rate": 8.875078257417294e-07, |
| "loss": 0.2732, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.6220379146919431, |
| "grad_norm": 9.735548367963192, |
| "learning_rate": 8.868398575794362e-07, |
| "loss": 0.1986, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.6232227488151659, |
| "grad_norm": 17.493573015109547, |
| "learning_rate": 8.861701650690685e-07, |
| "loss": 0.2646, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.6244075829383886, |
| "grad_norm": 25.375815141472067, |
| "learning_rate": 8.854987511957973e-07, |
| "loss": 0.1653, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.6255924170616114, |
| "grad_norm": 35.646843725332126, |
| "learning_rate": 8.84825618952466e-07, |
| "loss": 0.3044, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.6267772511848341, |
| "grad_norm": 16.38434340194561, |
| "learning_rate": 8.841507713395782e-07, |
| "loss": 0.2138, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.6279620853080569, |
| "grad_norm": 9.828589104598672, |
| "learning_rate": 8.834742113652833e-07, |
| "loss": 0.2025, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6291469194312796, |
| "grad_norm": 14.966605589812715, |
| "learning_rate": 8.827959420453642e-07, |
| "loss": 0.2194, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.6303317535545023, |
| "grad_norm": 9.043281744519222, |
| "learning_rate": 8.821159664032223e-07, |
| "loss": 0.2106, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.6315165876777251, |
| "grad_norm": 25.156545576852338, |
| "learning_rate": 8.814342874698659e-07, |
| "loss": 0.296, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.6327014218009479, |
| "grad_norm": 30.207865323923034, |
| "learning_rate": 8.807509082838956e-07, |
| "loss": 0.3279, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.6338862559241706, |
| "grad_norm": 21.8597722447571, |
| "learning_rate": 8.800658318914905e-07, |
| "loss": 0.22, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.6350710900473934, |
| "grad_norm": 18.83124332148458, |
| "learning_rate": 8.793790613463954e-07, |
| "loss": 0.2945, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.6362559241706162, |
| "grad_norm": 11.621272923164774, |
| "learning_rate": 8.786905997099066e-07, |
| "loss": 0.1921, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.6374407582938388, |
| "grad_norm": 15.981620422788177, |
| "learning_rate": 8.780004500508587e-07, |
| "loss": 0.2368, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.6386255924170616, |
| "grad_norm": 30.452192715434165, |
| "learning_rate": 8.773086154456106e-07, |
| "loss": 0.2448, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.6398104265402843, |
| "grad_norm": 9.559309492322154, |
| "learning_rate": 8.766150989780317e-07, |
| "loss": 0.2027, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.6409952606635071, |
| "grad_norm": 27.607639874576503, |
| "learning_rate": 8.759199037394886e-07, |
| "loss": 0.2556, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.6421800947867299, |
| "grad_norm": 12.26608497304879, |
| "learning_rate": 8.752230328288313e-07, |
| "loss": 0.1891, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.6433649289099526, |
| "grad_norm": 33.570472082251705, |
| "learning_rate": 8.745244893523783e-07, |
| "loss": 0.3039, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.6445497630331753, |
| "grad_norm": 14.538932588131711, |
| "learning_rate": 8.738242764239046e-07, |
| "loss": 0.1903, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.6457345971563981, |
| "grad_norm": 8.885681558345862, |
| "learning_rate": 8.73122397164626e-07, |
| "loss": 0.25, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.6469194312796208, |
| "grad_norm": 9.724842455204085, |
| "learning_rate": 8.724188547031865e-07, |
| "loss": 0.2286, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.6481042654028436, |
| "grad_norm": 11.641123423029827, |
| "learning_rate": 8.717136521756439e-07, |
| "loss": 0.2298, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.6492890995260664, |
| "grad_norm": 35.64156668351365, |
| "learning_rate": 8.710067927254554e-07, |
| "loss": 0.2487, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.6504739336492891, |
| "grad_norm": 55.07441844223735, |
| "learning_rate": 8.702982795034644e-07, |
| "loss": 0.2974, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.6516587677725119, |
| "grad_norm": 30.67359922217043, |
| "learning_rate": 8.695881156678855e-07, |
| "loss": 0.2409, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6516587677725119, |
| "eval_loss": 0.26306506991386414, |
| "eval_runtime": 55.58, |
| "eval_samples_per_second": 14.807, |
| "eval_steps_per_second": 0.936, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6528436018957346, |
| "grad_norm": 34.111707302892505, |
| "learning_rate": 8.688763043842915e-07, |
| "loss": 0.3007, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.6540284360189573, |
| "grad_norm": 24.72916737680904, |
| "learning_rate": 8.681628488255986e-07, |
| "loss": 0.2861, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.6552132701421801, |
| "grad_norm": 15.195799871782196, |
| "learning_rate": 8.674477521720521e-07, |
| "loss": 0.2379, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.6563981042654028, |
| "grad_norm": 35.31766339327681, |
| "learning_rate": 8.667310176112129e-07, |
| "loss": 0.3087, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.6575829383886256, |
| "grad_norm": 29.669297495826285, |
| "learning_rate": 8.660126483379426e-07, |
| "loss": 0.2826, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.6587677725118484, |
| "grad_norm": 20.188142404979462, |
| "learning_rate": 8.652926475543898e-07, |
| "loss": 0.2353, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.659952606635071, |
| "grad_norm": 9.878030902564287, |
| "learning_rate": 8.645710184699754e-07, |
| "loss": 0.2091, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.6611374407582938, |
| "grad_norm": 8.326879683208155, |
| "learning_rate": 8.638477643013787e-07, |
| "loss": 0.2097, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.6623222748815166, |
| "grad_norm": 17.04100715275265, |
| "learning_rate": 8.631228882725227e-07, |
| "loss": 0.2862, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.6635071090047393, |
| "grad_norm": 21.154307793886165, |
| "learning_rate": 8.623963936145599e-07, |
| "loss": 0.3105, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.6646919431279621, |
| "grad_norm": 20.578330200177938, |
| "learning_rate": 8.61668283565858e-07, |
| "loss": 0.2362, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.6658767772511849, |
| "grad_norm": 23.48493045218934, |
| "learning_rate": 8.609385613719853e-07, |
| "loss": 0.2178, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.6670616113744076, |
| "grad_norm": 9.91650775543966, |
| "learning_rate": 8.60207230285696e-07, |
| "loss": 0.3152, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.6682464454976303, |
| "grad_norm": 14.993510418458422, |
| "learning_rate": 8.594742935669164e-07, |
| "loss": 0.2369, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.669431279620853, |
| "grad_norm": 36.79636092939016, |
| "learning_rate": 8.587397544827295e-07, |
| "loss": 0.2666, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.6706161137440758, |
| "grad_norm": 13.316556858524384, |
| "learning_rate": 8.580036163073614e-07, |
| "loss": 0.2829, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.6718009478672986, |
| "grad_norm": 14.159800939343656, |
| "learning_rate": 8.572658823221658e-07, |
| "loss": 0.198, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.6729857819905213, |
| "grad_norm": 9.745584395603467, |
| "learning_rate": 8.565265558156101e-07, |
| "loss": 0.2444, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.6741706161137441, |
| "grad_norm": 12.150823832817574, |
| "learning_rate": 8.5578564008326e-07, |
| "loss": 0.2192, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.6753554502369669, |
| "grad_norm": 24.362220333619582, |
| "learning_rate": 8.550431384277653e-07, |
| "loss": 0.2476, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6765402843601895, |
| "grad_norm": 8.952930844365163, |
| "learning_rate": 8.542990541588453e-07, |
| "loss": 0.2264, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.6777251184834123, |
| "grad_norm": 9.560301949012237, |
| "learning_rate": 8.535533905932737e-07, |
| "loss": 0.2736, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.6789099526066351, |
| "grad_norm": 9.139618148174796, |
| "learning_rate": 8.528061510548641e-07, |
| "loss": 0.22, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.6800947867298578, |
| "grad_norm": 18.201403006271022, |
| "learning_rate": 8.520573388744548e-07, |
| "loss": 0.2366, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.6812796208530806, |
| "grad_norm": 8.042669272668338, |
| "learning_rate": 8.513069573898943e-07, |
| "loss": 0.2055, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.6824644549763034, |
| "grad_norm": 11.84351957271319, |
| "learning_rate": 8.505550099460263e-07, |
| "loss": 0.2745, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.683649289099526, |
| "grad_norm": 16.427699085907832, |
| "learning_rate": 8.49801499894675e-07, |
| "loss": 0.187, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.6848341232227488, |
| "grad_norm": 18.357805574878622, |
| "learning_rate": 8.490464305946294e-07, |
| "loss": 0.2259, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.6860189573459715, |
| "grad_norm": 21.137167128440737, |
| "learning_rate": 8.482898054116299e-07, |
| "loss": 0.2139, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.6872037914691943, |
| "grad_norm": 17.717042156687203, |
| "learning_rate": 8.475316277183508e-07, |
| "loss": 0.2515, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.6883886255924171, |
| "grad_norm": 23.793425039907586, |
| "learning_rate": 8.467719008943886e-07, |
| "loss": 0.2664, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.6895734597156398, |
| "grad_norm": 11.919196893599274, |
| "learning_rate": 8.460106283262431e-07, |
| "loss": 0.1886, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.6907582938388626, |
| "grad_norm": 22.28808906798008, |
| "learning_rate": 8.452478134073062e-07, |
| "loss": 0.2738, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.6919431279620853, |
| "grad_norm": 23.537434314232886, |
| "learning_rate": 8.444834595378433e-07, |
| "loss": 0.2796, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.693127962085308, |
| "grad_norm": 8.778711331006974, |
| "learning_rate": 8.437175701249805e-07, |
| "loss": 0.2371, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.6943127962085308, |
| "grad_norm": 8.478947761255274, |
| "learning_rate": 8.429501485826889e-07, |
| "loss": 0.183, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.6954976303317536, |
| "grad_norm": 24.182074930968447, |
| "learning_rate": 8.421811983317681e-07, |
| "loss": 0.2003, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.6966824644549763, |
| "grad_norm": 9.861012071727558, |
| "learning_rate": 8.414107227998328e-07, |
| "loss": 0.2774, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.6978672985781991, |
| "grad_norm": 11.632857692746178, |
| "learning_rate": 8.406387254212965e-07, |
| "loss": 0.2403, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.6990521327014217, |
| "grad_norm": 20.917402317103377, |
| "learning_rate": 8.398652096373564e-07, |
| "loss": 0.2078, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7002369668246445, |
| "grad_norm": 11.067641557659394, |
| "learning_rate": 8.390901788959777e-07, |
| "loss": 0.2554, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.7014218009478673, |
| "grad_norm": 11.776999680097008, |
| "learning_rate": 8.383136366518787e-07, |
| "loss": 0.2309, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.70260663507109, |
| "grad_norm": 25.259853705212183, |
| "learning_rate": 8.375355863665154e-07, |
| "loss": 0.2359, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.7037914691943128, |
| "grad_norm": 19.40581177984766, |
| "learning_rate": 8.367560315080662e-07, |
| "loss": 0.1904, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.7049763033175356, |
| "grad_norm": 15.1929515678031, |
| "learning_rate": 8.359749755514154e-07, |
| "loss": 0.2692, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.7061611374407583, |
| "grad_norm": 15.147667421338586, |
| "learning_rate": 8.351924219781392e-07, |
| "loss": 0.254, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.707345971563981, |
| "grad_norm": 7.901829661318031, |
| "learning_rate": 8.344083742764891e-07, |
| "loss": 0.1868, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.7085308056872038, |
| "grad_norm": 14.381280613101293, |
| "learning_rate": 8.336228359413768e-07, |
| "loss": 0.1958, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.7097156398104265, |
| "grad_norm": 36.71771015436861, |
| "learning_rate": 8.328358104743585e-07, |
| "loss": 0.3001, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.7109004739336493, |
| "grad_norm": 33.7833690585172, |
| "learning_rate": 8.320473013836195e-07, |
| "loss": 0.259, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7109004739336493, |
| "eval_loss": 0.251609206199646, |
| "eval_runtime": 59.1343, |
| "eval_samples_per_second": 13.917, |
| "eval_steps_per_second": 0.879, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7120853080568721, |
| "grad_norm": 27.43991881068027, |
| "learning_rate": 8.312573121839581e-07, |
| "loss": 0.2528, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.7132701421800948, |
| "grad_norm": 11.54084932255205, |
| "learning_rate": 8.304658463967705e-07, |
| "loss": 0.2258, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.7144549763033176, |
| "grad_norm": 9.38181972163327, |
| "learning_rate": 8.296729075500343e-07, |
| "loss": 0.1483, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.7156398104265402, |
| "grad_norm": 9.63021263503591, |
| "learning_rate": 8.288784991782945e-07, |
| "loss": 0.2134, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.716824644549763, |
| "grad_norm": 25.241305745406983, |
| "learning_rate": 8.280826248226449e-07, |
| "loss": 0.2241, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.7180094786729858, |
| "grad_norm": 19.58953870780054, |
| "learning_rate": 8.272852880307153e-07, |
| "loss": 0.2973, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.7191943127962085, |
| "grad_norm": 20.908601756019916, |
| "learning_rate": 8.264864923566537e-07, |
| "loss": 0.3026, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.7203791469194313, |
| "grad_norm": 18.641795222590094, |
| "learning_rate": 8.256862413611112e-07, |
| "loss": 0.2092, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.7215639810426541, |
| "grad_norm": 9.078402132315118, |
| "learning_rate": 8.24884538611226e-07, |
| "loss": 0.2293, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.7227488151658767, |
| "grad_norm": 9.870990754662362, |
| "learning_rate": 8.240813876806078e-07, |
| "loss": 0.2364, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.7239336492890995, |
| "grad_norm": 32.61024025487414, |
| "learning_rate": 8.232767921493215e-07, |
| "loss": 0.2391, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.7251184834123223, |
| "grad_norm": 24.836388708454873, |
| "learning_rate": 8.22470755603871e-07, |
| "loss": 0.2667, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.726303317535545, |
| "grad_norm": 41.315609395728075, |
| "learning_rate": 8.216632816371838e-07, |
| "loss": 0.2302, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.7274881516587678, |
| "grad_norm": 17.579148648614137, |
| "learning_rate": 8.208543738485949e-07, |
| "loss": 0.2644, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.7286729857819905, |
| "grad_norm": 7.685937476575705, |
| "learning_rate": 8.200440358438305e-07, |
| "loss": 0.2128, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.7298578199052133, |
| "grad_norm": 18.987732881829164, |
| "learning_rate": 8.192322712349917e-07, |
| "loss": 0.1535, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.731042654028436, |
| "grad_norm": 25.078374459237278, |
| "learning_rate": 8.184190836405393e-07, |
| "loss": 0.2313, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.7322274881516587, |
| "grad_norm": 19.93693990442207, |
| "learning_rate": 8.176044766852765e-07, |
| "loss": 0.1826, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.7334123222748815, |
| "grad_norm": 9.174225763509883, |
| "learning_rate": 8.167884540003337e-07, |
| "loss": 0.2183, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.7345971563981043, |
| "grad_norm": 22.712219561256028, |
| "learning_rate": 8.159710192231519e-07, |
| "loss": 0.2233, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.735781990521327, |
| "grad_norm": 12.602248166695643, |
| "learning_rate": 8.151521759974666e-07, |
| "loss": 0.2168, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.7369668246445498, |
| "grad_norm": 11.648901495290106, |
| "learning_rate": 8.143319279732913e-07, |
| "loss": 0.3022, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.7381516587677726, |
| "grad_norm": 25.848339933826537, |
| "learning_rate": 8.135102788069015e-07, |
| "loss": 0.2619, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.7393364928909952, |
| "grad_norm": 26.446445165860816, |
| "learning_rate": 8.126872321608183e-07, |
| "loss": 0.1956, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.740521327014218, |
| "grad_norm": 11.078507030053842, |
| "learning_rate": 8.118627917037924e-07, |
| "loss": 0.2531, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.7417061611374408, |
| "grad_norm": 16.144215184908756, |
| "learning_rate": 8.110369611107868e-07, |
| "loss": 0.2048, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.7428909952606635, |
| "grad_norm": 9.763738781518256, |
| "learning_rate": 8.102097440629618e-07, |
| "loss": 0.2536, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.7440758293838863, |
| "grad_norm": 15.658750679197453, |
| "learning_rate": 8.093811442476572e-07, |
| "loss": 0.2143, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.745260663507109, |
| "grad_norm": 14.261166875712147, |
| "learning_rate": 8.085511653583772e-07, |
| "loss": 0.2228, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.7464454976303317, |
| "grad_norm": 18.41294125408353, |
| "learning_rate": 8.077198110947725e-07, |
| "loss": 0.2047, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.7476303317535545, |
| "grad_norm": 7.188949902488354, |
| "learning_rate": 8.068870851626253e-07, |
| "loss": 0.2506, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.7488151658767772, |
| "grad_norm": 21.1672326000127, |
| "learning_rate": 8.060529912738314e-07, |
| "loss": 0.2493, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 11.771338026161125, |
| "learning_rate": 8.052175331463848e-07, |
| "loss": 0.1622, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.7511848341232228, |
| "grad_norm": 21.486215050391507, |
| "learning_rate": 8.043807145043603e-07, |
| "loss": 0.2102, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.7523696682464455, |
| "grad_norm": 22.130536750069474, |
| "learning_rate": 8.035425390778973e-07, |
| "loss": 0.2048, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.7535545023696683, |
| "grad_norm": 15.296795850248081, |
| "learning_rate": 8.027030106031835e-07, |
| "loss": 0.199, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.754739336492891, |
| "grad_norm": 12.519470076894477, |
| "learning_rate": 8.018621328224371e-07, |
| "loss": 0.2273, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.7559241706161137, |
| "grad_norm": 22.060617497410405, |
| "learning_rate": 8.010199094838914e-07, |
| "loss": 0.194, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.7571090047393365, |
| "grad_norm": 9.606082768273398, |
| "learning_rate": 8.001763443417775e-07, |
| "loss": 0.1982, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.7582938388625592, |
| "grad_norm": 14.494565608281063, |
| "learning_rate": 7.993314411563075e-07, |
| "loss": 0.162, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.759478672985782, |
| "grad_norm": 16.589300163864532, |
| "learning_rate": 7.984852036936578e-07, |
| "loss": 0.2806, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.7606635071090048, |
| "grad_norm": 11.576925049645048, |
| "learning_rate": 7.976376357259526e-07, |
| "loss": 0.1665, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.7618483412322274, |
| "grad_norm": 14.58868190468772, |
| "learning_rate": 7.967887410312466e-07, |
| "loss": 0.1948, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.7630331753554502, |
| "grad_norm": 21.320856814477253, |
| "learning_rate": 7.959385233935085e-07, |
| "loss": 0.2362, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.764218009478673, |
| "grad_norm": 15.04617262190784, |
| "learning_rate": 7.950869866026045e-07, |
| "loss": 0.2164, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.7654028436018957, |
| "grad_norm": 9.80317001840058, |
| "learning_rate": 7.9423413445428e-07, |
| "loss": 0.1788, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.7665876777251185, |
| "grad_norm": 19.844028210298966, |
| "learning_rate": 7.933799707501447e-07, |
| "loss": 0.2209, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.7677725118483413, |
| "grad_norm": 13.751437660235108, |
| "learning_rate": 7.925244992976537e-07, |
| "loss": 0.1899, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.768957345971564, |
| "grad_norm": 9.41285729970307, |
| "learning_rate": 7.916677239100922e-07, |
| "loss": 0.2487, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.7701421800947867, |
| "grad_norm": 18.895758189790847, |
| "learning_rate": 7.908096484065568e-07, |
| "loss": 0.2111, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7701421800947867, |
| "eval_loss": 0.2293587177991867, |
| "eval_runtime": 57.8447, |
| "eval_samples_per_second": 14.228, |
| "eval_steps_per_second": 0.899, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7713270142180095, |
| "grad_norm": 13.862666177134349, |
| "learning_rate": 7.899502766119403e-07, |
| "loss": 0.2696, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.7725118483412322, |
| "grad_norm": 16.120955468412227, |
| "learning_rate": 7.890896123569135e-07, |
| "loss": 0.214, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.773696682464455, |
| "grad_norm": 9.011365770979943, |
| "learning_rate": 7.882276594779079e-07, |
| "loss": 0.1631, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.7748815165876777, |
| "grad_norm": 9.222611641594378, |
| "learning_rate": 7.873644218170996e-07, |
| "loss": 0.2073, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.7760663507109005, |
| "grad_norm": 27.175342390723387, |
| "learning_rate": 7.864999032223914e-07, |
| "loss": 0.1848, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.7772511848341233, |
| "grad_norm": 30.41172358480833, |
| "learning_rate": 7.856341075473961e-07, |
| "loss": 0.2405, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.7784360189573459, |
| "grad_norm": 9.377963440893206, |
| "learning_rate": 7.847670386514189e-07, |
| "loss": 0.2181, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.7796208530805687, |
| "grad_norm": 9.330139452266417, |
| "learning_rate": 7.838987003994404e-07, |
| "loss": 0.2582, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.7808056872037915, |
| "grad_norm": 21.896919162817507, |
| "learning_rate": 7.830290966620995e-07, |
| "loss": 0.1798, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.7819905213270142, |
| "grad_norm": 9.358252051552995, |
| "learning_rate": 7.821582313156763e-07, |
| "loss": 0.2502, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.783175355450237, |
| "grad_norm": 10.845451720550335, |
| "learning_rate": 7.812861082420739e-07, |
| "loss": 0.2301, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.7843601895734598, |
| "grad_norm": 9.9684188287499, |
| "learning_rate": 7.804127313288023e-07, |
| "loss": 0.2687, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.7855450236966824, |
| "grad_norm": 25.687262257581693, |
| "learning_rate": 7.795381044689602e-07, |
| "loss": 0.1958, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.7867298578199052, |
| "grad_norm": 13.55499223559753, |
| "learning_rate": 7.786622315612181e-07, |
| "loss": 0.2485, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.7879146919431279, |
| "grad_norm": 12.218617521005212, |
| "learning_rate": 7.777851165098011e-07, |
| "loss": 0.1479, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.7890995260663507, |
| "grad_norm": 7.721800269638889, |
| "learning_rate": 7.769067632244706e-07, |
| "loss": 0.1626, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.7902843601895735, |
| "grad_norm": 24.314109763668625, |
| "learning_rate": 7.760271756205077e-07, |
| "loss": 0.2109, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.7914691943127962, |
| "grad_norm": 7.701242021010707, |
| "learning_rate": 7.751463576186957e-07, |
| "loss": 0.1472, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.792654028436019, |
| "grad_norm": 13.37997398622404, |
| "learning_rate": 7.742643131453021e-07, |
| "loss": 0.2217, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.7938388625592417, |
| "grad_norm": 8.745184371153652, |
| "learning_rate": 7.733810461320618e-07, |
| "loss": 0.1434, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7950236966824644, |
| "grad_norm": 11.16885275222009, |
| "learning_rate": 7.724965605161588e-07, |
| "loss": 0.2009, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.7962085308056872, |
| "grad_norm": 9.806142647457065, |
| "learning_rate": 7.716108602402094e-07, |
| "loss": 0.18, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.79739336492891, |
| "grad_norm": 16.335655824559705, |
| "learning_rate": 7.707239492522439e-07, |
| "loss": 0.2102, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.7985781990521327, |
| "grad_norm": 17.375625902322422, |
| "learning_rate": 7.6983583150569e-07, |
| "loss": 0.2452, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.7997630331753555, |
| "grad_norm": 13.425423648864005, |
| "learning_rate": 7.689465109593539e-07, |
| "loss": 0.222, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.8009478672985783, |
| "grad_norm": 16.905635944612605, |
| "learning_rate": 7.680559915774033e-07, |
| "loss": 0.2264, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.8021327014218009, |
| "grad_norm": 16.781958411958737, |
| "learning_rate": 7.671642773293505e-07, |
| "loss": 0.2078, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.8033175355450237, |
| "grad_norm": 17.58937112986415, |
| "learning_rate": 7.662713721900331e-07, |
| "loss": 0.2382, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.8045023696682464, |
| "grad_norm": 17.576419370888175, |
| "learning_rate": 7.653772801395977e-07, |
| "loss": 0.219, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.8056872037914692, |
| "grad_norm": 42.78433100503283, |
| "learning_rate": 7.644820051634812e-07, |
| "loss": 0.3042, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.806872037914692, |
| "grad_norm": 12.172358570342093, |
| "learning_rate": 7.635855512523937e-07, |
| "loss": 0.1905, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.8080568720379147, |
| "grad_norm": 7.694538953431653, |
| "learning_rate": 7.626879224023001e-07, |
| "loss": 0.209, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.8092417061611374, |
| "grad_norm": 12.703516850184618, |
| "learning_rate": 7.617891226144033e-07, |
| "loss": 0.1979, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.8104265402843602, |
| "grad_norm": 9.11601457295805, |
| "learning_rate": 7.608891558951248e-07, |
| "loss": 0.1748, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.8116113744075829, |
| "grad_norm": 9.014204421884562, |
| "learning_rate": 7.599880262560882e-07, |
| "loss": 0.2629, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.8127962085308057, |
| "grad_norm": 14.044829378680554, |
| "learning_rate": 7.590857377141009e-07, |
| "loss": 0.3174, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.8139810426540285, |
| "grad_norm": 14.2323739084943, |
| "learning_rate": 7.58182294291136e-07, |
| "loss": 0.214, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.8151658767772512, |
| "grad_norm": 23.66550864483779, |
| "learning_rate": 7.572777000143145e-07, |
| "loss": 0.2742, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.816350710900474, |
| "grad_norm": 16.58222384020301, |
| "learning_rate": 7.563719589158872e-07, |
| "loss": 0.2081, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.8175355450236966, |
| "grad_norm": 14.390980642614025, |
| "learning_rate": 7.554650750332174e-07, |
| "loss": 0.1946, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.8187203791469194, |
| "grad_norm": 10.122145213378829, |
| "learning_rate": 7.545570524087619e-07, |
| "loss": 0.1758, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.8199052132701422, |
| "grad_norm": 11.308324582282285, |
| "learning_rate": 7.536478950900536e-07, |
| "loss": 0.1705, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.8210900473933649, |
| "grad_norm": 23.02921863609514, |
| "learning_rate": 7.527376071296836e-07, |
| "loss": 0.1841, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.8222748815165877, |
| "grad_norm": 14.409456371586236, |
| "learning_rate": 7.518261925852823e-07, |
| "loss": 0.2406, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.8234597156398105, |
| "grad_norm": 32.09834915802655, |
| "learning_rate": 7.509136555195023e-07, |
| "loss": 0.2367, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.8246445497630331, |
| "grad_norm": 12.356727122169731, |
| "learning_rate": 7.5e-07, |
| "loss": 0.2417, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.8258293838862559, |
| "grad_norm": 13.126563741604407, |
| "learning_rate": 7.490852300994168e-07, |
| "loss": 0.2063, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.8270142180094787, |
| "grad_norm": 8.032225849806373, |
| "learning_rate": 7.48169349895362e-07, |
| "loss": 0.1786, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.8281990521327014, |
| "grad_norm": 20.962662296925956, |
| "learning_rate": 7.472523634703936e-07, |
| "loss": 0.1698, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.8293838862559242, |
| "grad_norm": 12.75110057617325, |
| "learning_rate": 7.463342749120013e-07, |
| "loss": 0.1812, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8293838862559242, |
| "eval_loss": 0.23034565150737762, |
| "eval_runtime": 55.8882, |
| "eval_samples_per_second": 14.726, |
| "eval_steps_per_second": 0.93, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.830568720379147, |
| "grad_norm": 9.913055744102358, |
| "learning_rate": 7.454150883125868e-07, |
| "loss": 0.2133, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.8317535545023697, |
| "grad_norm": 25.833382992108902, |
| "learning_rate": 7.44494807769447e-07, |
| "loss": 0.2969, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.8329383886255924, |
| "grad_norm": 11.281775033559253, |
| "learning_rate": 7.435734373847545e-07, |
| "loss": 0.1416, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.8341232227488151, |
| "grad_norm": 19.594414240296107, |
| "learning_rate": 7.426509812655405e-07, |
| "loss": 0.2314, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.8353080568720379, |
| "grad_norm": 17.70712288867108, |
| "learning_rate": 7.417274435236755e-07, |
| "loss": 0.2538, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.8364928909952607, |
| "grad_norm": 10.607970236644762, |
| "learning_rate": 7.408028282758514e-07, |
| "loss": 0.1711, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.8376777251184834, |
| "grad_norm": 11.57185625276237, |
| "learning_rate": 7.398771396435632e-07, |
| "loss": 0.1872, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.8388625592417062, |
| "grad_norm": 11.182287748396945, |
| "learning_rate": 7.389503817530905e-07, |
| "loss": 0.2349, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.840047393364929, |
| "grad_norm": 16.684749926205544, |
| "learning_rate": 7.380225587354789e-07, |
| "loss": 0.2506, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.8412322274881516, |
| "grad_norm": 11.69863271084872, |
| "learning_rate": 7.370936747265225e-07, |
| "loss": 0.2208, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.8424170616113744, |
| "grad_norm": 10.788997392898397, |
| "learning_rate": 7.361637338667441e-07, |
| "loss": 0.2114, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.8436018957345972, |
| "grad_norm": 7.602485659984748, |
| "learning_rate": 7.352327403013779e-07, |
| "loss": 0.1821, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.8447867298578199, |
| "grad_norm": 7.26614734509637, |
| "learning_rate": 7.343006981803499e-07, |
| "loss": 0.2309, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.8459715639810427, |
| "grad_norm": 8.364511901273751, |
| "learning_rate": 7.33367611658261e-07, |
| "loss": 0.2118, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.8471563981042654, |
| "grad_norm": 7.5672261947917985, |
| "learning_rate": 7.324334848943668e-07, |
| "loss": 0.2214, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.8483412322274881, |
| "grad_norm": 15.227230664760391, |
| "learning_rate": 7.314983220525604e-07, |
| "loss": 0.1782, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.8495260663507109, |
| "grad_norm": 10.217799076813009, |
| "learning_rate": 7.305621273013525e-07, |
| "loss": 0.2302, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.8507109004739336, |
| "grad_norm": 18.691186404113676, |
| "learning_rate": 7.296249048138542e-07, |
| "loss": 0.1682, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.8518957345971564, |
| "grad_norm": 10.452172850788102, |
| "learning_rate": 7.286866587677574e-07, |
| "loss": 0.2355, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.8530805687203792, |
| "grad_norm": 10.604574552375054, |
| "learning_rate": 7.277473933453169e-07, |
| "loss": 0.1133, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.8542654028436019, |
| "grad_norm": 9.891539825214483, |
| "learning_rate": 7.268071127333311e-07, |
| "loss": 0.2195, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.8554502369668247, |
| "grad_norm": 22.53671982979302, |
| "learning_rate": 7.258658211231234e-07, |
| "loss": 0.2269, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.8566350710900474, |
| "grad_norm": 23.83975858014513, |
| "learning_rate": 7.249235227105245e-07, |
| "loss": 0.2294, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.8578199052132701, |
| "grad_norm": 19.583412313581313, |
| "learning_rate": 7.239802216958522e-07, |
| "loss": 0.2309, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.8590047393364929, |
| "grad_norm": 9.156760472446708, |
| "learning_rate": 7.230359222838938e-07, |
| "loss": 0.1893, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.8601895734597157, |
| "grad_norm": 14.337897781365237, |
| "learning_rate": 7.220906286838868e-07, |
| "loss": 0.1709, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.8613744075829384, |
| "grad_norm": 17.90848323335686, |
| "learning_rate": 7.211443451095006e-07, |
| "loss": 0.2353, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.8625592417061612, |
| "grad_norm": 8.380769804085316, |
| "learning_rate": 7.201970757788171e-07, |
| "loss": 0.21, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.8637440758293838, |
| "grad_norm": 11.322111166952071, |
| "learning_rate": 7.192488249143125e-07, |
| "loss": 0.2225, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.8649289099526066, |
| "grad_norm": 10.045093401838667, |
| "learning_rate": 7.182995967428379e-07, |
| "loss": 0.1583, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.8661137440758294, |
| "grad_norm": 17.370985243300748, |
| "learning_rate": 7.173493954956011e-07, |
| "loss": 0.2028, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.8672985781990521, |
| "grad_norm": 10.458119006433506, |
| "learning_rate": 7.163982254081474e-07, |
| "loss": 0.1776, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.8684834123222749, |
| "grad_norm": 8.754265017532378, |
| "learning_rate": 7.154460907203405e-07, |
| "loss": 0.1875, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.8696682464454977, |
| "grad_norm": 12.4225760590751, |
| "learning_rate": 7.144929956763437e-07, |
| "loss": 0.2778, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.8708530805687204, |
| "grad_norm": 11.608393438177549, |
| "learning_rate": 7.135389445246017e-07, |
| "loss": 0.192, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.8720379146919431, |
| "grad_norm": 8.633498509424786, |
| "learning_rate": 7.125839415178203e-07, |
| "loss": 0.1541, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.8732227488151659, |
| "grad_norm": 16.670199329768597, |
| "learning_rate": 7.116279909129491e-07, |
| "loss": 0.1941, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.8744075829383886, |
| "grad_norm": 12.74769327926982, |
| "learning_rate": 7.106710969711609e-07, |
| "loss": 0.2348, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.8755924170616114, |
| "grad_norm": 10.452681837710877, |
| "learning_rate": 7.097132639578337e-07, |
| "loss": 0.1702, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.8767772511848341, |
| "grad_norm": 9.914095739792367, |
| "learning_rate": 7.087544961425316e-07, |
| "loss": 0.1876, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.8779620853080569, |
| "grad_norm": 9.459241899301656, |
| "learning_rate": 7.077947977989853e-07, |
| "loss": 0.2389, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.8791469194312796, |
| "grad_norm": 10.31379047778746, |
| "learning_rate": 7.068341732050737e-07, |
| "loss": 0.2262, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.8803317535545023, |
| "grad_norm": 14.015863725214746, |
| "learning_rate": 7.058726266428041e-07, |
| "loss": 0.2054, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.8815165876777251, |
| "grad_norm": 10.095904056771356, |
| "learning_rate": 7.049101623982937e-07, |
| "loss": 0.2564, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.8827014218009479, |
| "grad_norm": 14.900120087150492, |
| "learning_rate": 7.039467847617504e-07, |
| "loss": 0.2516, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.8838862559241706, |
| "grad_norm": 14.989089635791842, |
| "learning_rate": 7.029824980274534e-07, |
| "loss": 0.186, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.8850710900473934, |
| "grad_norm": 21.089551499728422, |
| "learning_rate": 7.020173064937344e-07, |
| "loss": 0.1767, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.8862559241706162, |
| "grad_norm": 8.4313495925763, |
| "learning_rate": 7.010512144629579e-07, |
| "loss": 0.227, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.8874407582938388, |
| "grad_norm": 8.70162638226272, |
| "learning_rate": 7.000842262415028e-07, |
| "loss": 0.1797, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.8886255924170616, |
| "grad_norm": 10.07719445107742, |
| "learning_rate": 6.991163461397424e-07, |
| "loss": 0.198, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8886255924170616, |
| "eval_loss": 0.21252204477787018, |
| "eval_runtime": 55.1978, |
| "eval_samples_per_second": 14.91, |
| "eval_steps_per_second": 0.942, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8898104265402843, |
| "grad_norm": 8.952461064774775, |
| "learning_rate": 6.981475784720262e-07, |
| "loss": 0.258, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.8909952606635071, |
| "grad_norm": 20.719939479561397, |
| "learning_rate": 6.971779275566593e-07, |
| "loss": 0.2014, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.8921800947867299, |
| "grad_norm": 7.936070074068664, |
| "learning_rate": 6.962073977158842e-07, |
| "loss": 0.1962, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.8933649289099526, |
| "grad_norm": 15.527186441037935, |
| "learning_rate": 6.952359932758615e-07, |
| "loss": 0.1838, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.8945497630331753, |
| "grad_norm": 11.785111972126451, |
| "learning_rate": 6.9426371856665e-07, |
| "loss": 0.2201, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.8957345971563981, |
| "grad_norm": 20.18101839777207, |
| "learning_rate": 6.93290577922188e-07, |
| "loss": 0.2597, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.8969194312796208, |
| "grad_norm": 9.9152594543863, |
| "learning_rate": 6.923165756802733e-07, |
| "loss": 0.1701, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.8981042654028436, |
| "grad_norm": 20.043984042966567, |
| "learning_rate": 6.913417161825449e-07, |
| "loss": 0.2574, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.8992890995260664, |
| "grad_norm": 21.123597700553983, |
| "learning_rate": 6.903660037744626e-07, |
| "loss": 0.2426, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.9004739336492891, |
| "grad_norm": 17.3636230361985, |
| "learning_rate": 6.89389442805288e-07, |
| "loss": 0.2198, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.9016587677725119, |
| "grad_norm": 7.896724380954798, |
| "learning_rate": 6.884120376280657e-07, |
| "loss": 0.2368, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.9028436018957346, |
| "grad_norm": 10.744383746333114, |
| "learning_rate": 6.874337925996028e-07, |
| "loss": 0.2166, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.9040284360189573, |
| "grad_norm": 11.973048617938426, |
| "learning_rate": 6.864547120804505e-07, |
| "loss": 0.2149, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.9052132701421801, |
| "grad_norm": 15.374227350719309, |
| "learning_rate": 6.85474800434884e-07, |
| "loss": 0.1623, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.9063981042654028, |
| "grad_norm": 9.460007923138491, |
| "learning_rate": 6.84494062030883e-07, |
| "loss": 0.1601, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.9075829383886256, |
| "grad_norm": 8.169757295616728, |
| "learning_rate": 6.835125012401131e-07, |
| "loss": 0.1693, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.9087677725118484, |
| "grad_norm": 16.061474109942935, |
| "learning_rate": 6.825301224379056e-07, |
| "loss": 0.2211, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.909952606635071, |
| "grad_norm": 10.477258738350997, |
| "learning_rate": 6.815469300032373e-07, |
| "loss": 0.2279, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.9111374407582938, |
| "grad_norm": 10.65538754992105, |
| "learning_rate": 6.805629283187129e-07, |
| "loss": 0.1925, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.9123222748815166, |
| "grad_norm": 9.123234339056102, |
| "learning_rate": 6.795781217705435e-07, |
| "loss": 0.2253, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.9135071090047393, |
| "grad_norm": 10.377714671981227, |
| "learning_rate": 6.785925147485285e-07, |
| "loss": 0.1754, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.9146919431279621, |
| "grad_norm": 10.393576077978427, |
| "learning_rate": 6.776061116460352e-07, |
| "loss": 0.2009, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.9158767772511849, |
| "grad_norm": 16.02077338864102, |
| "learning_rate": 6.766189168599789e-07, |
| "loss": 0.1636, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.9170616113744076, |
| "grad_norm": 14.519560151061562, |
| "learning_rate": 6.756309347908051e-07, |
| "loss": 0.2455, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.9182464454976303, |
| "grad_norm": 21.53296240222359, |
| "learning_rate": 6.746421698424676e-07, |
| "loss": 0.1942, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.919431279620853, |
| "grad_norm": 10.224973261982292, |
| "learning_rate": 6.7365262642241e-07, |
| "loss": 0.1958, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.9206161137440758, |
| "grad_norm": 12.82882389122881, |
| "learning_rate": 6.726623089415467e-07, |
| "loss": 0.211, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.9218009478672986, |
| "grad_norm": 15.696609544363108, |
| "learning_rate": 6.716712218142413e-07, |
| "loss": 0.281, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.9229857819905213, |
| "grad_norm": 22.27881823197468, |
| "learning_rate": 6.706793694582891e-07, |
| "loss": 0.2251, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.9241706161137441, |
| "grad_norm": 15.024010449291305, |
| "learning_rate": 6.696867562948962e-07, |
| "loss": 0.2681, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.9253554502369669, |
| "grad_norm": 20.247007205993885, |
| "learning_rate": 6.686933867486596e-07, |
| "loss": 0.2191, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.9265402843601895, |
| "grad_norm": 7.998974278453153, |
| "learning_rate": 6.676992652475486e-07, |
| "loss": 0.1871, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.9277251184834123, |
| "grad_norm": 10.708704166774769, |
| "learning_rate": 6.667043962228838e-07, |
| "loss": 0.182, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.9289099526066351, |
| "grad_norm": 8.78684615317003, |
| "learning_rate": 6.657087841093179e-07, |
| "loss": 0.1568, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.9300947867298578, |
| "grad_norm": 9.778799963563257, |
| "learning_rate": 6.647124333448164e-07, |
| "loss": 0.2085, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.9312796208530806, |
| "grad_norm": 12.864160262988083, |
| "learning_rate": 6.637153483706368e-07, |
| "loss": 0.2463, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.9324644549763034, |
| "grad_norm": 24.981518255133043, |
| "learning_rate": 6.6271753363131e-07, |
| "loss": 0.1694, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.933649289099526, |
| "grad_norm": 32.80984613365954, |
| "learning_rate": 6.61718993574619e-07, |
| "loss": 0.2152, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.9348341232227488, |
| "grad_norm": 14.086295795520133, |
| "learning_rate": 6.607197326515807e-07, |
| "loss": 0.1499, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.9360189573459715, |
| "grad_norm": 13.406991924570532, |
| "learning_rate": 6.597197553164251e-07, |
| "loss": 0.3099, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.9372037914691943, |
| "grad_norm": 13.492701196428424, |
| "learning_rate": 6.587190660265751e-07, |
| "loss": 0.1985, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.9383886255924171, |
| "grad_norm": 12.689739842206874, |
| "learning_rate": 6.577176692426278e-07, |
| "loss": 0.2184, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.9395734597156398, |
| "grad_norm": 25.733609867586548, |
| "learning_rate": 6.567155694283336e-07, |
| "loss": 0.1801, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.9407582938388626, |
| "grad_norm": 30.12391831939558, |
| "learning_rate": 6.55712771050577e-07, |
| "loss": 0.2542, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.9419431279620853, |
| "grad_norm": 29.270469329492194, |
| "learning_rate": 6.547092785793559e-07, |
| "loss": 0.2172, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.943127962085308, |
| "grad_norm": 40.42865798160653, |
| "learning_rate": 6.537050964877625e-07, |
| "loss": 0.2777, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.9443127962085308, |
| "grad_norm": 16.58432837538837, |
| "learning_rate": 6.527002292519629e-07, |
| "loss": 0.227, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.9454976303317536, |
| "grad_norm": 12.821494884458682, |
| "learning_rate": 6.516946813511773e-07, |
| "loss": 0.1727, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.9466824644549763, |
| "grad_norm": 22.825107619536553, |
| "learning_rate": 6.5068845726766e-07, |
| "loss": 0.207, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.9478672985781991, |
| "grad_norm": 31.69359524459485, |
| "learning_rate": 6.496815614866791e-07, |
| "loss": 0.2378, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9478672985781991, |
| "eval_loss": 0.27354177832603455, |
| "eval_runtime": 62.6028, |
| "eval_samples_per_second": 13.146, |
| "eval_steps_per_second": 0.831, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9490521327014217, |
| "grad_norm": 37.52743059970339, |
| "learning_rate": 6.486739984964971e-07, |
| "loss": 0.2762, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.9502369668246445, |
| "grad_norm": 38.872276944043726, |
| "learning_rate": 6.476657727883506e-07, |
| "loss": 0.1925, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.9514218009478673, |
| "grad_norm": 15.289062834224044, |
| "learning_rate": 6.466568888564302e-07, |
| "loss": 0.276, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.95260663507109, |
| "grad_norm": 15.911245985870275, |
| "learning_rate": 6.456473511978606e-07, |
| "loss": 0.1977, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.9537914691943128, |
| "grad_norm": 8.593962442854156, |
| "learning_rate": 6.446371643126805e-07, |
| "loss": 0.2118, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.9549763033175356, |
| "grad_norm": 11.503080592385206, |
| "learning_rate": 6.436263327038224e-07, |
| "loss": 0.2003, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.9561611374407583, |
| "grad_norm": 16.784875802136042, |
| "learning_rate": 6.426148608770928e-07, |
| "loss": 0.211, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.957345971563981, |
| "grad_norm": 8.039773337530425, |
| "learning_rate": 6.416027533411519e-07, |
| "loss": 0.1776, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.9585308056872038, |
| "grad_norm": 35.99706053391443, |
| "learning_rate": 6.40590014607494e-07, |
| "loss": 0.2358, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.9597156398104265, |
| "grad_norm": 40.99321152622947, |
| "learning_rate": 6.395766491904262e-07, |
| "loss": 0.3066, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.9609004739336493, |
| "grad_norm": 18.164194558552783, |
| "learning_rate": 6.385626616070498e-07, |
| "loss": 0.1893, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.9620853080568721, |
| "grad_norm": 7.741779460691234, |
| "learning_rate": 6.375480563772389e-07, |
| "loss": 0.1874, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.9632701421800948, |
| "grad_norm": 8.638620885624041, |
| "learning_rate": 6.365328380236213e-07, |
| "loss": 0.2293, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.9644549763033176, |
| "grad_norm": 8.26453063301268, |
| "learning_rate": 6.355170110715571e-07, |
| "loss": 0.2436, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.9656398104265402, |
| "grad_norm": 16.74712475701176, |
| "learning_rate": 6.3450058004912e-07, |
| "loss": 0.185, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.966824644549763, |
| "grad_norm": 11.579496290026954, |
| "learning_rate": 6.334835494870758e-07, |
| "loss": 0.2624, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.9680094786729858, |
| "grad_norm": 23.226048329711308, |
| "learning_rate": 6.32465923918863e-07, |
| "loss": 0.225, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.9691943127962085, |
| "grad_norm": 17.655075691563827, |
| "learning_rate": 6.314477078805723e-07, |
| "loss": 0.215, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.9703791469194313, |
| "grad_norm": 13.202128531234838, |
| "learning_rate": 6.304289059109267e-07, |
| "loss": 0.2403, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.9715639810426541, |
| "grad_norm": 13.076408764926182, |
| "learning_rate": 6.294095225512604e-07, |
| "loss": 0.1663, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.9727488151658767, |
| "grad_norm": 14.171619384960582, |
| "learning_rate": 6.283895623454997e-07, |
| "loss": 0.2774, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.9739336492890995, |
| "grad_norm": 19.8877660661784, |
| "learning_rate": 6.273690298401419e-07, |
| "loss": 0.2723, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.9751184834123223, |
| "grad_norm": 6.900387582314849, |
| "learning_rate": 6.263479295842357e-07, |
| "loss": 0.1971, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.976303317535545, |
| "grad_norm": 18.414912712274486, |
| "learning_rate": 6.253262661293602e-07, |
| "loss": 0.2433, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.9774881516587678, |
| "grad_norm": 15.35186217689985, |
| "learning_rate": 6.243040440296051e-07, |
| "loss": 0.1789, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.9786729857819905, |
| "grad_norm": 10.207560328723083, |
| "learning_rate": 6.232812678415504e-07, |
| "loss": 0.2385, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.9798578199052133, |
| "grad_norm": 9.098282908972523, |
| "learning_rate": 6.222579421242455e-07, |
| "loss": 0.2396, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.981042654028436, |
| "grad_norm": 8.545110006193829, |
| "learning_rate": 6.2123407143919e-07, |
| "loss": 0.2131, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.9822274881516587, |
| "grad_norm": 7.862000410634824, |
| "learning_rate": 6.202096603503122e-07, |
| "loss": 0.1932, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.9834123222748815, |
| "grad_norm": 16.06519254165904, |
| "learning_rate": 6.191847134239495e-07, |
| "loss": 0.1942, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.9845971563981043, |
| "grad_norm": 8.103880149384155, |
| "learning_rate": 6.181592352288279e-07, |
| "loss": 0.1843, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.985781990521327, |
| "grad_norm": 9.399017520546249, |
| "learning_rate": 6.17133230336041e-07, |
| "loss": 0.2042, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.9869668246445498, |
| "grad_norm": 11.17143516636379, |
| "learning_rate": 6.16106703319031e-07, |
| "loss": 0.2414, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.9881516587677726, |
| "grad_norm": 16.347223668954097, |
| "learning_rate": 6.150796587535669e-07, |
| "loss": 0.2388, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.9893364928909952, |
| "grad_norm": 24.954899681915954, |
| "learning_rate": 6.140521012177249e-07, |
| "loss": 0.1575, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.990521327014218, |
| "grad_norm": 8.054339443033578, |
| "learning_rate": 6.130240352918674e-07, |
| "loss": 0.1829, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.9917061611374408, |
| "grad_norm": 11.535190625870067, |
| "learning_rate": 6.119954655586236e-07, |
| "loss": 0.2162, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.9928909952606635, |
| "grad_norm": 24.1831071367917, |
| "learning_rate": 6.10966396602868e-07, |
| "loss": 0.2608, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.9940758293838863, |
| "grad_norm": 8.55670621920293, |
| "learning_rate": 6.099368330117004e-07, |
| "loss": 0.1847, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.995260663507109, |
| "grad_norm": 12.190546843323471, |
| "learning_rate": 6.089067793744257e-07, |
| "loss": 0.1573, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.9964454976303317, |
| "grad_norm": 24.359810277866014, |
| "learning_rate": 6.078762402825331e-07, |
| "loss": 0.2092, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.9976303317535545, |
| "grad_norm": 9.263467659494774, |
| "learning_rate": 6.068452203296754e-07, |
| "loss": 0.1749, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.9988151658767772, |
| "grad_norm": 10.629198605852931, |
| "learning_rate": 6.058137241116493e-07, |
| "loss": 0.1846, |
| "step": 843 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 15.124988874606263, |
| "learning_rate": 6.047817562263743e-07, |
| "loss": 0.2534, |
| "step": 844 |
| }, |
| { |
| "epoch": 1.0011848341232228, |
| "grad_norm": 12.707802633067423, |
| "learning_rate": 6.037493212738722e-07, |
| "loss": 0.2384, |
| "step": 845 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1688, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 169, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 164206322122752.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|