{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 75.1879934828926, "eval_steps": 2000, "global_step": 150000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_accuracy": 1.3660035857594126e-07, "eval_loss": 132.875, "eval_runtime": 254.898, "eval_samples_per_second": 6486.396, "eval_steps_per_second": 12.672, "step": 0 }, { "epoch": 0.13762730525736305, "grad_norm": 49.58098602294922, "learning_rate": 2.967e-05, "loss": 178.872046875, "step": 1000 }, { "epoch": 0.2752546105147261, "grad_norm": 40.552101135253906, "learning_rate": 5.966999999999999e-05, "loss": 82.2545625, "step": 2000 }, { "epoch": 0.2752546105147261, "eval_accuracy": 0.6086885594122716, "eval_loss": 16.890625, "eval_runtime": 245.8339, "eval_samples_per_second": 6725.555, "eval_steps_per_second": 13.139, "step": 2000 }, { "epoch": 0.41288191577208916, "grad_norm": 37.14718246459961, "learning_rate": 8.966999999999999e-05, "loss": 66.408390625, "step": 3000 }, { "epoch": 0.5505092210294522, "grad_norm": 32.7165641784668, "learning_rate": 0.00011960999999999999, "loss": 59.63419921875, "step": 4000 }, { "epoch": 0.5505092210294522, "eval_accuracy": 0.6626575571303744, "eval_loss": 13.6171875, "eval_runtime": 239.1422, "eval_samples_per_second": 6913.748, "eval_steps_per_second": 13.507, "step": 4000 }, { "epoch": 0.6881365262868153, "grad_norm": 26.651721954345703, "learning_rate": 0.00014960999999999997, "loss": 55.59596484375, "step": 5000 }, { "epoch": 0.8257638315441783, "grad_norm": 25.13609504699707, "learning_rate": 0.00017961, "loss": 52.9440390625, "step": 6000 }, { "epoch": 0.8257638315441783, "eval_accuracy": 0.6820435002667244, "eval_loss": 12.4453125, "eval_runtime": 241.0405, "eval_samples_per_second": 6859.298, "eval_steps_per_second": 13.4, "step": 6000 }, { "epoch": 0.9633911368015414, "grad_norm": 25.540454864501953, "learning_rate": 0.00020960999999999997, "loss": 51.0653359375, "step": 7000 }, { "epoch": 1.1010184420589044, "grad_norm": 22.598819732666016, "learning_rate": 0.00023960999999999996, "loss": 49.669171875, "step": 8000 }, { "epoch": 1.1010184420589044, "eval_accuracy": 0.6911625811516694, "eval_loss": 11.875, "eval_runtime": 238.8748, "eval_samples_per_second": 6921.488, "eval_steps_per_second": 13.522, "step": 8000 }, { "epoch": 1.2386457473162675, "grad_norm": 21.2167911529541, "learning_rate": 0.00026957999999999995, "loss": 48.713546875, "step": 9000 }, { "epoch": 1.3762730525736306, "grad_norm": 20.751371383666992, "learning_rate": 0.00029955, "loss": 47.99215625, "step": 10000 }, { "epoch": 1.3762730525736306, "eval_accuracy": 0.6964798957051618, "eval_loss": 11.5546875, "eval_runtime": 240.5455, "eval_samples_per_second": 6873.416, "eval_steps_per_second": 13.428, "step": 10000 }, { "epoch": 1.5139003578309937, "grad_norm": 21.686861038208008, "learning_rate": 0.0003, "loss": 47.29948046875, "step": 11000 }, { "epoch": 1.6515276630883569, "grad_norm": 18.800752639770508, "learning_rate": 0.0003, "loss": 46.53960546875, "step": 12000 }, { "epoch": 1.6515276630883569, "eval_accuracy": 0.701906575930677, "eval_loss": 11.2265625, "eval_runtime": 239.4873, "eval_samples_per_second": 6903.785, "eval_steps_per_second": 13.487, "step": 12000 }, { "epoch": 1.7891549683457197, "grad_norm": 19.42099952697754, "learning_rate": 0.0003, "loss": 45.9191953125, "step": 13000 }, { "epoch": 1.9267822736030829, "grad_norm": 19.15869140625, "learning_rate": 0.0003, "loss": 45.381796875, "step": 14000 }, { "epoch": 1.9267822736030829, "eval_accuracy": 0.7061886218301343, "eval_loss": 10.9765625, "eval_runtime": 239.2568, "eval_samples_per_second": 6910.438, "eval_steps_per_second": 13.5, "step": 14000 }, { "epoch": 2.0644095788604457, "grad_norm": 16.94078826904297, "learning_rate": 0.0003, "loss": 44.90976953125, "step": 15000 }, { "epoch": 2.202036884117809, "grad_norm": 17.655250549316406, "learning_rate": 0.0003, "loss": 44.45808203125, "step": 16000 }, { "epoch": 2.202036884117809, "eval_accuracy": 0.7093567074531988, "eval_loss": 10.8203125, "eval_runtime": 241.1911, "eval_samples_per_second": 6855.016, "eval_steps_per_second": 13.392, "step": 16000 }, { "epoch": 2.339664189375172, "grad_norm": 17.694721221923828, "learning_rate": 0.0003, "loss": 44.16196484375, "step": 17000 }, { "epoch": 2.477291494632535, "grad_norm": 17.49053955078125, "learning_rate": 0.0003, "loss": 43.84825, "step": 18000 }, { "epoch": 2.477291494632535, "eval_accuracy": 0.7115680703757034, "eval_loss": 10.6640625, "eval_runtime": 239.4688, "eval_samples_per_second": 6904.32, "eval_steps_per_second": 13.488, "step": 18000 }, { "epoch": 2.614918799889898, "grad_norm": 19.09914207458496, "learning_rate": 0.0003, "loss": 43.59271875, "step": 19000 }, { "epoch": 2.7525461051472613, "grad_norm": 16.3907527923584, "learning_rate": 0.0003, "loss": 43.352640625, "step": 20000 }, { "epoch": 2.7525461051472613, "eval_accuracy": 0.7139675040013439, "eval_loss": 10.5546875, "eval_runtime": 238.1647, "eval_samples_per_second": 6942.126, "eval_steps_per_second": 13.562, "step": 20000 }, { "epoch": 2.8901734104046244, "grad_norm": 15.896549224853516, "learning_rate": 0.0003, "loss": 43.17196875, "step": 21000 }, { "epoch": 3.0278007156619875, "grad_norm": 29.67310905456543, "learning_rate": 0.0003, "loss": 42.92155859375, "step": 22000 }, { "epoch": 3.0278007156619875, "eval_accuracy": 0.7146398068421496, "eval_loss": 10.484375, "eval_runtime": 238.4701, "eval_samples_per_second": 6933.233, "eval_steps_per_second": 13.545, "step": 22000 }, { "epoch": 3.1654280209193506, "grad_norm": 16.424579620361328, "learning_rate": 0.0003, "loss": 42.65390625, "step": 23000 }, { "epoch": 3.3030553261767133, "grad_norm": 16.19496726989746, "learning_rate": 0.0003, "loss": 42.48802734375, "step": 24000 }, { "epoch": 3.3030553261767133, "eval_accuracy": 0.7170818189501579, "eval_loss": 10.3671875, "eval_runtime": 240.7935, "eval_samples_per_second": 6866.337, "eval_steps_per_second": 13.414, "step": 24000 }, { "epoch": 3.4406826314340764, "grad_norm": 15.53753662109375, "learning_rate": 0.0003, "loss": 42.357984375, "step": 25000 }, { "epoch": 3.5783099366914395, "grad_norm": 16.701377868652344, "learning_rate": 0.0003, "loss": 42.1965703125, "step": 26000 }, { "epoch": 3.5783099366914395, "eval_accuracy": 0.7182790131411184, "eval_loss": 10.2890625, "eval_runtime": 241.2004, "eval_samples_per_second": 6854.753, "eval_steps_per_second": 13.391, "step": 26000 }, { "epoch": 3.7159372419488026, "grad_norm": 15.334391593933105, "learning_rate": 0.0003, "loss": 42.05885546875, "step": 27000 }, { "epoch": 3.8535645472061657, "grad_norm": 15.341226577758789, "learning_rate": 0.0003, "loss": 41.9392421875, "step": 28000 }, { "epoch": 3.8535645472061657, "eval_accuracy": 0.719322972712139, "eval_loss": 10.2421875, "eval_runtime": 241.186, "eval_samples_per_second": 6855.162, "eval_steps_per_second": 13.392, "step": 28000 }, { "epoch": 3.991191852463529, "grad_norm": 16.253334045410156, "learning_rate": 0.0003, "loss": 41.81163671875, "step": 29000 }, { "epoch": 4.1288191577208915, "grad_norm": 15.035149574279785, "learning_rate": 0.0003, "loss": 41.617953125, "step": 30000 }, { "epoch": 4.1288191577208915, "eval_accuracy": 0.720451396648655, "eval_loss": 10.171875, "eval_runtime": 240.2855, "eval_samples_per_second": 6880.853, "eval_steps_per_second": 13.442, "step": 30000 }, { "epoch": 4.266446462978255, "grad_norm": 14.762296676635742, "learning_rate": 0.0003, "loss": 41.5138203125, "step": 31000 }, { "epoch": 4.404073768235618, "grad_norm": 14.627701759338379, "learning_rate": 0.0003, "loss": 41.4306015625, "step": 32000 }, { "epoch": 4.404073768235618, "eval_accuracy": 0.7213651673804347, "eval_loss": 10.1328125, "eval_runtime": 242.2962, "eval_samples_per_second": 6823.752, "eval_steps_per_second": 13.331, "step": 32000 }, { "epoch": 4.541701073492981, "grad_norm": 14.57941722869873, "learning_rate": 0.0003, "loss": 41.3221171875, "step": 33000 }, { "epoch": 4.679328378750344, "grad_norm": 15.291731834411621, "learning_rate": 0.0003, "loss": 41.276203125, "step": 34000 }, { "epoch": 4.679328378750344, "eval_accuracy": 0.7223401458779132, "eval_loss": 10.0703125, "eval_runtime": 239.5218, "eval_samples_per_second": 6902.793, "eval_steps_per_second": 13.485, "step": 34000 }, { "epoch": 4.8169556840077075, "grad_norm": 15.057552337646484, "learning_rate": 0.0003, "loss": 41.19701171875, "step": 35000 }, { "epoch": 4.95458298926507, "grad_norm": 15.457907676696777, "learning_rate": 0.0003, "loss": 41.10438671875, "step": 36000 }, { "epoch": 4.95458298926507, "eval_accuracy": 0.7230995451445803, "eval_loss": 10.0546875, "eval_runtime": 240.0931, "eval_samples_per_second": 6886.367, "eval_steps_per_second": 13.453, "step": 36000 }, { "epoch": 5.092210294522434, "grad_norm": 15.539594650268555, "learning_rate": 0.0003, "loss": 40.93646875, "step": 37000 }, { "epoch": 5.229837599779796, "grad_norm": 14.915628433227539, "learning_rate": 0.0003, "loss": 40.8286875, "step": 38000 }, { "epoch": 5.229837599779796, "eval_accuracy": 0.7238966529952525, "eval_loss": 10.0, "eval_runtime": 239.9098, "eval_samples_per_second": 6891.629, "eval_steps_per_second": 13.463, "step": 38000 }, { "epoch": 5.367464905037159, "grad_norm": 14.271048545837402, "learning_rate": 0.0003, "loss": 40.80625, "step": 39000 }, { "epoch": 5.505092210294523, "grad_norm": 14.605119705200195, "learning_rate": 0.0003, "loss": 40.713796875, "step": 40000 }, { "epoch": 5.505092210294523, "eval_accuracy": 0.7245612679427045, "eval_loss": 9.9609375, "eval_runtime": 240.0103, "eval_samples_per_second": 6888.743, "eval_steps_per_second": 13.458, "step": 40000 }, { "epoch": 5.642719515551885, "grad_norm": 14.748287200927734, "learning_rate": 0.0003, "loss": 40.62338671875, "step": 41000 }, { "epoch": 5.780346820809249, "grad_norm": 15.422652244567871, "learning_rate": 0.0003, "loss": 40.56144140625, "step": 42000 }, { "epoch": 5.780346820809249, "eval_accuracy": 0.7251576961266964, "eval_loss": 9.9375, "eval_runtime": 240.638, "eval_samples_per_second": 6870.772, "eval_steps_per_second": 13.423, "step": 42000 }, { "epoch": 5.917974126066611, "grad_norm": 15.326558113098145, "learning_rate": 0.0003, "loss": 40.5059375, "step": 43000 }, { "epoch": 6.055601431323975, "grad_norm": 15.331598281860352, "learning_rate": 0.0003, "loss": 40.40818359375, "step": 44000 }, { "epoch": 6.055601431323975, "eval_accuracy": 0.7254487554600376, "eval_loss": 9.8984375, "eval_runtime": 240.1599, "eval_samples_per_second": 6884.449, "eval_steps_per_second": 13.449, "step": 44000 }, { "epoch": 6.193228736581338, "grad_norm": 14.527973175048828, "learning_rate": 0.0003, "loss": 40.3428828125, "step": 45000 }, { "epoch": 6.330856041838701, "grad_norm": 15.686996459960938, "learning_rate": 0.0003, "loss": 40.3244765625, "step": 46000 }, { "epoch": 6.330856041838701, "eval_accuracy": 0.7256079479674087, "eval_loss": 9.8984375, "eval_runtime": 239.085, "eval_samples_per_second": 6915.403, "eval_steps_per_second": 13.51, "step": 46000 }, { "epoch": 6.468483347096064, "grad_norm": 14.848986625671387, "learning_rate": 0.0003, "loss": 40.312796875, "step": 47000 }, { "epoch": 6.6061106523534265, "grad_norm": 14.275111198425293, "learning_rate": 0.0003, "loss": 40.28499609375, "step": 48000 }, { "epoch": 6.6061106523534265, "eval_accuracy": 0.7262142861047188, "eval_loss": 9.875, "eval_runtime": 240.5807, "eval_samples_per_second": 6872.409, "eval_steps_per_second": 13.426, "step": 48000 }, { "epoch": 6.74373795761079, "grad_norm": 14.665587425231934, "learning_rate": 0.0003, "loss": 40.18369921875, "step": 49000 }, { "epoch": 6.881365262868153, "grad_norm": 14.547246932983398, "learning_rate": 0.0003, "loss": 40.1498828125, "step": 50000 }, { "epoch": 6.881365262868153, "eval_accuracy": 0.7269716959581425, "eval_loss": 9.8515625, "eval_runtime": 241.4549, "eval_samples_per_second": 6847.527, "eval_steps_per_second": 13.377, "step": 50000 }, { "epoch": 7.018992568125516, "grad_norm": 14.525768280029297, "learning_rate": 0.0003, "loss": 40.1036328125, "step": 51000 }, { "epoch": 7.156619873382879, "grad_norm": 14.632113456726074, "learning_rate": 0.0003, "loss": 39.9834296875, "step": 52000 }, { "epoch": 7.156619873382879, "eval_accuracy": 0.7272316426626143, "eval_loss": 9.828125, "eval_runtime": 239.3181, "eval_samples_per_second": 6908.667, "eval_steps_per_second": 13.497, "step": 52000 }, { "epoch": 7.2942471786402425, "grad_norm": 14.982499122619629, "learning_rate": 0.0003, "loss": 39.9509375, "step": 53000 }, { "epoch": 7.431874483897605, "grad_norm": 16.801025390625, "learning_rate": 0.0003, "loss": 39.891859375, "step": 54000 }, { "epoch": 7.431874483897605, "eval_accuracy": 0.7271305788939304, "eval_loss": 9.828125, "eval_runtime": 240.159, "eval_samples_per_second": 6884.477, "eval_steps_per_second": 13.449, "step": 54000 }, { "epoch": 7.569501789154968, "grad_norm": 14.868009567260742, "learning_rate": 0.0003, "loss": 39.88668359375, "step": 55000 }, { "epoch": 7.707129094412331, "grad_norm": 14.595479011535645, "learning_rate": 0.0003, "loss": 39.821890625, "step": 56000 }, { "epoch": 7.707129094412331, "eval_accuracy": 0.7280901536840519, "eval_loss": 9.7734375, "eval_runtime": 238.9096, "eval_samples_per_second": 6920.478, "eval_steps_per_second": 13.52, "step": 56000 }, { "epoch": 7.844756399669695, "grad_norm": 13.92586612701416, "learning_rate": 0.0003, "loss": 39.78269921875, "step": 57000 }, { "epoch": 7.982383704927058, "grad_norm": 15.85058307647705, "learning_rate": 0.0003, "loss": 39.72277734375, "step": 58000 }, { "epoch": 7.982383704927058, "eval_accuracy": 0.7287356832938983, "eval_loss": 9.7578125, "eval_runtime": 239.7822, "eval_samples_per_second": 6895.295, "eval_steps_per_second": 13.471, "step": 58000 }, { "epoch": 8.12001101018442, "grad_norm": 15.202603340148926, "learning_rate": 0.0003, "loss": 39.6687421875, "step": 59000 }, { "epoch": 8.257638315441783, "grad_norm": 14.994338989257812, "learning_rate": 0.0003, "loss": 39.60739453125, "step": 60000 }, { "epoch": 8.257638315441783, "eval_accuracy": 0.7289930926403759, "eval_loss": 9.7265625, "eval_runtime": 241.1318, "eval_samples_per_second": 6856.702, "eval_steps_per_second": 13.395, "step": 60000 }, { "epoch": 8.395265620699147, "grad_norm": 15.15245532989502, "learning_rate": 0.0003, "loss": 39.57180859375, "step": 61000 }, { "epoch": 8.53289292595651, "grad_norm": 15.941924095153809, "learning_rate": 0.0003, "loss": 39.5704296875, "step": 62000 }, { "epoch": 8.53289292595651, "eval_accuracy": 0.7289831970926051, "eval_loss": 9.734375, "eval_runtime": 241.0009, "eval_samples_per_second": 6860.426, "eval_steps_per_second": 13.402, "step": 62000 }, { "epoch": 8.670520231213873, "grad_norm": 14.842296600341797, "learning_rate": 0.0003, "loss": 39.53778125, "step": 63000 }, { "epoch": 8.808147536471235, "grad_norm": 17.454763412475586, "learning_rate": 0.0003, "loss": 39.540921875, "step": 64000 }, { "epoch": 8.808147536471235, "eval_accuracy": 0.7290886771189041, "eval_loss": 9.7109375, "eval_runtime": 240.0306, "eval_samples_per_second": 6888.158, "eval_steps_per_second": 13.457, "step": 64000 }, { "epoch": 8.9457748417286, "grad_norm": 13.98570442199707, "learning_rate": 0.0003, "loss": 39.512796875, "step": 65000 }, { "epoch": 9.083402146985962, "grad_norm": 18.010318756103516, "learning_rate": 0.0003, "loss": 39.4786171875, "step": 66000 }, { "epoch": 9.083402146985962, "eval_accuracy": 0.7294749784251455, "eval_loss": 9.7109375, "eval_runtime": 240.1115, "eval_samples_per_second": 6885.838, "eval_steps_per_second": 13.452, "step": 66000 }, { "epoch": 9.221029452243325, "grad_norm": 15.137900352478027, "learning_rate": 0.0003, "loss": 39.4073359375, "step": 67000 }, { "epoch": 9.358656757500688, "grad_norm": 18.228130340576172, "learning_rate": 0.0003, "loss": 39.3549765625, "step": 68000 }, { "epoch": 9.358656757500688, "eval_accuracy": 0.7301181665976199, "eval_loss": 9.671875, "eval_runtime": 239.7862, "eval_samples_per_second": 6895.179, "eval_steps_per_second": 13.47, "step": 68000 }, { "epoch": 9.49628406275805, "grad_norm": 16.575559616088867, "learning_rate": 0.0003, "loss": 39.3098828125, "step": 69000 }, { "epoch": 9.633911368015415, "grad_norm": 14.635740280151367, "learning_rate": 0.0003, "loss": 39.35287890625, "step": 70000 }, { "epoch": 9.633911368015415, "eval_accuracy": 0.7295560826467988, "eval_loss": 9.6875, "eval_runtime": 241.3494, "eval_samples_per_second": 6850.521, "eval_steps_per_second": 13.383, "step": 70000 }, { "epoch": 9.771538673272778, "grad_norm": 14.436244010925293, "learning_rate": 0.0003, "loss": 39.29956640625, "step": 71000 }, { "epoch": 9.90916597853014, "grad_norm": 14.493698120117188, "learning_rate": 0.0003, "loss": 39.31009375, "step": 72000 }, { "epoch": 9.90916597853014, "eval_accuracy": 0.7304579509385183, "eval_loss": 9.6484375, "eval_runtime": 245.6684, "eval_samples_per_second": 6730.085, "eval_steps_per_second": 13.148, "step": 72000 }, { "epoch": 10.046793283787503, "grad_norm": 15.077356338500977, "learning_rate": 0.0003, "loss": 39.2335546875, "step": 73000 }, { "epoch": 10.184420589044867, "grad_norm": 13.661473274230957, "learning_rate": 0.0003, "loss": 39.09965625, "step": 74000 }, { "epoch": 10.184420589044867, "eval_accuracy": 0.7312455280822778, "eval_loss": 9.625, "eval_runtime": 239.6349, "eval_samples_per_second": 6899.534, "eval_steps_per_second": 13.479, "step": 74000 }, { "epoch": 10.32204789430223, "grad_norm": 15.429136276245117, "learning_rate": 0.0003, "loss": 39.147140625, "step": 75000 }, { "epoch": 10.459675199559593, "grad_norm": 15.229757308959961, "learning_rate": 0.0003, "loss": 39.1339453125, "step": 76000 }, { "epoch": 10.459675199559593, "eval_accuracy": 0.731277762807936, "eval_loss": 9.609375, "eval_runtime": 238.7439, "eval_samples_per_second": 6925.282, "eval_steps_per_second": 13.529, "step": 76000 }, { "epoch": 10.597302504816955, "grad_norm": 14.771382331848145, "learning_rate": 0.0003, "loss": 39.1441796875, "step": 77000 }, { "epoch": 10.734929810074318, "grad_norm": 13.703607559204102, "learning_rate": 0.0003, "loss": 39.141265625, "step": 78000 }, { "epoch": 10.734929810074318, "eval_accuracy": 0.7310708531463442, "eval_loss": 9.609375, "eval_runtime": 239.4162, "eval_samples_per_second": 6905.836, "eval_steps_per_second": 13.491, "step": 78000 }, { "epoch": 10.872557115331682, "grad_norm": 19.041141510009766, "learning_rate": 0.0003, "loss": 39.0934140625, "step": 79000 }, { "epoch": 11.010184420589045, "grad_norm": 17.401290893554688, "learning_rate": 0.0003, "loss": 39.113875, "step": 80000 }, { "epoch": 11.010184420589045, "eval_accuracy": 0.7312406631974454, "eval_loss": 9.6015625, "eval_runtime": 238.9522, "eval_samples_per_second": 6919.246, "eval_steps_per_second": 13.517, "step": 80000 }, { "epoch": 11.147811725846408, "grad_norm": 14.292427062988281, "learning_rate": 0.0003, "loss": 39.012484375, "step": 81000 }, { "epoch": 11.28543903110377, "grad_norm": 15.462931632995605, "learning_rate": 0.0003, "loss": 39.04391796875, "step": 82000 }, { "epoch": 11.28543903110377, "eval_accuracy": 0.7316027472794033, "eval_loss": 9.6015625, "eval_runtime": 240.4477, "eval_samples_per_second": 6876.21, "eval_steps_per_second": 13.433, "step": 82000 }, { "epoch": 11.423066336361135, "grad_norm": 17.796772003173828, "learning_rate": 0.0003, "loss": 38.957421875, "step": 83000 }, { "epoch": 11.560693641618498, "grad_norm": 17.314067840576172, "learning_rate": 0.0003, "loss": 38.9495234375, "step": 84000 }, { "epoch": 11.560693641618498, "eval_accuracy": 0.7321146855990825, "eval_loss": 9.578125, "eval_runtime": 239.2384, "eval_samples_per_second": 6910.967, "eval_steps_per_second": 13.501, "step": 84000 }, { "epoch": 11.69832094687586, "grad_norm": 16.145645141601562, "learning_rate": 0.0003, "loss": 38.91906640625, "step": 85000 }, { "epoch": 11.835948252133223, "grad_norm": 13.51314640045166, "learning_rate": 0.0003, "loss": 38.91014453125, "step": 86000 }, { "epoch": 11.835948252133223, "eval_accuracy": 0.732051599943418, "eval_loss": 9.5546875, "eval_runtime": 240.3949, "eval_samples_per_second": 6877.722, "eval_steps_per_second": 13.436, "step": 86000 }, { "epoch": 11.973575557390586, "grad_norm": 15.877927780151367, "learning_rate": 0.0003, "loss": 38.933609375, "step": 87000 }, { "epoch": 12.11120286264795, "grad_norm": 15.215489387512207, "learning_rate": 0.0003, "loss": 38.8452265625, "step": 88000 }, { "epoch": 12.11120286264795, "eval_accuracy": 0.7323534973774022, "eval_loss": 9.546875, "eval_runtime": 240.1188, "eval_samples_per_second": 6885.629, "eval_steps_per_second": 13.452, "step": 88000 }, { "epoch": 12.248830167905313, "grad_norm": 15.539190292358398, "learning_rate": 0.0003, "loss": 38.8104296875, "step": 89000 }, { "epoch": 12.386457473162675, "grad_norm": 15.577831268310547, "learning_rate": 0.0003, "loss": 38.80796875, "step": 90000 }, { "epoch": 12.386457473162675, "eval_accuracy": 0.7324531616408847, "eval_loss": 9.546875, "eval_runtime": 240.2465, "eval_samples_per_second": 6881.969, "eval_steps_per_second": 13.445, "step": 90000 }, { "epoch": 12.524084778420038, "grad_norm": 14.47063159942627, "learning_rate": 0.0003, "loss": 38.865859375, "step": 91000 }, { "epoch": 12.661712083677402, "grad_norm": 13.968493461608887, "learning_rate": 0.0003, "loss": 38.81719921875, "step": 92000 }, { "epoch": 12.661712083677402, "eval_accuracy": 0.7321305936040334, "eval_loss": 9.546875, "eval_runtime": 239.1976, "eval_samples_per_second": 6912.148, "eval_steps_per_second": 13.503, "step": 92000 }, { "epoch": 12.799339388934765, "grad_norm": 28.390636444091797, "learning_rate": 0.0003, "loss": 38.815578125, "step": 93000 }, { "epoch": 12.936966694192128, "grad_norm": 27.102386474609375, "learning_rate": 0.0003, "loss": 38.82604296875, "step": 94000 }, { "epoch": 12.936966694192128, "eval_accuracy": 0.732027704335829, "eval_loss": 9.546875, "eval_runtime": 240.3497, "eval_samples_per_second": 6879.014, "eval_steps_per_second": 13.439, "step": 94000 }, { "epoch": 13.07459399944949, "grad_norm": 14.193507194519043, "learning_rate": 0.0003, "loss": 38.72788671875, "step": 95000 }, { "epoch": 13.212221304706853, "grad_norm": 18.604595184326172, "learning_rate": 0.0003, "loss": 38.6876171875, "step": 96000 }, { "epoch": 13.212221304706853, "eval_accuracy": 0.7321750300843878, "eval_loss": 9.546875, "eval_runtime": 240.1101, "eval_samples_per_second": 6885.879, "eval_steps_per_second": 13.452, "step": 96000 }, { "epoch": 13.349848609964218, "grad_norm": 16.717756271362305, "learning_rate": 0.0003, "loss": 38.7415390625, "step": 97000 }, { "epoch": 13.48747591522158, "grad_norm": 13.74322509765625, "learning_rate": 0.0003, "loss": 38.704234375, "step": 98000 }, { "epoch": 13.48747591522158, "eval_accuracy": 0.7335116918906991, "eval_loss": 9.4921875, "eval_runtime": 240.4214, "eval_samples_per_second": 6876.962, "eval_steps_per_second": 13.435, "step": 98000 }, { "epoch": 13.625103220478943, "grad_norm": 17.836227416992188, "learning_rate": 0.0003, "loss": 38.6647890625, "step": 99000 }, { "epoch": 13.762730525736306, "grad_norm": 20.256298065185547, "learning_rate": 0.0003, "loss": 38.654390625, "step": 100000 }, { "epoch": 13.762730525736306, "eval_accuracy": 0.7328628073699861, "eval_loss": 9.5078125, "eval_runtime": 240.901, "eval_samples_per_second": 6863.272, "eval_steps_per_second": 13.408, "step": 100000 }, { "epoch": 13.762730525736306, "eval_accuracy": 0.7328718517886109, "eval_loss": 9.5078125, "eval_runtime": 257.3667, "eval_samples_per_second": 6424.176, "eval_steps_per_second": 12.55, "step": 100000 }, { "epoch": 13.90035783099367, "grad_norm": 17.175275802612305, "learning_rate": 0.0003, "loss": 38.7094453125, "step": 101000 }, { "epoch": 14.037985136251033, "grad_norm": 30.791107177734375, "learning_rate": 0.0003, "loss": 38.7431796875, "step": 102000 }, { "epoch": 14.037985136251033, "eval_accuracy": 0.732853775052298, "eval_loss": 9.53125, "eval_runtime": 244.7843, "eval_samples_per_second": 6754.392, "eval_steps_per_second": 13.195, "step": 102000 }, { "epoch": 14.175612441508395, "grad_norm": 15.07434368133545, "learning_rate": 0.0003, "loss": 38.5621875, "step": 103000 }, { "epoch": 14.313239746765758, "grad_norm": 16.333436965942383, "learning_rate": 0.0003, "loss": 38.6172734375, "step": 104000 }, { "epoch": 14.313239746765758, "eval_accuracy": 0.7328456338360237, "eval_loss": 9.515625, "eval_runtime": 243.7048, "eval_samples_per_second": 6784.311, "eval_steps_per_second": 13.254, "step": 104000 }, { "epoch": 14.45086705202312, "grad_norm": 14.872163772583008, "learning_rate": 0.0003, "loss": 38.61624609375, "step": 105000 }, { "epoch": 14.588494357280485, "grad_norm": 15.491616249084473, "learning_rate": 0.0003, "loss": 38.5978203125, "step": 106000 }, { "epoch": 14.588494357280485, "eval_accuracy": 0.7325266385860558, "eval_loss": 9.53125, "eval_runtime": 241.371, "eval_samples_per_second": 6849.906, "eval_steps_per_second": 13.382, "step": 106000 }, { "epoch": 14.726121662537848, "grad_norm": 14.945006370544434, "learning_rate": 0.0003, "loss": 38.621796875, "step": 107000 }, { "epoch": 14.86374896779521, "grad_norm": 14.714298248291016, "learning_rate": 0.0003, "loss": 38.5805546875, "step": 108000 }, { "epoch": 14.86374896779521, "eval_accuracy": 0.7336863429887471, "eval_loss": 9.484375, "eval_runtime": 243.3178, "eval_samples_per_second": 6795.1, "eval_steps_per_second": 13.275, "step": 108000 }, { "epoch": 15.001376273052573, "grad_norm": 17.513687133789062, "learning_rate": 0.0003, "loss": 38.5988359375, "step": 109000 }, { "epoch": 15.139003578309937, "grad_norm": 14.208888053894043, "learning_rate": 0.0003, "loss": 38.5494453125, "step": 110000 }, { "epoch": 15.139003578309937, "eval_accuracy": 0.7334265583450897, "eval_loss": 9.4921875, "eval_runtime": 245.3975, "eval_samples_per_second": 6737.512, "eval_steps_per_second": 13.162, "step": 110000 }, { "epoch": 15.2766308835673, "grad_norm": 20.13620376586914, "learning_rate": 0.0003, "loss": 38.51769140625, "step": 111000 }, { "epoch": 15.414258188824663, "grad_norm": 14.885974884033203, "learning_rate": 0.0003, "loss": 38.52906640625, "step": 112000 }, { "epoch": 15.414258188824663, "eval_accuracy": 0.7332003955432331, "eval_loss": 9.4921875, "eval_runtime": 246.8519, "eval_samples_per_second": 6697.818, "eval_steps_per_second": 13.085, "step": 112000 }, { "epoch": 15.551885494082025, "grad_norm": 14.931363105773926, "learning_rate": 0.0003, "loss": 38.534203125, "step": 113000 }, { "epoch": 15.689512799339388, "grad_norm": 15.144700050354004, "learning_rate": 0.0003, "loss": 38.5433125, "step": 114000 }, { "epoch": 15.689512799339388, "eval_accuracy": 0.7337025970829132, "eval_loss": 9.4765625, "eval_runtime": 244.85, "eval_samples_per_second": 6752.58, "eval_steps_per_second": 13.192, "step": 114000 }, { "epoch": 15.827140104596753, "grad_norm": 17.183073043823242, "learning_rate": 0.0003, "loss": 38.4901015625, "step": 115000 }, { "epoch": 15.964767409854115, "grad_norm": 14.985239028930664, "learning_rate": 0.0003, "loss": 38.51575390625, "step": 116000 }, { "epoch": 15.964767409854115, "eval_accuracy": 0.7338189696183159, "eval_loss": 9.484375, "eval_runtime": 245.1155, "eval_samples_per_second": 6745.266, "eval_steps_per_second": 13.177, "step": 116000 }, { "epoch": 16.10239471511148, "grad_norm": 19.971887588500977, "learning_rate": 0.0003, "loss": 38.4035234375, "step": 117000 }, { "epoch": 16.24002202036884, "grad_norm": 17.1956844329834, "learning_rate": 0.0003, "loss": 38.42918359375, "step": 118000 }, { "epoch": 16.24002202036884, "eval_accuracy": 0.733730730614503, "eval_loss": 9.46875, "eval_runtime": 243.3012, "eval_samples_per_second": 6795.566, "eval_steps_per_second": 13.276, "step": 118000 }, { "epoch": 16.377649325626205, "grad_norm": 15.118714332580566, "learning_rate": 0.0003, "loss": 38.507515625, "step": 119000 }, { "epoch": 16.515276630883566, "grad_norm": 14.03774642944336, "learning_rate": 0.0003, "loss": 38.526671875, "step": 120000 }, { "epoch": 16.515276630883566, "eval_accuracy": 0.733831136300071, "eval_loss": 9.484375, "eval_runtime": 241.9438, "eval_samples_per_second": 6833.691, "eval_steps_per_second": 13.35, "step": 120000 }, { "epoch": 16.515276630883566, "eval_accuracy": 0.7365382984533457, "eval_loss": 9.328125, "eval_runtime": 320.3822, "eval_samples_per_second": 1416.711, "eval_steps_per_second": 2.769, "step": 120000 }, { "epoch": 60.65171074069432, "grad_norm": 8.715871810913086, "learning_rate": 0.0003, "loss": 34.4526328125, "step": 121000 }, { "epoch": 61.15290136608598, "grad_norm": 16.51197052001953, "learning_rate": 0.0003, "loss": 33.80009375, "step": 122000 }, { "epoch": 61.15290136608598, "eval_accuracy": 0.7592972259433672, "eval_loss": 8.328125, "eval_runtime": 322.1474, "eval_samples_per_second": 1408.948, "eval_steps_per_second": 2.753, "step": 122000 }, { "epoch": 61.65421732046622, "grad_norm": 8.568217277526855, "learning_rate": 0.0003, "loss": 33.51157421875, "step": 123000 }, { "epoch": 62.155407945857874, "grad_norm": 13.904038429260254, "learning_rate": 0.0003, "loss": 33.3759140625, "step": 124000 }, { "epoch": 62.155407945857874, "eval_accuracy": 0.7606993464209245, "eval_loss": 8.2578125, "eval_runtime": 310.5126, "eval_samples_per_second": 1461.741, "eval_steps_per_second": 2.857, "step": 124000 }, { "epoch": 62.65672390023813, "grad_norm": 9.302454948425293, "learning_rate": 0.0003, "loss": 33.2303125, "step": 125000 }, { "epoch": 63.15791452562978, "grad_norm": 10.245097160339355, "learning_rate": 0.0003, "loss": 33.114984375, "step": 126000 }, { "epoch": 63.15791452562978, "eval_accuracy": 0.7620252803249203, "eval_loss": 8.1953125, "eval_runtime": 311.281, "eval_samples_per_second": 1458.133, "eval_steps_per_second": 2.85, "step": 126000 }, { "epoch": 63.659230480010024, "grad_norm": 9.459521293640137, "learning_rate": 0.0003, "loss": 33.0674765625, "step": 127000 }, { "epoch": 64.16042110540168, "grad_norm": 12.050172805786133, "learning_rate": 0.0003, "loss": 33.0123046875, "step": 128000 }, { "epoch": 64.16042110540168, "eval_accuracy": 0.7628614283635131, "eval_loss": 8.15625, "eval_runtime": 309.2479, "eval_samples_per_second": 1467.719, "eval_steps_per_second": 2.868, "step": 128000 }, { "epoch": 64.66173705978193, "grad_norm": 8.326544761657715, "learning_rate": 0.0003, "loss": 32.89726171875, "step": 129000 }, { "epoch": 65.16292768517359, "grad_norm": 9.267374038696289, "learning_rate": 0.0003, "loss": 32.78715625, "step": 130000 }, { "epoch": 65.16292768517359, "eval_accuracy": 0.7632605632607093, "eval_loss": 8.1484375, "eval_runtime": 313.0209, "eval_samples_per_second": 1450.028, "eval_steps_per_second": 2.834, "step": 130000 }, { "epoch": 65.66424363955383, "grad_norm": 9.583052635192871, "learning_rate": 0.0003, "loss": 32.747501953125, "step": 131000 }, { "epoch": 66.16543426494549, "grad_norm": 8.761311531066895, "learning_rate": 0.0003, "loss": 32.67369140625, "step": 132000 }, { "epoch": 66.16543426494549, "eval_accuracy": 0.7639422135833412, "eval_loss": 8.1015625, "eval_runtime": 311.6656, "eval_samples_per_second": 1456.333, "eval_steps_per_second": 2.846, "step": 132000 }, { "epoch": 66.66675021932573, "grad_norm": 8.83479118347168, "learning_rate": 0.0003, "loss": 32.617767578125, "step": 133000 }, { "epoch": 67.16794084471738, "grad_norm": 8.598926544189453, "learning_rate": 0.0003, "loss": 32.5695625, "step": 134000 }, { "epoch": 67.16794084471738, "eval_accuracy": 0.7644549296283725, "eval_loss": 8.078125, "eval_runtime": 308.4248, "eval_samples_per_second": 1471.636, "eval_steps_per_second": 2.876, "step": 134000 }, { "epoch": 67.66925679909762, "grad_norm": 10.846793174743652, "learning_rate": 0.0003, "loss": 32.53196484375, "step": 135000 }, { "epoch": 68.17044742448928, "grad_norm": 23.080833435058594, "learning_rate": 0.0003, "loss": 32.47344140625, "step": 136000 }, { "epoch": 68.17044742448928, "eval_accuracy": 0.7638109525627774, "eval_loss": 8.109375, "eval_runtime": 312.4841, "eval_samples_per_second": 1452.519, "eval_steps_per_second": 2.839, "step": 136000 }, { "epoch": 68.67176337886953, "grad_norm": 11.440296173095703, "learning_rate": 0.0003, "loss": 32.4546796875, "step": 137000 }, { "epoch": 69.17295400426119, "grad_norm": 9.561952590942383, "learning_rate": 0.0003, "loss": 32.3915703125, "step": 138000 }, { "epoch": 69.17295400426119, "eval_accuracy": 0.7654183586207545, "eval_loss": 8.0234375, "eval_runtime": 311.376, "eval_samples_per_second": 1457.688, "eval_steps_per_second": 2.849, "step": 138000 }, { "epoch": 69.67426995864143, "grad_norm": 10.652801513671875, "learning_rate": 0.0003, "loss": 32.3813203125, "step": 139000 }, { "epoch": 70.17546058403309, "grad_norm": 9.549755096435547, "learning_rate": 0.0003, "loss": 32.329857421875, "step": 140000 }, { "epoch": 70.17546058403309, "eval_accuracy": 0.765731013146163, "eval_loss": 8.015625, "eval_runtime": 311.5237, "eval_samples_per_second": 1456.997, "eval_steps_per_second": 2.847, "step": 140000 }, { "epoch": 70.17546058403309, "eval_accuracy": 0.7655576478890911, "eval_loss": 8.03125, "eval_runtime": 312.782, "eval_samples_per_second": 1451.135, "eval_steps_per_second": 2.836, "step": 140000 }, { "epoch": 70.67677653841334, "grad_norm": 8.273364067077637, "learning_rate": 0.0003, "loss": 32.32880859375, "step": 141000 }, { "epoch": 71.177967163805, "grad_norm": 11.310037612915039, "learning_rate": 0.0003, "loss": 32.2803671875, "step": 142000 }, { "epoch": 71.177967163805, "eval_accuracy": 0.7654140428452689, "eval_loss": 8.0234375, "eval_runtime": 302.7715, "eval_samples_per_second": 1499.114, "eval_steps_per_second": 2.93, "step": 142000 }, { "epoch": 71.67928311818524, "grad_norm": 9.46422004699707, "learning_rate": 0.0003, "loss": 32.241615234375, "step": 143000 }, { "epoch": 72.18047374357688, "grad_norm": 9.287914276123047, "learning_rate": 0.0003, "loss": 32.22880078125, "step": 144000 }, { "epoch": 72.18047374357688, "eval_accuracy": 0.7658155554395308, "eval_loss": 8.015625, "eval_runtime": 300.7976, "eval_samples_per_second": 1508.951, "eval_steps_per_second": 2.949, "step": 144000 }, { "epoch": 72.68178969795714, "grad_norm": 9.183584213256836, "learning_rate": 0.0003, "loss": 32.233244140625, "step": 145000 }, { "epoch": 73.18298032334879, "grad_norm": 9.008417129516602, "learning_rate": 0.0003, "loss": 32.181228515625, "step": 146000 }, { "epoch": 73.18298032334879, "eval_accuracy": 0.76619202647217, "eval_loss": 7.98828125, "eval_runtime": 302.0251, "eval_samples_per_second": 1502.819, "eval_steps_per_second": 2.937, "step": 146000 }, { "epoch": 73.68429627772903, "grad_norm": 8.19743537902832, "learning_rate": 0.0003, "loss": 32.162357421875, "step": 147000 }, { "epoch": 74.18548690312069, "grad_norm": 8.455910682678223, "learning_rate": 0.0003, "loss": 32.091048828125, "step": 148000 }, { "epoch": 74.18548690312069, "eval_accuracy": 0.7663843416476586, "eval_loss": 7.97265625, "eval_runtime": 301.7215, "eval_samples_per_second": 1504.331, "eval_steps_per_second": 2.94, "step": 148000 }, { "epoch": 74.68680285750094, "grad_norm": 8.09157943725586, "learning_rate": 0.0003, "loss": 32.071322265625, "step": 149000 }, { "epoch": 75.1879934828926, "grad_norm": 12.704072952270508, "learning_rate": 0.0003, "loss": 32.044611328125, "step": 150000 }, { "epoch": 75.1879934828926, "eval_accuracy": 0.7670952482486783, "eval_loss": 7.96484375, "eval_runtime": 301.7456, "eval_samples_per_second": 1504.211, "eval_steps_per_second": 2.94, "step": 150000 }, { "epoch": 75.1879934828926, "step": 150000, "total_flos": 3.23779983669461e+19, "train_loss": 2.1457560286458333, "train_runtime": 27498.2172, "train_samples_per_second": 11171.633, "train_steps_per_second": 5.455 } ], "logging_steps": 1000, "max_steps": 150000, "num_input_tokens_seen": 0, "num_train_epochs": 76, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.23779983669461e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }